Merge tag 'mac80211-next-for-davem-2017-10-11' of git://git.kernel.org/pub/scm/linux...
author	David S. Miller <davem@davemloft.net>
	Wed, 11 Oct 2017 17:15:01 +0000 (10:15 -0700)
committer	David S. Miller <davem@davemloft.net>
	Wed, 11 Oct 2017 17:15:01 +0000 (10:15 -0700)
Johannes Berg says:

====================
Work continues in various areas:
 * port authorized event for 4-way-HS offload (Avi)
 * make MFP optional for such devices (Emmanuel)
 * Kees's timer setup patch for mac80211 mesh
   (the part that isn't trivially scripted)
 * improve VLAN vs. TXQ handling (myself)
 * load regulatory database as firmware file (myself)
 * with various other small improvements and cleanups

I merged net-next once in the meantime to allow Kees's
timer setup patch to go in.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
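
(For reference: the "timer setup patch" mentioned above refers to the timer_setup()/from_timer() API conversion; the non-scripted part is changing the callback signature and recovering the containing structure. The sketch below only illustrates that pattern — the struct, field, and function names are made up for illustration and are not the actual mac80211 mesh code.)

#include <linux/timer.h>
#include <linux/workqueue.h>

struct mesh_state {
	struct timer_list housekeeping_timer;	/* illustrative field name */
	struct work_struct housekeeping_work;
};

static void mesh_housekeeping_work(struct work_struct *work)
{
	/* periodic housekeeping would go here */
}

/*
 * Old style: the callback took an unsigned long cookie passed at setup time,
 *   setup_timer(&ms->housekeeping_timer, fn, (unsigned long)ms);
 * New style: the callback receives the timer itself and recovers the
 * containing structure with from_timer(), a container_of() wrapper.
 */
static void housekeeping_timer_fn(struct timer_list *t)
{
	struct mesh_state *ms = from_timer(ms, t, housekeeping_timer);

	schedule_work(&ms->housekeeping_work);
}

static void mesh_state_init(struct mesh_state *ms)
{
	INIT_WORK(&ms->housekeeping_work, mesh_housekeeping_work);
	timer_setup(&ms->housekeeping_timer, housekeeping_timer_fn, 0);
}
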
368 files changed:
Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
Documentation/filesystems/overlayfs.txt
Documentation/i2c/busses/i2c-i801
Documentation/networking/bonding.txt
Documentation/networking/netvsc.txt
MAINTAINERS
Makefile
arch/Kconfig
arch/arc/Kconfig
arch/arc/Makefile
arch/arc/boot/dts/axs10x_mb.dtsi
arch/arc/boot/dts/hsdk.dts
arch/arc/configs/axs101_defconfig
arch/arc/configs/axs103_defconfig
arch/arc/configs/axs103_smp_defconfig
arch/arc/configs/haps_hs_smp_defconfig
arch/arc/configs/hsdk_defconfig
arch/arc/configs/vdk_hs38_defconfig
arch/arc/configs/vdk_hs38_smp_defconfig
arch/arc/include/asm/arcregs.h
arch/arc/kernel/setup.c
arch/arc/plat-axs10x/axs10x.c
arch/arc/plat-hsdk/Kconfig
arch/arc/plat-hsdk/platform.c
arch/arm64/include/asm/memory.h
arch/arm64/kernel/armv8_deprecated.c
arch/arm64/kernel/cpufeature.c
arch/arm64/kernel/fpsimd.c
arch/arm64/mm/fault.c
arch/parisc/kernel/process.c
arch/powerpc/kernel/dt_cpu_ftrs.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/mce_power.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/signal_64.c
arch/powerpc/kernel/watchdog.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/kvm/book3s_xive.h
arch/powerpc/mm/pgtable_32.c
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/sysdev/xive/common.c
arch/powerpc/sysdev/xive/spapr.c
arch/sparc/Kconfig
arch/x86/events/intel/core.c
arch/x86/include/asm/kvm_para.h
arch/x86/kernel/kvm.c
arch/x86/kvm/Kconfig
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
block/blk-mq-debugfs.c
block/blk-throttle.c
block/bsg-lib.c
drivers/acpi/arm64/iort.c
drivers/block/Kconfig
drivers/block/nbd.c
drivers/clk/clk-bulk.c
drivers/clk/rockchip/clk-rk3128.c
drivers/clk/samsung/clk-exynos4.c
drivers/gpu/drm/i915/intel_audio.c
drivers/gpu/drm/i915/intel_bios.c
drivers/gpu/drm/i915/intel_csr.c
drivers/gpu/drm/i915/intel_ddi.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dpio_phy.c
drivers/gpu/drm/i915/intel_modes.c
drivers/gpu/drm/i915/intel_runtime_pm.c
drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
drivers/hwmon/xgene-hwmon.c
drivers/i2c/busses/Kconfig
drivers/i2c/busses/i2c-i801.c
drivers/i2c/busses/i2c-sprd.c
drivers/i2c/busses/i2c-stm32f7.c
drivers/ide/ide-probe.c
drivers/ide/ide-scan-pci.c
drivers/ide/setup-pci.c
drivers/infiniband/core/iwpm_msg.c
drivers/infiniband/core/iwpm_util.c
drivers/infiniband/hw/i40iw/i40iw_ctrl.c
drivers/infiniband/hw/i40iw/i40iw_p.h
drivers/infiniband/hw/i40iw/i40iw_puda.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/qedr/qedr.h
drivers/infiniband/hw/qedr/qedr_cm.c
drivers/md/bcache/closure.c
drivers/misc/cxl/cxllib.c
drivers/mmc/core/block.c
drivers/mmc/core/mmc.c
drivers/mmc/core/queue.c
drivers/mmc/core/queue.h
drivers/mmc/host/cavium.c
drivers/mmc/host/meson-gx-mmc.c
drivers/mmc/host/pxamci.c
drivers/mmc/host/sdhci-xenon.c
drivers/mmc/host/sdhci-xenon.h
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
drivers/net/ethernet/broadcom/bnxt/Makefile
drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
drivers/net/ethernet/cavium/thunder/nicvf_main.c
drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h
drivers/net/ethernet/hisilicon/Kconfig
drivers/net/ethernet/hisilicon/hns3/hnae3.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_ethtool.c
drivers/net/ethernet/intel/e1000e/defines.h
drivers/net/ethernet/intel/e1000e/e1000.h
drivers/net/ethernet/intel/e1000e/mac.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/e1000e/param.c
drivers/net/ethernet/intel/e1000e/phy.c
drivers/net/ethernet/intel/i40e/i40e.h
drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40e/i40e_common.c
drivers/net/ethernet/intel/i40e/i40e_debugfs.c
drivers/net/ethernet/intel/i40e/i40e_ethtool.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_nvm.c
drivers/net/ethernet/intel/i40e/i40e_register.h
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/i40e/i40e_txrx.h
drivers/net/ethernet/intel/i40e/i40e_type.h
drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
drivers/net/ethernet/intel/i40evf/i40e_txrx.c
drivers/net/ethernet/intel/i40evf/i40e_txrx.h
drivers/net/ethernet/intel/i40evf/i40e_type.h
drivers/net/ethernet/intel/i40evf/i40evf.h
drivers/net/ethernet/intel/i40evf/i40evf_main.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe.h
drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c
drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c
drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
drivers/net/ethernet/mellanox/mlx4/en_main.c
drivers/net/ethernet/mellanox/mlx4/en_netdev.c
drivers/net/ethernet/mellanox/mlx4/en_resources.c
drivers/net/ethernet/mellanox/mlx4/en_rx.c
drivers/net/ethernet/mellanox/mlx4/en_tx.c
drivers/net/ethernet/mellanox/mlx4/fw.c
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
drivers/net/ethernet/mellanox/mlx4/qp.c
drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
drivers/net/ethernet/mellanox/mlxsw/spectrum.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
drivers/net/ethernet/netronome/nfp/Makefile
drivers/net/ethernet/netronome/nfp/bpf/jit.c
drivers/net/ethernet/netronome/nfp/bpf/main.c
drivers/net/ethernet/netronome/nfp/bpf/main.h
drivers/net/ethernet/netronome/nfp/flower/action.c
drivers/net/ethernet/netronome/nfp/flower/cmsg.h
drivers/net/ethernet/netronome/nfp/flower/match.c
drivers/net/ethernet/netronome/nfp/flower/offload.c
drivers/net/ethernet/netronome/nfp/nfp_app.h
drivers/net/ethernet/netronome/nfp/nfp_asm.c [new file with mode: 0644]
drivers/net/ethernet/netronome/nfp/nfp_asm.h
drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
drivers/net/ethernet/qlogic/qed/qed_dcbx.c
drivers/net/ethernet/qlogic/qed/qed_iwarp.c
drivers/net/ethernet/qlogic/qed/qed_iwarp.h
drivers/net/ethernet/qlogic/qed/qed_ll2.c
drivers/net/ethernet/qlogic/qed/qed_ll2.h
drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c
drivers/net/hyperv/hyperv_net.h
drivers/net/hyperv/netvsc_drv.c
drivers/net/phy/Kconfig
drivers/net/phy/Makefile
drivers/net/phy/uPD60620.c [new file with mode: 0644]
drivers/net/ppp/ppp_generic.c
drivers/net/usb/cdc_ether.c
drivers/nvme/host/core.c
drivers/nvme/host/pci.c
drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c
drivers/scsi/libiscsi.c
drivers/scsi/scsi_scan.c
drivers/scsi/scsi_transport_iscsi.c
drivers/scsi/sd.c
drivers/thunderbolt/nhi.c
drivers/thunderbolt/xdomain.c
drivers/vhost/net.c
fs/btrfs/ctree.h
fs/btrfs/extent_io.c
fs/ceph/mds_client.c
fs/ceph/snap.c
fs/namespace.c
fs/nfs/client.c
fs/nfs/filelayout/filelayout.c
fs/nfs/nfs4idmap.c
fs/nfs/nfs4proc.c
fs/nfs/nfs4xdr.c
fs/overlayfs/copy_up.c
fs/overlayfs/dir.c
fs/overlayfs/namei.c
fs/overlayfs/overlayfs.h
fs/overlayfs/ovl_entry.h
fs/overlayfs/readdir.c
fs/overlayfs/super.c
fs/overlayfs/util.c
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_reflink.c
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/if_bridge.h
include/linux/if_phonet.h
include/linux/mmc/host.h
include/linux/netfilter_bridge/ebtables.h
include/linux/nmi.h
include/linux/once.h
include/linux/perf_event.h
include/linux/qed/qed_ll2_if.h
include/linux/skbuff.h
include/linux/smpboot.h
include/net/dst.h
include/net/dst_metadata.h
include/net/ip6_fib.h
include/net/ip6_route.h
include/net/ipv6.h
include/net/phonet/phonet.h
include/net/sock.h
include/net/switchdev.h
include/net/tcp.h
include/scsi/scsi_device.h
include/scsi/scsi_devinfo.h
include/scsi/scsi_transport_iscsi.h
include/uapi/linux/bpf.h
include/uapi/linux/if_link.h
include/uapi/linux/if_tunnel.h
include/uapi/linux/netfilter/xt_bpf.h
include/uapi/linux/openvswitch.h
kernel/bpf/Makefile
kernel/bpf/arraymap.c
kernel/bpf/core.c
kernel/bpf/disasm.c [new file with mode: 0644]
kernel/bpf/disasm.h [new file with mode: 0644]
kernel/bpf/inode.c
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/cpu.c
kernel/events/core.c
kernel/smpboot.c
kernel/sysctl.c
kernel/trace/bpf_trace.c
kernel/watchdog.c
kernel/watchdog_hld.c
lib/once.c
net/batman-adv/bat_iv_ogm.c
net/batman-adv/bat_v.c
net/batman-adv/bat_v_elp.c
net/batman-adv/bat_v_ogm.c
net/batman-adv/distributed-arp-table.c
net/batman-adv/gateway_client.c
net/batman-adv/gateway_common.c
net/batman-adv/hard-interface.c
net/batman-adv/icmp_socket.c
net/batman-adv/main.c
net/batman-adv/main.h
net/batman-adv/multicast.c
net/batman-adv/originator.c
net/batman-adv/routing.c
net/batman-adv/send.c
net/batman-adv/soft-interface.c
net/batman-adv/sysfs.c
net/batman-adv/tp_meter.c
net/bridge/Makefile
net/bridge/br_arp_nd_proxy.c [new file with mode: 0644]
net/bridge/br_device.c
net/bridge/br_forward.c
net/bridge/br_if.c
net/bridge/br_input.c
net/bridge/br_multicast.c
net/bridge/br_netlink.c
net/bridge/br_private.h
net/bridge/br_sysfs_if.c
net/bridge/netfilter/ebtable_broute.c
net/bridge/netfilter/ebtable_filter.c
net/bridge/netfilter/ebtable_nat.c
net/bridge/netfilter/ebtables.c
net/core/dst.c
net/core/filter.c
net/core/rtnetlink.c
net/core/skbuff.c
net/ipv4/gre_offload.c
net/ipv4/ip_gre.c
net/ipv4/netfilter/ipt_SYNPROXY.c
net/ipv4/route.c
net/ipv4/tcp.c
net/ipv4/tcp_fastopen.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c
net/ipv4/udp.c
net/ipv4/udp_offload.c
net/ipv6/addrconf.c
net/ipv6/addrlabel.c
net/ipv6/icmp.c
net/ipv6/ip6_fib.c
net/ipv6/ip6_offload.c
net/ipv6/netfilter/ip6t_SYNPROXY.c
net/ipv6/ping.c
net/ipv6/route.c
net/mpls/af_mpls.c
net/netfilter/ipset/ip_set_core.c
net/netfilter/ipset/ip_set_hash_ip.c
net/netfilter/ipset/ip_set_hash_ipmark.c
net/netfilter/ipset/ip_set_hash_ipport.c
net/netfilter/ipset/ip_set_hash_ipportip.c
net/netfilter/ipset/ip_set_hash_ipportnet.c
net/netfilter/ipset/ip_set_hash_net.c
net/netfilter/ipset/ip_set_hash_netiface.c
net/netfilter/ipset/ip_set_hash_netnet.c
net/netfilter/ipset/ip_set_hash_netport.c
net/netfilter/ipset/ip_set_hash_netportnet.c
net/netfilter/ipvs/ip_vs_xmit.c
net/netfilter/nf_tables_api.c
net/netfilter/x_tables.c
net/netfilter/xt_bpf.c
net/netfilter/xt_socket.c
net/netlink/af_netlink.c
net/openvswitch/actions.c
net/openvswitch/conntrack.c
net/openvswitch/conntrack.h
net/openvswitch/flow_netlink.c
net/phonet/af_phonet.c
net/phonet/datagram.c
net/phonet/pep.c
net/sched/sch_netem.c
net/sunrpc/xprtsock.c
net/tipc/bcast.c
net/tipc/msg.c
net/wireless/nl80211.c
net/xfrm/xfrm_device.c
net/xfrm/xfrm_input.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c
samples/bpf/trace_event_kern.c
samples/bpf/trace_event_user.c
samples/bpf/tracex6_kern.c
samples/bpf/tracex6_user.c
samples/bpf/xdp_monitor_kern.c
samples/bpf/xdp_monitor_user.c
tools/bpf/bpftool/Documentation/bpftool-prog.rst
tools/bpf/bpftool/Makefile
tools/bpf/bpftool/main.h
tools/bpf/bpftool/prog.c
tools/include/uapi/linux/bpf.h
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bpf_helpers.h
tools/testing/selftests/bpf/test_verifier_log.c [new file with mode: 0644]
tools/testing/selftests/net/rtnetlink.sh
tools/testing/selftests/networking/timestamping/rxtimestamp.c

index b878a1e..ed1456f 100644 (file)
@@ -16,11 +16,13 @@ Required Properties:
 
 - clocks:
   Array of clocks required for SDHC.
-  Require at least input clock for Xenon IP core.
+  Require at least input clock for Xenon IP core. For Armada AP806 and
+  CP110, the AXI clock is also mandatory.
 
 - clock-names:
   Array of names corresponding to clocks property.
   The input clock for Xenon IP core should be named as "core".
+  The input clock for the AXI bus must be named as "axi".
 
 - reg:
   * For "marvell,armada-3700-sdhci", two register areas.
@@ -106,8 +108,8 @@ Example:
                compatible = "marvell,armada-ap806-sdhci";
                reg = <0xaa0000 0x1000>;
                interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>
-               clocks = <&emmc_clk>;
-               clock-names = "core";
+               clocks = <&emmc_clk>,<&axi_clk>;
+               clock-names = "core", "axi";
                bus-width = <4>;
                marvell,xenon-phy-slow-mode;
                marvell,xenon-tun-count = <11>;
@@ -126,8 +128,8 @@ Example:
                interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>
                vqmmc-supply = <&sd_vqmmc_regulator>;
                vmmc-supply = <&sd_vmmc_regulator>;
-               clocks = <&sdclk>;
-               clock-names = "core";
+               clocks = <&sdclk>, <&axi_clk>;
+               clock-names = "core", "axi";
                bus-width = <4>;
                marvell,xenon-tun-count = <9>;
        };
index 36f528a..8caa607 100644 (file)
@@ -210,8 +210,11 @@ path as another overlay mount and it may use a lower layer path that is
 beneath or above the path of another overlay lower layer path.
 
 Using an upper layer path and/or a workdir path that are already used by
-another overlay mount is not allowed and will fail with EBUSY.  Using
+another overlay mount is not allowed and may fail with EBUSY.  Using
 partially overlapping paths is not allowed but will not fail with EBUSY.
+If files are accessed from two overlayfs mounts which share or overlap the
+upper layer and/or workdir path, the behavior of the overlay is undefined,
+though it will not result in a crash or deadlock.
 
 Mounting an overlay using an upper layer path, where the upper layer path
 was previously used by another mounted overlay in combination with a
index 0500193..d477024 100644 (file)
@@ -36,6 +36,7 @@ Supported adapters:
   * Intel Gemini Lake (SOC)
   * Intel Cannon Lake-H (PCH)
   * Intel Cannon Lake-LP (PCH)
+  * Intel Cedar Fork (PCH)
    Datasheets: Publicly available at the Intel website
 
 On Intel Patsburg and later chipsets, both the normal host SMBus controller
index 57f52cd..9ba04c0 100644 (file)
@@ -2387,7 +2387,7 @@ broadcast: Like active-backup, there is not much advantage to this
        and packet type ID), so in a "gatewayed" configuration, all
        outgoing traffic will generally use the same device.  Incoming
        traffic may also end up on a single device, but that is
-       dependent upon the balancing policy of the peer's 8023.ad
+       dependent upon the balancing policy of the peer's 802.3ad
        implementation.  In a "local" configuration, traffic will be
        distributed across the devices in the bond.
 
index 93560fb..92f5b31 100644 (file)
@@ -19,12 +19,12 @@ Features
 
   Receive Side Scaling
   --------------------
-  Hyper-V supports receive side scaling. For TCP, packets are
-  distributed among available queues based on IP address and port
+  Hyper-V supports receive side scaling. For TCP & UDP, packets can
+  be distributed among available queues based on IP address and port
   number.
 
-  For UDP, we can switch UDP hash level between L3 and L4 by ethtool
-  command. UDP over IPv4 and v6 can be set differently. The default
+  For TCP & UDP, we can switch hash level between L3 and L4 by ethtool
+  command. TCP/UDP over IPv4 and v6 can be set differently. The default
   hash level is L4. We currently only allow switching TX hash level
   from within the guests.
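
(For reference: the hash-level switch described above is ethtool's receive flow hash configuration. A rough userspace sketch using the ETHTOOL_SRXFH ioctl is shown below; the interface name and the lack of capability checks are illustrative only and not part of this patch.)

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	/* Hash UDP/IPv4 on src/dst IP plus src/dst port (L4), roughly the
	 * equivalent of "ethtool -N eth0 rx-flow-hash udp4 sdfn". */
	struct ethtool_rxnfc nfc = {
		.cmd = ETHTOOL_SRXFH,
		.flow_type = UDP_V4_FLOW,
		.data = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3,
	};
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* example interface */
	ifr.ifr_data = (void *)&nfc;

	if (ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		perror("ETHTOOL_SRXFH");

	close(fd);
	return 0;
}
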
 
index e90cdec..3944f16 100644 (file)
@@ -5264,7 +5264,8 @@ S:        Maintained
 F:     drivers/iommu/exynos-iommu.c
 
 EZchip NPS platform support
-M:     Noam Camus <noamc@ezchip.com>
+M:     Elad Kanfi <eladkan@mellanox.com>
+M:     Vineet Gupta <vgupta@synopsys.com>
 S:     Supported
 F:     arch/arc/plat-eznps
 F:     arch/arc/boot/dts/eznps.dts
@@ -9366,7 +9367,7 @@ NETWORK BLOCK DEVICE (NBD)
 M:     Josef Bacik <jbacik@fb.com>
 S:     Maintained
 L:     linux-block@vger.kernel.org
-L:     nbd-general@lists.sourceforge.net
+L:     nbd@other.debian.org
 F:     Documentation/blockdev/nbd.txt
 F:     drivers/block/nbd.c
 F:     include/uapi/linux/nbd.h
index cf007a3..2835863 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 14
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc4
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*
index 1aafb4e..d789a89 100644 (file)
@@ -937,9 +937,6 @@ config STRICT_MODULE_RWX
          and non-text memory will be made non-executable. This provides
          protection against certain security exploits (e.g. writing to text)
 
-config ARCH_WANT_RELAX_ORDER
-       bool
-
 config ARCH_HAS_REFCOUNT
        bool
        help
index a598641..c84e67f 100644 (file)
@@ -24,7 +24,7 @@ config ARC
        select GENERIC_SMP_IDLE_THREAD
        select HAVE_ARCH_KGDB
        select HAVE_ARCH_TRACEHOOK
-       select HAVE_FUTEX_CMPXCHG
+       select HAVE_FUTEX_CMPXCHG if FUTEX
        select HAVE_IOREMAP_PROT
        select HAVE_KPROBES
        select HAVE_KRETPROBES
index 3a4b52b..d37f49d 100644 (file)
@@ -6,8 +6,6 @@
 # published by the Free Software Foundation.
 #
 
-UTS_MACHINE := arc
-
 ifeq ($(CROSS_COMPILE),)
 ifndef CONFIG_CPU_BIG_ENDIAN
 CROSS_COMPILE := arc-linux-
index 2367a67..e114000 100644 (file)
 
                        mmcclk: mmcclk {
                                compatible = "fixed-clock";
-                               clock-frequency = <50000000>;
+                               /*
+                                * DW sdio controller has external ciu clock divider
+                                * controlled via register in SDIO IP. It divides
+                                * sdio_ref_clk (which comes from CGU) by 16 by
+                                * default. So the default mmcclk clock (which comes
+                                * to sdk_in) is 25000000 Hz.
+                                */
+                               clock-frequency = <25000000>;
                                #clock-cells = <0>;
                        };
 
index 229d13a..8adde1b 100644 (file)
@@ -12,6 +12,7 @@
 /dts-v1/;
 
 #include <dt-bindings/net/ti-dp83867.h>
+#include <dt-bindings/reset/snps,hsdk-reset.h>
 
 / {
        model = "snps,hsdk";
                };
        };
 
-       core_clk: core-clk {
+       input_clk: input-clk {
                #clock-cells = <0>;
                compatible = "fixed-clock";
-               clock-frequency = <500000000>;
+               clock-frequency = <33333333>;
        };
 
        cpu_intc: cpu-interrupt-controller {
 
                ranges = <0x00000000 0xf0000000 0x10000000>;
 
+               cgu_rst: reset-controller@8a0 {
+                       compatible = "snps,hsdk-reset";
+                       #reset-cells = <1>;
+                       reg = <0x8A0 0x4>, <0xFF0 0x4>;
+               };
+
+               core_clk: core-clk@0 {
+                       compatible = "snps,hsdk-core-pll-clock";
+                       reg = <0x00 0x10>, <0x14B8 0x4>;
+                       #clock-cells = <0>;
+                       clocks = <&input_clk>;
+               };
+
                serial: serial@5000 {
                        compatible = "snps,dw-apb-uart";
                        reg = <0x5000 0x100>;
 
                mmcclk_ciu: mmcclk-ciu {
                        compatible = "fixed-clock";
-                       clock-frequency = <100000000>;
+                       /*
+                        * DW sdio controller has external ciu clock divider
+                        * controlled via register in SDIO IP. Due to its
+                        * unexpected default value (it should divide by 1
+                        * but it divides by 8) the SDIO IP uses the wrong
+                        * clock and works unreliably (see STAR 9001204800).
+                        * So add a temporary fix and change the clock frequency
+                        * from 100000000 to 12500000 Hz until we fix the dw sdio
+                        * driver itself.
+                        */
+                       clock-frequency = <12500000>;
                        #clock-cells = <0>;
                };
 
                        clocks = <&gmacclk>;
                        clock-names = "stmmaceth";
                        phy-handle = <&phy0>;
+                       resets = <&cgu_rst HSDK_ETH_RESET>;
+                       reset-names = "stmmaceth";
 
                        mdio {
                                #address-cells = <1>;
index 6980b96..ec7c849 100644 (file)
@@ -105,7 +105,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 2233f57..63d3cf6 100644 (file)
@@ -104,7 +104,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 30a3d4c..f613eca 100644 (file)
@@ -107,7 +107,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 821a2e5..3507be2 100644 (file)
@@ -84,5 +84,5 @@ CONFIG_TMPFS=y
 CONFIG_NFS_FS=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 # CONFIG_DEBUG_PREEMPT is not set
index 9a3fcf4..15f0f6b 100644 (file)
@@ -63,6 +63,7 @@ CONFIG_MMC_SDHCI=y
 CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_DW=y
 # CONFIG_IOMMU_SUPPORT is not set
+CONFIG_RESET_HSDK=y
 CONFIG_EXT3_FS=y
 CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
@@ -72,7 +73,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_WARN_DEPRECATED is not set
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index c0d6a01..4fcf4f2 100644 (file)
@@ -94,7 +94,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_SHIRQ=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index 5c09717..7b71464 100644 (file)
@@ -98,7 +98,7 @@ CONFIG_NLS_ISO8859_1=y
 # CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_SHIRQ=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
index ba8e802..b1c56d3 100644 (file)
@@ -98,6 +98,7 @@
 
 /* Auxiliary registers */
 #define AUX_IDENTITY           4
+#define AUX_EXEC_CTRL          8
 #define AUX_INTR_VEC_BASE      0x25
 #define AUX_VOL                        0x5e
 
@@ -135,12 +136,12 @@ struct bcr_identity {
 #endif
 };
 
-struct bcr_isa {
+struct bcr_isa_arcv2 {
 #ifdef CONFIG_CPU_BIG_ENDIAN
        unsigned int div_rem:4, pad2:4, ldd:1, unalign:1, atomic:1, be:1,
-                    pad1:11, atomic1:1, ver:8;
+                    pad1:12, ver:8;
 #else
-       unsigned int ver:8, atomic1:1, pad1:11, be:1, atomic:1, unalign:1,
+       unsigned int ver:8, pad1:12, be:1, atomic:1, unalign:1,
                     ldd:1, pad2:4, div_rem:4;
 #endif
 };
@@ -263,13 +264,13 @@ struct cpuinfo_arc {
        struct cpuinfo_arc_mmu mmu;
        struct cpuinfo_arc_bpu bpu;
        struct bcr_identity core;
-       struct bcr_isa isa;
+       struct bcr_isa_arcv2 isa;
        const char *details, *name;
        unsigned int vec_base;
        struct cpuinfo_arc_ccm iccm, dccm;
        struct {
                unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, swape:1, pad1:2,
-                            fpu_sp:1, fpu_dp:1, pad2:6,
+                            fpu_sp:1, fpu_dp:1, dual_iss_enb:1, dual_iss_exist:1, pad2:4,
                             debug:1, ap:1, smart:1, rtt:1, pad3:4,
                             timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4;
        } extn;
index 877cec8..fb83844 100644 (file)
@@ -51,6 +51,7 @@ static const struct id_to_str arc_cpu_rel[] = {
        { 0x51, "R2.0" },
        { 0x52, "R2.1" },
        { 0x53, "R3.0" },
+       { 0x54, "R4.0" },
 #endif
        { 0x00, NULL   }
 };
@@ -62,6 +63,7 @@ static const struct id_to_str arc_cpu_nm[] = {
 #else
        { 0x40, "ARC EM"  },
        { 0x50, "ARC HS38"  },
+       { 0x54, "ARC HS48"  },
 #endif
        { 0x00, "Unknown"   }
 };
@@ -119,11 +121,11 @@ static void read_arc_build_cfg_regs(void)
        struct bcr_generic bcr;
        struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()];
        const struct id_to_str *tbl;
+       struct bcr_isa_arcv2 isa;
 
        FIX_PTR(cpu);
 
        READ_BCR(AUX_IDENTITY, cpu->core);
-       READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa);
 
        for (tbl = &arc_cpu_rel[0]; tbl->id != 0; tbl++) {
                if (cpu->core.family == tbl->id) {
@@ -133,7 +135,7 @@ static void read_arc_build_cfg_regs(void)
        }
 
        for (tbl = &arc_cpu_nm[0]; tbl->id != 0; tbl++) {
-               if ((cpu->core.family & 0xF0) == tbl->id)
+               if ((cpu->core.family & 0xF4) == tbl->id)
                        break;
        }
        cpu->name = tbl->str;
@@ -192,6 +194,14 @@ static void read_arc_build_cfg_regs(void)
                cpu->bpu.full = bpu.ft;
                cpu->bpu.num_cache = 256 << bpu.bce;
                cpu->bpu.num_pred = 2048 << bpu.pte;
+
+               if (cpu->core.family >= 0x54) {
+                       unsigned int exec_ctrl;
+
+                       READ_BCR(AUX_EXEC_CTRL, exec_ctrl);
+                       cpu->extn.dual_iss_exist = 1;
+                       cpu->extn.dual_iss_enb = exec_ctrl & 1;
+               }
        }
 
        READ_BCR(ARC_REG_AP_BCR, bcr);
@@ -205,18 +215,25 @@ static void read_arc_build_cfg_regs(void)
 
        cpu->extn.debug = cpu->extn.ap | cpu->extn.smart | cpu->extn.rtt;
 
+       READ_BCR(ARC_REG_ISA_CFG_BCR, isa);
+
        /* some hacks for lack of feature BCR info in old ARC700 cores */
        if (is_isa_arcompact()) {
-               if (!cpu->isa.ver)      /* ISA BCR absent, use Kconfig info */
+               if (!isa.ver)   /* ISA BCR absent, use Kconfig info */
                        cpu->isa.atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC);
-               else
-                       cpu->isa.atomic = cpu->isa.atomic1;
+               else {
+                       /* ARC700_BUILD only has 2 bits of isa info */
+                       struct bcr_generic bcr = *(struct bcr_generic *)&isa;
+                       cpu->isa.atomic = bcr.info & 1;
+               }
 
                cpu->isa.be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
 
                 /* there's no direct way to distinguish 750 vs. 770 */
                if (unlikely(cpu->core.family < 0x34 || cpu->mmu.ver < 3))
                        cpu->name = "ARC750";
+       } else {
+               cpu->isa = isa;
        }
 }
 
@@ -232,10 +249,11 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len)
                       "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n",
                       core->family, core->cpu_id, core->chip_id);
 
-       n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s\n",
+       n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s%s%s\n",
                       cpu_id, cpu->name, cpu->details,
                       is_isa_arcompact() ? "ARCompact" : "ARCv2",
-                      IS_AVAIL1(cpu->isa.be, "[Big-Endian]"));
+                      IS_AVAIL1(cpu->isa.be, "[Big-Endian]"),
+                      IS_AVAIL3(cpu->extn.dual_iss_exist, cpu->extn.dual_iss_enb, " Dual-Issue"));
 
        n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s%s%s\nISA Extn\t: ",
                       IS_AVAIL1(cpu->extn.timer0, "Timer0 "),
index f1ac679..cf14ebc 100644 (file)
@@ -111,6 +111,13 @@ static void __init axs10x_early_init(void)
 
        axs10x_enable_gpio_intc_wire();
 
+       /*
+        * Reset ethernet IP core.
+        * TODO: get rid of this quirk once the axs10x reset driver (or a simple
+        * reset driver) is available upstream.
+        */
+       iowrite32((1 << 5), (void __iomem *) CREG_MB_SW_RESET);
+
        scnprintf(mb, 32, "MainBoard v%d", mb_rev);
        axs10x_print_board_ver(CREG_MB_VER, mb);
 }
index 5a6ed5a..bd08de4 100644 (file)
@@ -6,4 +6,5 @@
 #
 
 menuconfig ARC_SOC_HSDK
-       bool "ARC HS Development Kit SOC"
+       bool "ARC HS Development Kit SOC"
+       select CLK_HSDK
index a2e7fd1..744e62e 100644 (file)
@@ -38,6 +38,42 @@ static void __init hsdk_init_per_cpu(unsigned int cpu)
 #define CREG_PAE               (CREG_BASE + 0x180)
 #define CREG_PAE_UPDATE                (CREG_BASE + 0x194)
 
+#define CREG_CORE_IF_CLK_DIV   (CREG_BASE + 0x4B8)
+#define CREG_CORE_IF_CLK_DIV_2 0x1
+#define CGU_BASE               ARC_PERIPHERAL_BASE
+#define CGU_PLL_STATUS         (ARC_PERIPHERAL_BASE + 0x4)
+#define CGU_PLL_CTRL           (ARC_PERIPHERAL_BASE + 0x0)
+#define CGU_PLL_STATUS_LOCK    BIT(0)
+#define CGU_PLL_STATUS_ERR     BIT(1)
+#define CGU_PLL_CTRL_1GHZ      0x3A10
+#define HSDK_PLL_LOCK_TIMEOUT  500
+
+#define HSDK_PLL_LOCKED() \
+       !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK)
+
+#define HSDK_PLL_ERR() \
+       !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR)
+
+static void __init hsdk_set_cpu_freq_1ghz(void)
+{
+       u32 timeout = HSDK_PLL_LOCK_TIMEOUT;
+
+       /*
+        * Since we set a CPU clock that exceeds 500MHz, the divider for the
+        * interface clock must be programmed to div-by-2.
+        */
+       iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV);
+
+       /* Set cpu clock to 1GHz */
+       iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL);
+
+       while (!HSDK_PLL_LOCKED() && timeout--)
+               cpu_relax();
+
+       if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR())
+               pr_err("Failed to set up CPU frequency to 1GHz!\n");
+}
+
 static void __init hsdk_init_early(void)
 {
        /*
@@ -52,6 +88,12 @@ static void __init hsdk_init_early(void)
 
        /* Really apply settings made above */
        writel(1, (void __iomem *) CREG_PAE_UPDATE);
+
+       /*
+        * Set the CPU frequency to 1GHz.
+        * TODO: remove this once the smart hsdk pll driver is introduced.
+        */
+       hsdk_set_cpu_freq_1ghz();
 }
 
 static const char *hsdk_compat[] __initconst = {
index 3585a5e..f7c4d21 100644 (file)
 #define KERNEL_END        _end
 
 /*
- * The size of the KASAN shadow region. This should be 1/8th of the
- * size of the entire kernel virtual address space.
+ * KASAN requires 1/8th of the kernel virtual address space for the shadow
+ * region. KASAN can bloat the stack significantly, so double the (minimum)
+ * stack size when KASAN is in use.
  */
 #ifdef CONFIG_KASAN
 #define KASAN_SHADOW_SIZE      (UL(1) << (VA_BITS - 3))
+#define KASAN_THREAD_SHIFT     1
 #else
 #define KASAN_SHADOW_SIZE      (0)
+#define KASAN_THREAD_SHIFT     0
 #endif
 
-#define MIN_THREAD_SHIFT       14
+#define MIN_THREAD_SHIFT       (14 + KASAN_THREAD_SHIFT)
 
 /*
  * VMAP'd stacks are allocated at page granularity, so we must ensure that such
index f0e6d71..d06fbe4 100644 (file)
@@ -649,4 +649,4 @@ static int __init armv8_deprecated_init(void)
        return 0;
 }
 
-late_initcall(armv8_deprecated_init);
+core_initcall(armv8_deprecated_init);
index cd52d36..21e2c95 100644 (file)
@@ -1307,4 +1307,4 @@ static int __init enable_mrs_emulation(void)
        return 0;
 }
 
-late_initcall(enable_mrs_emulation);
+core_initcall(enable_mrs_emulation);
index f444f37..5d547de 100644 (file)
@@ -444,4 +444,4 @@ static int __init fpsimd_init(void)
 
        return 0;
 }
-late_initcall(fpsimd_init);
+core_initcall(fpsimd_init);
index 2069e9b..b64958b 100644 (file)
@@ -97,7 +97,7 @@ static void data_abort_decode(unsigned int esr)
                         (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
                         (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
        } else {
-               pr_alert("  ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK);
+               pr_alert("  ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK);
        }
 
        pr_alert("  CM = %lu, WnR = %lu\n",
index a45a67d..30f9239 100644 (file)
@@ -146,7 +146,7 @@ void machine_power_off(void)
 
        /* prevent soft lockup/stalled CPU messages for endless loop. */
        rcu_sysrq_start();
-       lockup_detector_suspend();
+       lockup_detector_soft_poweroff();
        for (;;);
 }
 
index 1df770e..7275fed 100644 (file)
@@ -102,10 +102,10 @@ static void cpufeatures_flush_tlb(void)
        case PVR_POWER8:
        case PVR_POWER8E:
        case PVR_POWER8NVL:
-               __flush_tlb_power8(POWER8_TLB_SETS);
+               __flush_tlb_power8(TLB_INVAL_SCOPE_GLOBAL);
                break;
        case PVR_POWER9:
-               __flush_tlb_power9(POWER9_TLB_SETS_HASH);
+               __flush_tlb_power9(TLB_INVAL_SCOPE_GLOBAL);
                break;
        default:
                pr_err("unknown CPU version for boot TLB flush\n");
index 48da0f5..b82586c 100644 (file)
@@ -734,7 +734,29 @@ EXC_REAL(program_check, 0x700, 0x100)
 EXC_VIRT(program_check, 0x4700, 0x100, 0x700)
 TRAMP_KVM(PACA_EXGEN, 0x700)
 EXC_COMMON_BEGIN(program_check_common)
-       EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
+       /*
+        * It's possible to receive a TM Bad Thing type program check with
+        * userspace register values (in particular r1), but with SRR1 reporting
+        * that we came from the kernel. Normally that would confuse the bad
+        * stack logic, and we would report a bad kernel stack pointer. Instead
+        * we switch to the emergency stack if we're taking a TM Bad Thing from
+        * the kernel.
+        */
+       li      r10,MSR_PR              /* Build a mask of MSR_PR ..    */
+       oris    r10,r10,0x200000@h      /* .. and SRR1_PROGTM           */
+       and     r10,r10,r12             /* Mask SRR1 with that.         */
+       srdi    r10,r10,8               /* Shift it so we can compare   */
+       cmpldi  r10,(0x200000 >> 8)     /* .. with an immediate.        */
+       bne 1f                          /* If != go to normal path.     */
+
+       /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack  */
+       andi.   r10,r12,MSR_PR;         /* Set CR0 correctly for label  */
+                                       /* 3 in EXCEPTION_PROLOG_COMMON */
+       mr      r10,r1                  /* Save r1                      */
+       ld      r1,PACAEMERGSP(r13)     /* Use emergency stack          */
+       subi    r1,r1,INT_FRAME_SIZE    /* alloc stack frame            */
+       b 3f                            /* Jump into the macro !!       */
+1:     EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
        bl      save_nvgprs
        RECONCILE_IRQ_STATE(r10, r11)
        addi    r3,r1,STACK_FRAME_OVERHEAD
index b76ca19..72f153c 100644 (file)
@@ -624,5 +624,18 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs)
 
 long __machine_check_early_realmode_p9(struct pt_regs *regs)
 {
+       /*
+        * On POWER9 DD2.1 and below, it's possible to get a machine check
+        * caused by a paste instruction where only DSISR bit 25 is set. This
+        * will result in the MCE handler seeing an unknown event and the kernel
+        * crashing. An MCE that occurs like this is spurious, so we don't need
+        * to do anything in terms of servicing it. If there is something that
+        * needs to be serviced, the CPU will raise the MCE again with the
+        * correct DSISR so that it can be serviced properly. So detect this
+        * case and mark it as handled.
+        */
+       if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000)
+               return 1;
+
        return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
 }
index 0ac741f..2e3bc16 100644 (file)
@@ -904,9 +904,6 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
 
-#ifdef CONFIG_PPC_64K_PAGES
-       init_mm.context.pte_frag = NULL;
-#endif
 #ifdef CONFIG_SPAPR_TCE_IOMMU
        mm_iommu_init(&init_mm);
 #endif
index c83c115..b2c0029 100644 (file)
@@ -452,9 +452,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
        if (MSR_TM_RESV(msr))
                return -EINVAL;
 
-       /* pull in MSR TM from user context */
+       /* pull in MSR TS bits from user context */
        regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
 
+       /*
+        * Ensure that TM is enabled in regs->msr before we leave the signal
+        * handler. It could be the case that (a) user disabled the TM bit
+        * through the manipulation of the MSR bits in uc_mcontext or (b) the
+        * TM bit was disabled because a sufficient number of context switches
+        * happened whilst in the signal handler and load_tm overflowed,
+        * disabling the TM bit. In either case we can end up with an illegal
+        * TM state leading to a TM Bad Thing when we return to userspace.
+        */
+       regs->msr |= MSR_TM;
+
        /* pull in MSR LE from user context */
        regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
 
index 2f6eadd..c702a89 100644 (file)
@@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu)
        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
                return 0;
 
-       if (watchdog_suspended)
-               return 0;
-
        if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
                return 0;
 
@@ -358,36 +355,39 @@ static void watchdog_calc_timeouts(void)
        wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
 }
 
-void watchdog_nmi_reconfigure(void)
+void watchdog_nmi_stop(void)
 {
        int cpu;
 
-       watchdog_calc_timeouts();
-
        for_each_cpu(cpu, &wd_cpus_enabled)
                stop_wd_on_cpu(cpu);
+}
 
+void watchdog_nmi_start(void)
+{
+       int cpu;
+
+       watchdog_calc_timeouts();
        for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
                start_wd_on_cpu(cpu);
 }
 
 /*
- * This runs after lockup_detector_init() which sets up watchdog_cpumask.
+ * Invoked from core watchdog init.
  */
-static int __init powerpc_watchdog_init(void)
+int __init watchdog_nmi_probe(void)
 {
        int err;
 
-       watchdog_calc_timeouts();
-
-       err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
-                               start_wd_on_cpu, stop_wd_on_cpu);
-       if (err < 0)
+       err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+                                       "powerpc/watchdog:online",
+                                       start_wd_on_cpu, stop_wd_on_cpu);
+       if (err < 0) {
                pr_warn("Watchdog could not be initialized");
-
+               return err;
+       }
        return 0;
 }
-arch_initcall(powerpc_watchdog_init);
 
 static void handle_backtrace_ipi(struct pt_regs *regs)
 {
index 1330462..bf45784 100644 (file)
@@ -622,7 +622,7 @@ int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
                return -EINVAL;
        state = &sb->irq_state[idx];
        arch_spin_lock(&sb->lock);
-       *server = state->guest_server;
+       *server = state->act_server;
        *priority = state->guest_priority;
        arch_spin_unlock(&sb->lock);
 
@@ -1331,7 +1331,7 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
        xive->saved_src_count++;
 
        /* Convert saved state into something compatible with xics */
-       val = state->guest_server;
+       val = state->act_server;
        prio = state->saved_scan_prio;
 
        if (prio == MASKED) {
@@ -1507,7 +1507,6 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
        /* First convert prio and mark interrupt as untargetted */
        act_prio = xive_prio_from_guest(guest_prio);
        state->act_priority = MASKED;
-       state->guest_server = server;
 
        /*
         * We need to drop the lock due to the mutex below. Hopefully
index 5938f76..6ba63f8 100644 (file)
@@ -35,7 +35,6 @@ struct kvmppc_xive_irq_state {
        struct xive_irq_data *pt_data;  /* XIVE Pass-through associated data */
 
        /* Targetting as set by guest */
-       u32 guest_server;               /* Current guest selected target */
        u8 guest_priority;              /* Guest set priority */
        u8 saved_priority;              /* Saved priority when masking */
 
index 65eda19..f6c7f54 100644 (file)
@@ -361,9 +361,9 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
                        break;
        }
        wmb();
+       local_irq_restore(flags);
        flush_tlb_kernel_range((unsigned long)page_address(start),
                               (unsigned long)page_address(page));
-       local_irq_restore(flags);
        return err;
 }
 
index 897aa14..bbb73aa 100644 (file)
@@ -272,7 +272,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static unsigned long pnv_memory_block_size(void)
 {
-       return 256UL * 1024 * 1024;
+       /*
+        * We map the kernel linear region with 1GB large pages on radix. For
+        * memory hot unplug to work our memory block size must be at least
+        * this size.
+        */
+       if (radix_enabled())
+               return 1UL * 1024 * 1024 * 1024;
+       else
+               return 256UL * 1024 * 1024;
 }
 #endif
 
index f387318..a3b8d7d 100644 (file)
@@ -1402,6 +1402,14 @@ void xive_teardown_cpu(void)
 
        if (xive_ops->teardown_cpu)
                xive_ops->teardown_cpu(cpu, xc);
+
+#ifdef CONFIG_SMP
+       /* Get rid of IPI */
+       xive_cleanup_cpu_ipi(cpu, xc);
+#endif
+
+       /* Disable and free the queues */
+       xive_cleanup_cpu_queues(cpu, xc);
 }
 
 void xive_kexec_teardown_cpu(int secondary)
index f24a70b..d9c4c93 100644 (file)
@@ -431,7 +431,11 @@ static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc)
 
 static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc)
 {
+       if (!xc->hw_ipi)
+               return;
+
        xive_irq_bitmap_free(xc->hw_ipi);
+       xc->hw_ipi = 0;
 }
 #endif /* CONFIG_SMP */
 
index 0be3828..4e83f95 100644 (file)
@@ -44,7 +44,6 @@ config SPARC
        select ARCH_HAS_SG_CHAIN
        select CPU_NO_EFFICIENT_FFS
        select LOCKDEP_SMALL if LOCKDEP
-       select ARCH_WANT_RELAX_ORDER
 
 config SPARC32
        def_bool !64BIT
index 829e89c..9fb9a1f 100644 (file)
@@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void)
                return 0;
        }
 
-       if (lockup_detector_suspend() != 0) {
-               pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
-               return 0;
-       }
+       cpus_read_lock();
+
+       hardlockup_detector_perf_stop();
 
        x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
@@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void)
        x86_pmu.commit_scheduling = NULL;
        x86_pmu.stop_scheduling = NULL;
 
-       lockup_detector_resume();
-
-       cpus_read_lock();
+       hardlockup_detector_perf_restart();
 
        for_each_online_cpu(c)
                free_excl_cntrs(c);
index bc62e7c..59ad3d1 100644 (file)
@@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 bool kvm_para_available(void);
 unsigned int kvm_arch_para_features(void);
 void __init kvm_guest_init(void);
-void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
@@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void)
 
 #else /* CONFIG_KVM_GUEST */
 #define kvm_guest_init() do {} while (0)
-#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wait(T, I) do {} while(0)
 #define kvm_async_pf_task_wake(T) do {} while(0)
 
 static inline bool kvm_para_available(void)
index e675704..8bb9594 100644 (file)
@@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
        return NULL;
 }
 
-void kvm_async_pf_task_wait(u32 token)
+/*
+ * @interrupt_kernel: Is this called from a routine which interrupts the kernel
+ *                   (other than user space)?
+ */
+void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
 {
        u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
        struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token)
 
        n.token = token;
        n.cpu = smp_processor_id();
-       n.halted = is_idle_task(current) || preempt_count() > 1 ||
-                  rcu_preempt_depth();
+       n.halted = is_idle_task(current) ||
+                  (IS_ENABLED(CONFIG_PREEMPT_COUNT)
+                   ? preempt_count() > 1 || rcu_preempt_depth()
+                   : interrupt_kernel);
        init_swait_queue_head(&n.wq);
        hlist_add_head(&n.link, &b->list);
        raw_spin_unlock(&b->lock);
@@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                /* page is swapped out by the host. */
                prev_state = exception_enter();
-               kvm_async_pf_task_wait((u32)read_cr2());
+               kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
                exception_exit(prev_state);
                break;
        case KVM_PV_REASON_PAGE_READY:
index 3ea6244..3c48bc8 100644 (file)
@@ -23,6 +23,7 @@ config KVM
        depends on HIGH_RES_TIMERS
        # for TASKSTATS/TASK_DELAY_ACCT:
        depends on NET && MULTIUSER
+       depends on X86_LOCAL_APIC
        select PREEMPT_NOTIFIERS
        select MMU_NOTIFIER
        select ANON_INODES
index a36254c..d90cdc7 100644 (file)
@@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
        #op " %al \n\t" \
        FOP_RET
 
-asm(".global kvm_fastop_exception \n"
-    "kvm_fastop_exception: xor %esi, %esi; ret");
+asm(".pushsection .fixup, \"ax\"\n"
+    ".global kvm_fastop_exception \n"
+    "kvm_fastop_exception: xor %esi, %esi; ret\n"
+    ".popsection");
 
 FOP_START(setcc)
 FOP_SETCC(seto)
index eca30c1..106d4a0 100644 (file)
@@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
        case KVM_PV_REASON_PAGE_NOT_PRESENT:
                vcpu->arch.apf.host_apf_reason = 0;
                local_irq_disable();
-               kvm_async_pf_task_wait(fault_address);
+               kvm_async_pf_task_wait(fault_address, 0);
                local_irq_enable();
                break;
        case KVM_PV_REASON_PAGE_READY:
index 980e730..de294d7 100644 (file)
@@ -815,10 +815,14 @@ int blk_mq_debugfs_register(struct request_queue *q)
                goto err;
 
        /*
-        * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir
+        * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
         * didn't exist yet (because we don't know what to name the directory
         * until the queue is registered to a gendisk).
         */
+       if (q->elevator && !q->sched_debugfs_dir)
+               blk_mq_debugfs_register_sched(q);
+
+       /* Similarly, blk_mq_init_hctx() couldn't do this previously. */
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx))
                        goto err;
index 0fea76a..17816a0 100644 (file)
@@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
                tg->disptime = jiffies - 1;
                throtl_select_dispatch(sq);
-               throtl_schedule_next_dispatch(sq, false);
+               throtl_schedule_next_dispatch(sq, true);
        }
        rcu_read_unlock();
        throtl_select_dispatch(&td->service_queue);
-       throtl_schedule_next_dispatch(&td->service_queue, false);
+       throtl_schedule_next_dispatch(&td->service_queue, true);
        queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
index dbddff8..15d25cc 100644 (file)
@@ -207,20 +207,34 @@ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
        struct bsg_job *job = blk_mq_rq_to_pdu(req);
        struct scsi_request *sreq = &job->sreq;
 
+       /* called right after the request is allocated for the request_queue */
+
+       sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+       if (!sreq->sense)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static void bsg_initialize_rq(struct request *req)
+{
+       struct bsg_job *job = blk_mq_rq_to_pdu(req);
+       struct scsi_request *sreq = &job->sreq;
+       void *sense = sreq->sense;
+
+       /* called right before the request is given to the request_queue user */
+
        memset(job, 0, sizeof(*job));
 
        scsi_req_init(sreq);
+
+       sreq->sense = sense;
        sreq->sense_len = SCSI_SENSE_BUFFERSIZE;
-       sreq->sense = kzalloc(sreq->sense_len, gfp);
-       if (!sreq->sense)
-               return -ENOMEM;
 
        job->req = req;
-       job->reply = sreq->sense;
+       job->reply = sense;
        job->reply_len = sreq->sense_len;
        job->dd_data = job + 1;
-
-       return 0;
 }
 
 static void bsg_exit_rq(struct request_queue *q, struct request *req)
@@ -251,6 +265,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
        q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
        q->init_rq_fn = bsg_init_rq;
        q->exit_rq_fn = bsg_exit_rq;
+       q->initialize_rq_fn = bsg_initialize_rq;
        q->request_fn = bsg_request_fn;
 
        ret = blk_init_allocated_queue(q);
index 9565d57..de56394 100644 (file)
@@ -1178,12 +1178,44 @@ dev_put:
        return ret;
 }
 
+static bool __init iort_enable_acs(struct acpi_iort_node *iort_node)
+{
+       if (iort_node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) {
+               struct acpi_iort_node *parent;
+               struct acpi_iort_id_mapping *map;
+               int i;
+
+               map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, iort_node,
+                                  iort_node->mapping_offset);
+
+               for (i = 0; i < iort_node->mapping_count; i++, map++) {
+                       if (!map->output_reference)
+                               continue;
+
+                       parent = ACPI_ADD_PTR(struct acpi_iort_node,
+                                       iort_table,  map->output_reference);
+                       /*
+                        * If we detect a RC->SMMU mapping, make sure
+                        * If we detect an RC->SMMU mapping, make sure
+                        */
+                       if ((parent->type == ACPI_IORT_NODE_SMMU) ||
+                               (parent->type == ACPI_IORT_NODE_SMMU_V3)) {
+                               pci_request_acs();
+                               return true;
+                       }
+               }
+       }
+
+       return false;
+}
+
 static void __init iort_init_platform_devices(void)
 {
        struct acpi_iort_node *iort_node, *iort_end;
        struct acpi_table_iort *iort;
        struct fwnode_handle *fwnode;
        int i, ret;
+       bool acs_enabled = false;
 
        /*
         * iort_table and iort both point to the start of IORT table, but
@@ -1203,6 +1235,9 @@ static void __init iort_init_platform_devices(void)
                        return;
                }
 
+               if (!acs_enabled)
+                       acs_enabled = iort_enable_acs(iort_node);
+
                if ((iort_node->type == ACPI_IORT_NODE_SMMU) ||
                        (iort_node->type == ACPI_IORT_NODE_SMMU_V3)) {
 
index 4a438b8..2dfe99b 100644 (file)
@@ -17,7 +17,7 @@ if BLK_DEV
 
 config BLK_DEV_NULL_BLK
        tristate "Null test block driver"
-       depends on CONFIGFS_FS
+       select CONFIGFS_FS
 
 config BLK_DEV_FD
        tristate "Normal floppy disk support"
index 3684e21..883dfeb 100644 (file)
@@ -820,9 +820,13 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
         * appropriate.
         */
        ret = nbd_handle_cmd(cmd, hctx->queue_num);
+       if (ret < 0)
+               ret = BLK_STS_IOERR;
+       else if (!ret)
+               ret = BLK_STS_OK;
        complete(&cmd->send_complete);
 
-       return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK;
+       return ret;
 }
 
 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
index c834f5a..4c10456 100644 (file)
@@ -105,6 +105,7 @@ err:
 
        return  ret;
 }
+EXPORT_SYMBOL_GPL(clk_bulk_prepare);
 
 #endif /* CONFIG_HAVE_CLK_PREPARE */
 
index 62d7854..5970a50 100644 (file)
@@ -315,13 +315,13 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = {
                        RK2928_CLKGATE_CON(10), 8, GFLAGS),
 
        GATE(SCLK_PVTM_CORE, "clk_pvtm_core", "xin24m", 0,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(10), 0, GFLAGS),
        GATE(SCLK_PVTM_GPU, "clk_pvtm_gpu", "xin24m", 0,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(10), 1, GFLAGS),
        GATE(SCLK_PVTM_FUNC, "clk_pvtm_func", "xin24m", 0,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(10), 2, GFLAGS),
        GATE(SCLK_MIPI_24M, "clk_mipi_24m", "xin24m", CLK_IGNORE_UNUSED,
-                       RK2928_CLKGATE_CON(10), 8, GFLAGS),
+                       RK2928_CLKGATE_CON(2), 15, GFLAGS),
 
        COMPOSITE(SCLK_SDMMC, "sclk_sdmmc0", mux_mmc_src_p, 0,
                        RK2928_CLKSEL_CON(11), 6, 2, MFLAGS, 0, 6, DFLAGS,
@@ -541,7 +541,7 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = {
        GATE(0, "pclk_grf", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 4, GFLAGS),
        GATE(0, "pclk_mipiphy", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 0, GFLAGS),
 
-       GATE(0, "pclk_pmu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 2, GFLAGS),
+       GATE(0, "pclk_pmu", "pclk_pmu_pre", 0, RK2928_CLKGATE_CON(9), 2, GFLAGS),
        GATE(0, "pclk_pmu_niu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 3, GFLAGS),
 
        /* PD_MMC */
@@ -577,6 +577,8 @@ static const char *const rk3128_critical_clocks[] __initconst = {
        "aclk_peri",
        "hclk_peri",
        "pclk_peri",
+       "pclk_pmu",
+       "sclk_timer5",
 };
 
 static struct rockchip_clk_provider *__init rk3128_common_clk_init(struct device_node *np)
index e40b775..d8d3cb6 100644 (file)
@@ -294,6 +294,18 @@ static const struct samsung_clk_reg_dump src_mask_suspend_e4210[] = {
 #define PLL_ENABLED    (1 << 31)
 #define PLL_LOCKED     (1 << 29)
 
+static void exynos4_clk_enable_pll(u32 reg)
+{
+       u32 pll_con = readl(reg_base + reg);
+       pll_con |= PLL_ENABLED;
+       writel(pll_con, reg_base + reg);
+
+       while (!(pll_con & PLL_LOCKED)) {
+               cpu_relax();
+               pll_con = readl(reg_base + reg);
+       }
+}
+
 static void exynos4_clk_wait_for_pll(u32 reg)
 {
        u32 pll_con;
@@ -315,6 +327,9 @@ static int exynos4_clk_suspend(void)
        samsung_clk_save(reg_base, exynos4_save_pll,
                                ARRAY_SIZE(exynos4_clk_pll_regs));
 
+       exynos4_clk_enable_pll(EPLL_CON0);
+       exynos4_clk_enable_pll(VPLL_CON0);
+
        if (exynos4_soc == EXYNOS4210) {
                samsung_clk_save(reg_base, exynos4_save_soc,
                                        ARRAY_SIZE(exynos4210_clk_save));
index d805b6e..27743be 100644 (file)
@@ -606,11 +606,6 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder,
                         connector->encoder->base.id,
                         connector->encoder->name);
 
-       /* ELD Conn_Type */
-       connector->eld[5] &= ~(3 << 2);
-       if (intel_crtc_has_dp_encoder(crtc_state))
-               connector->eld[5] |= (1 << 2);
-
        connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2;
 
        if (dev_priv->display.audio_codec_enable)
index 183e87e..00c6aee 100644 (file)
@@ -1163,6 +1163,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port,
        is_hdmi = is_dvi && (child->common.device_type & DEVICE_TYPE_NOT_HDMI_OUTPUT) == 0;
        is_edp = is_dp && (child->common.device_type & DEVICE_TYPE_INTERNAL_CONNECTOR);
 
+       if (port == PORT_A && is_dvi) {
+               DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n",
+                             is_hdmi ? "/HDMI" : "");
+               is_dvi = false;
+               is_hdmi = false;
+       }
+
        info->supports_dvi = is_dvi;
        info->supports_hdmi = is_hdmi;
        info->supports_dp = is_dp;
index 965988f..92c1f8e 100644 (file)
@@ -216,7 +216,7 @@ static void gen9_set_dc_state_debugmask(struct drm_i915_private *dev_priv)
 
        mask = DC_STATE_DEBUG_MASK_MEMORY_UP;
 
-       if (IS_BROXTON(dev_priv))
+       if (IS_GEN9_LP(dev_priv))
                mask |= DC_STATE_DEBUG_MASK_CORES;
 
        /* The below bit doesn't need to be cleared ever afterwards */
index 4b4fd1f..476681d 100644 (file)
@@ -1655,7 +1655,8 @@ bool intel_ddi_get_hw_state(struct intel_encoder *encoder,
 out:
        if (ret && IS_GEN9_LP(dev_priv)) {
                tmp = I915_READ(BXT_PHY_CTL(port));
-               if ((tmp & (BXT_PHY_LANE_POWERDOWN_ACK |
+               if ((tmp & (BXT_PHY_CMNLANE_POWERDOWN_ACK |
+                           BXT_PHY_LANE_POWERDOWN_ACK |
                            BXT_PHY_LANE_ENABLED)) != BXT_PHY_LANE_ENABLED)
                        DRM_ERROR("Port %c enabled but PHY powered down? "
                                  "(PHY_CTL %08x)\n", port_name(port), tmp);
index 00cd17c..64f7b51 100644 (file)
@@ -12359,7 +12359,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
        struct drm_crtc_state *old_crtc_state, *new_crtc_state;
        struct drm_crtc *crtc;
        struct intel_crtc_state *intel_cstate;
-       bool hw_check = intel_state->modeset;
        u64 put_domains[I915_MAX_PIPES] = {};
        unsigned crtc_vblank_mask = 0;
        int i;
@@ -12376,7 +12375,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state)
 
                if (needs_modeset(new_crtc_state) ||
                    to_intel_crtc_state(new_crtc_state)->update_pipe) {
-                       hw_check = true;
 
                        put_domains[to_intel_crtc(crtc)->pipe] =
                                modeset_get_crtc_power_domains(crtc,
index 09b6709..de38d01 100644 (file)
@@ -208,12 +208,6 @@ static const struct bxt_ddi_phy_info glk_ddi_phy_info[] = {
        },
 };
 
-static u32 bxt_phy_port_mask(const struct bxt_ddi_phy_info *phy_info)
-{
-       return (phy_info->dual_channel * BIT(phy_info->channel[DPIO_CH1].port)) |
-               BIT(phy_info->channel[DPIO_CH0].port);
-}
-
 static const struct bxt_ddi_phy_info *
 bxt_get_phy_list(struct drm_i915_private *dev_priv, int *count)
 {
@@ -313,7 +307,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv,
                            enum dpio_phy phy)
 {
        const struct bxt_ddi_phy_info *phy_info;
-       enum port port;
 
        phy_info = bxt_get_phy_info(dev_priv, phy);
 
@@ -335,19 +328,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv,
                return false;
        }
 
-       for_each_port_masked(port, bxt_phy_port_mask(phy_info)) {
-               u32 tmp = I915_READ(BXT_PHY_CTL(port));
-
-               if (tmp & BXT_PHY_CMNLANE_POWERDOWN_ACK) {
-                       DRM_DEBUG_DRIVER("DDI PHY %d powered, but common lane "
-                                        "for port %c powered down "
-                                        "(PHY_CTL %08x)\n",
-                                        phy, port_name(port), tmp);
-
-                       return false;
-               }
-       }
-
        return true;
 }
 
index 951e834..28a778b 100644 (file)
 #include "intel_drv.h"
 #include "i915_drv.h"
 
+static void intel_connector_update_eld_conn_type(struct drm_connector *connector)
+{
+       u8 conn_type;
+
+       if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort ||
+           connector->connector_type == DRM_MODE_CONNECTOR_eDP) {
+               conn_type = DRM_ELD_CONN_TYPE_DP;
+       } else {
+               conn_type = DRM_ELD_CONN_TYPE_HDMI;
+       }
+
+       connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] &= ~DRM_ELD_CONN_TYPE_MASK;
+       connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= conn_type;
+}
+
 /**
  * intel_connector_update_modes - update connector from edid
  * @connector: DRM connector device to use
@@ -44,6 +59,8 @@ int intel_connector_update_modes(struct drm_connector *connector,
        ret = drm_add_edid_modes(connector, edid);
        drm_edid_to_eld(connector, edid);
 
+       intel_connector_update_eld_conn_type(connector);
+
        return ret;
 }
 
index b66d8e1..b3a087c 100644 (file)
@@ -2782,6 +2782,9 @@ static void cnl_display_core_init(struct drm_i915_private *dev_priv, bool resume
 
        /* 6. Enable DBUF */
        gen9_dbuf_enable(dev_priv);
+
+       if (resume && dev_priv->csr.dmc_payload)
+               intel_csr_load_program(dev_priv);
 }
 
 #undef CNL_PROCMON_IDX
index 9ea6cd5..3cf1a69 100644 (file)
@@ -302,26 +302,29 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master,
        hdmi->mod_clk = devm_clk_get(dev, "mod");
        if (IS_ERR(hdmi->mod_clk)) {
                dev_err(dev, "Couldn't get the HDMI mod clock\n");
-               return PTR_ERR(hdmi->mod_clk);
+               ret = PTR_ERR(hdmi->mod_clk);
+               goto err_disable_bus_clk;
        }
        clk_prepare_enable(hdmi->mod_clk);
 
        hdmi->pll0_clk = devm_clk_get(dev, "pll-0");
        if (IS_ERR(hdmi->pll0_clk)) {
                dev_err(dev, "Couldn't get the HDMI PLL 0 clock\n");
-               return PTR_ERR(hdmi->pll0_clk);
+               ret = PTR_ERR(hdmi->pll0_clk);
+               goto err_disable_mod_clk;
        }
 
        hdmi->pll1_clk = devm_clk_get(dev, "pll-1");
        if (IS_ERR(hdmi->pll1_clk)) {
                dev_err(dev, "Couldn't get the HDMI PLL 1 clock\n");
-               return PTR_ERR(hdmi->pll1_clk);
+               ret = PTR_ERR(hdmi->pll1_clk);
+               goto err_disable_mod_clk;
        }
 
        ret = sun4i_tmds_create(hdmi);
        if (ret) {
                dev_err(dev, "Couldn't create the TMDS clock\n");
-               return ret;
+               goto err_disable_mod_clk;
        }
 
        writel(SUN4I_HDMI_CTRL_ENABLE, hdmi->base + SUN4I_HDMI_CTRL_REG);
@@ -362,7 +365,7 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master,
        ret = sun4i_hdmi_i2c_create(dev, hdmi);
        if (ret) {
                dev_err(dev, "Couldn't create the HDMI I2C adapter\n");
-               return ret;
+               goto err_disable_mod_clk;
        }
 
        drm_encoder_helper_add(&hdmi->encoder,
@@ -422,6 +425,10 @@ err_cleanup_connector:
        drm_encoder_cleanup(&hdmi->encoder);
 err_del_i2c_adapter:
        i2c_del_adapter(hdmi->i2c);
+err_disable_mod_clk:
+       clk_disable_unprepare(hdmi->mod_clk);
+err_disable_bus_clk:
+       clk_disable_unprepare(hdmi->bus_clk);
        return ret;
 }
 
@@ -434,6 +441,8 @@ static void sun4i_hdmi_unbind(struct device *dev, struct device *master,
        drm_connector_cleanup(&hdmi->connector);
        drm_encoder_cleanup(&hdmi->encoder);
        i2c_del_adapter(hdmi->i2c);
+       clk_disable_unprepare(hdmi->mod_clk);
+       clk_disable_unprepare(hdmi->bus_clk);
 }
 
 static const struct component_ops sun4i_hdmi_ops = {
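
The sun4i HDMI changes turn early returns into gotos so every clock that was already enabled is disabled again on failure, and unbind now drops the mod and bus clocks as well. A compact sketch of that acquire-in-order, unwind-in-reverse shape; the stub functions below are invented stand-ins, not the clock framework API:

#include <stdio.h>

static int get_clk(const char *name, int fail)
{
        printf("get %s\n", name);
        return fail ? -1 : 0;
}

static void put_clk(const char *name)
{
        printf("disable %s\n", name);
}

/* Acquire resources in order; on failure, release only what was taken. */
static int bind(int fail_mod, int fail_pll)
{
        int ret;

        ret = get_clk("bus", 0);
        if (ret)
                return ret;

        ret = get_clk("mod", fail_mod);
        if (ret)
                goto err_disable_bus;

        ret = get_clk("pll-0", fail_pll);
        if (ret)
                goto err_disable_mod;

        return 0;

err_disable_mod:
        put_clk("mod");
err_disable_bus:
        put_clk("bus");
        return ret;
}

int main(void)
{
        printf("ret=%d\n", bind(1, 0));
        return 0;
}
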
index 9c0dbb8..e1be610 100644 (file)
@@ -630,7 +630,7 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                         sizeof(struct slimpro_resp_msg) * ASYNC_MSG_FIFO_SIZE,
                         GFP_KERNEL);
        if (rc)
-               goto out_mbox_free;
+               return -ENOMEM;
 
        INIT_WORK(&ctx->workq, xgene_hwmon_evt_work);
 
@@ -646,7 +646,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (IS_ERR(ctx->mbox_chan)) {
                        dev_err(&pdev->dev,
                                "SLIMpro mailbox channel request failed\n");
-                       return -ENODEV;
+                       rc = -ENODEV;
+                       goto out_mbox_free;
                }
        } else {
                struct acpi_pcct_hw_reduced *cppc_ss;
@@ -654,7 +655,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (device_property_read_u32(&pdev->dev, "pcc-channel",
                                             &ctx->mbox_idx)) {
                        dev_err(&pdev->dev, "no pcc-channel property\n");
-                       return -ENODEV;
+                       rc = -ENODEV;
+                       goto out_mbox_free;
                }
 
                cl->rx_callback = xgene_hwmon_pcc_rx_cb;
@@ -662,7 +664,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (IS_ERR(ctx->mbox_chan)) {
                        dev_err(&pdev->dev,
                                "PPC channel request failed\n");
-                       return -ENODEV;
+                       rc = -ENODEV;
+                       goto out_mbox_free;
                }
 
                /*
@@ -675,13 +678,13 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                if (!cppc_ss) {
                        dev_err(&pdev->dev, "PPC subspace not found\n");
                        rc = -ENODEV;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                if (!ctx->mbox_chan->mbox->txdone_irq) {
                        dev_err(&pdev->dev, "PCC IRQ not supported\n");
                        rc = -ENODEV;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                /*
@@ -696,14 +699,14 @@ static int xgene_hwmon_probe(struct platform_device *pdev)
                } else {
                        dev_err(&pdev->dev, "Failed to get PCC comm region\n");
                        rc = -ENODEV;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                if (!ctx->pcc_comm_addr) {
                        dev_err(&pdev->dev,
                                "Failed to ioremap PCC comm region\n");
                        rc = -ENOMEM;
-                       goto out_mbox_free;
+                       goto out;
                }
 
                /*
index c06dce2..45a3f3c 100644 (file)
@@ -131,6 +131,7 @@ config I2C_I801
            Gemini Lake (SOC)
            Cannon Lake-H (PCH)
            Cannon Lake-LP (PCH)
+           Cedar Fork (PCH)
 
          This driver can also be built as a module.  If so, the module
          will be called i2c-i801.
index e114e4e..9e12a53 100644 (file)
@@ -68,6 +68,7 @@
  * Gemini Lake (SOC)           0x31d4  32      hard    yes     yes     yes
  * Cannon Lake-H (PCH)         0xa323  32      hard    yes     yes     yes
  * Cannon Lake-LP (PCH)                0x9da3  32      hard    yes     yes     yes
+ * Cedar Fork (PCH)            0x18df  32      hard    yes     yes     yes
  *
  * Features supported by this driver:
  * Software PEC                                no
 
 /* Older devices have their ID defined in <linux/pci_ids.h> */
 #define PCI_DEVICE_ID_INTEL_BAYTRAIL_SMBUS             0x0f12
+#define PCI_DEVICE_ID_INTEL_CDF_SMBUS                  0x18df
 #define PCI_DEVICE_ID_INTEL_DNV_SMBUS                  0x19df
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS          0x1c22
 #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS             0x1d22
@@ -1025,6 +1027,7 @@ static const struct pci_device_id i801_ids[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BRASWELL_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS) },
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CDF_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_DNV_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROXTON_SMBUS) },
        { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS) },
@@ -1513,6 +1516,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id)
        case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS:
        case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS:
        case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS:
+       case PCI_DEVICE_ID_INTEL_CDF_SMBUS:
        case PCI_DEVICE_ID_INTEL_DNV_SMBUS:
        case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS:
                priv->features |= FEATURE_I2C_BLOCK_READ;
index 22e08ae..25fcc3c 100644 (file)
@@ -627,6 +627,7 @@ static const struct dev_pm_ops sprd_i2c_pm_ops = {
 
 static const struct of_device_id sprd_i2c_of_match[] = {
        { .compatible = "sprd,sc9860-i2c", },
+       {},
 };
 
 static struct platform_driver sprd_i2c_driver = {
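
The sprd hunk adds the empty terminating entry that of_match_device()-style walkers rely on to find the end of the table. A tiny sentinel-terminated lookup in the same spirit (hypothetical table in plain C, not the OF core itself):

#include <stdio.h>
#include <string.h>

struct match { const char *compatible; };

static const struct match matches[] = {
        { .compatible = "sprd,sc9860-i2c" },
        { },    /* sentinel: the walker stops at the all-zero entry */
};

static const struct match *find(const char *compat)
{
        for (const struct match *m = matches; m->compatible; m++)
                if (!strcmp(m->compatible, compat))
                        return m;
        return NULL;
}

int main(void)
{
        printf("%s\n", find("sprd,sc9860-i2c") ? "found" : "missing");
        return 0;
}
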
index 47c67b0..d4a6e9c 100644 (file)
@@ -215,7 +215,7 @@ struct stm32f7_i2c_dev {
        unsigned int msg_num;
        unsigned int msg_id;
        struct stm32f7_i2c_msg f7_msg;
-       struct stm32f7_i2c_setup *setup;
+       struct stm32f7_i2c_setup setup;
        struct stm32f7_i2c_timings timing;
 };
 
@@ -265,7 +265,7 @@ static struct stm32f7_i2c_spec i2c_specs[] = {
        },
 };
 
-struct stm32f7_i2c_setup stm32f7_setup = {
+static const struct stm32f7_i2c_setup stm32f7_setup = {
        .rise_time = STM32F7_I2C_RISE_TIME_DEFAULT,
        .fall_time = STM32F7_I2C_FALL_TIME_DEFAULT,
        .dnf = STM32F7_I2C_DNF_DEFAULT,
@@ -537,7 +537,7 @@ static void stm32f7_i2c_hw_config(struct stm32f7_i2c_dev *i2c_dev)
        writel_relaxed(timing, i2c_dev->base + STM32F7_I2C_TIMINGR);
 
        /* Enable I2C */
-       if (i2c_dev->setup->analog_filter)
+       if (i2c_dev->setup.analog_filter)
                stm32f7_i2c_clr_bits(i2c_dev->base + STM32F7_I2C_CR1,
                                     STM32F7_I2C_CR1_ANFOFF);
        else
@@ -887,22 +887,19 @@ static int stm32f7_i2c_probe(struct platform_device *pdev)
        }
 
        setup = of_device_get_match_data(&pdev->dev);
-       i2c_dev->setup->rise_time = setup->rise_time;
-       i2c_dev->setup->fall_time = setup->fall_time;
-       i2c_dev->setup->dnf = setup->dnf;
-       i2c_dev->setup->analog_filter = setup->analog_filter;
+       i2c_dev->setup = *setup;
 
        ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-rising-time-ns",
                                       &rise_time);
        if (!ret)
-               i2c_dev->setup->rise_time = rise_time;
+               i2c_dev->setup.rise_time = rise_time;
 
        ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-falling-time-ns",
                                       &fall_time);
        if (!ret)
-               i2c_dev->setup->fall_time = fall_time;
+               i2c_dev->setup.fall_time = fall_time;
 
-       ret = stm32f7_i2c_setup_timing(i2c_dev, i2c_dev->setup);
+       ret = stm32f7_i2c_setup_timing(i2c_dev, &i2c_dev->setup);
        if (ret)
                goto clk_free;
 
index 01b2adf..eaf39e5 100644 (file)
@@ -1451,6 +1451,7 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d,
                if (hwif_init(hwif) == 0) {
                        printk(KERN_INFO "%s: failed to initialize IDE "
                                         "interface\n", hwif->name);
+                       device_unregister(hwif->portdev);
                        device_unregister(&hwif->gendev);
                        ide_disable_port(hwif);
                        continue;
index 86aa88a..acf8748 100644 (file)
@@ -56,6 +56,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
 {
        struct list_head *l;
        struct pci_driver *d;
+       int ret;
 
        list_for_each(l, &ide_pci_drivers) {
                d = list_entry(l, struct pci_driver, node);
@@ -63,10 +64,14 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
                        const struct pci_device_id *id =
                                pci_match_id(d->id_table, dev);
 
-                       if (id != NULL && d->probe(dev, id) >= 0) {
-                               dev->driver = d;
-                               pci_dev_get(dev);
-                               return 1;
+                       if (id != NULL) {
+                               pci_assign_irq(dev);
+                               ret = d->probe(dev, id);
+                               if (ret >= 0) {
+                                       dev->driver = d;
+                                       pci_dev_get(dev);
+                                       return 1;
+                               }
                        }
                }
        }
index 112d2fe..fdc8e81 100644 (file)
@@ -179,6 +179,7 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise);
 /**
  *     ide_pci_enable  -       do PCI enables
  *     @dev: PCI device
+ *     @bars: PCI BARs mask
  *     @d: IDE port info
  *
  *     Enable the IDE PCI device. We attempt to enable the device in full
@@ -189,9 +190,10 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise);
  *     Returns zero on success or an error code
  */
 
-static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d)
+static int ide_pci_enable(struct pci_dev *dev, int bars,
+                         const struct ide_port_info *d)
 {
-       int ret, bars;
+       int ret;
 
        if (pci_enable_device(dev)) {
                ret = pci_enable_device_io(dev);
@@ -216,18 +218,6 @@ static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d)
                goto out;
        }
 
-       if (d->host_flags & IDE_HFLAG_SINGLE)
-               bars = (1 << 2) - 1;
-       else
-               bars = (1 << 4) - 1;
-
-       if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) {
-               if (d->host_flags & IDE_HFLAG_CS5520)
-                       bars |= (1 << 2);
-               else
-                       bars |= (1 << 4);
-       }
-
        ret = pci_request_selected_regions(dev, bars, d->name);
        if (ret < 0)
                printk(KERN_ERR "%s %s: can't reserve resources\n",
@@ -403,6 +393,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
 /**
  *     ide_setup_pci_controller        -       set up IDE PCI
  *     @dev: PCI device
+ *     @bars: PCI BARs mask
  *     @d: IDE port info
  *     @noisy: verbose flag
  *
@@ -411,7 +402,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d)
  *     and enables it if need be
  */
 
-static int ide_setup_pci_controller(struct pci_dev *dev,
+static int ide_setup_pci_controller(struct pci_dev *dev, int bars,
                                    const struct ide_port_info *d, int noisy)
 {
        int ret;
@@ -420,7 +411,7 @@ static int ide_setup_pci_controller(struct pci_dev *dev,
        if (noisy)
                ide_setup_pci_noise(dev, d);
 
-       ret = ide_pci_enable(dev, d);
+       ret = ide_pci_enable(dev, bars, d);
        if (ret < 0)
                goto out;
 
@@ -428,16 +419,20 @@ static int ide_setup_pci_controller(struct pci_dev *dev,
        if (ret < 0) {
                printk(KERN_ERR "%s %s: error accessing PCI regs\n",
                        d->name, pci_name(dev));
-               goto out;
+               goto out_free_bars;
        }
        if (!(pcicmd & PCI_COMMAND_IO)) {       /* is device disabled? */
                ret = ide_pci_configure(dev, d);
                if (ret < 0)
-                       goto out;
+                       goto out_free_bars;
                printk(KERN_INFO "%s %s: device enabled (Linux)\n",
                        d->name, pci_name(dev));
        }
 
+       goto out;
+
+out_free_bars:
+       pci_release_selected_regions(dev, bars);
 out:
        return ret;
 }
@@ -540,13 +535,28 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
 {
        struct pci_dev *pdev[] = { dev1, dev2 };
        struct ide_host *host;
-       int ret, i, n_ports = dev2 ? 4 : 2;
+       int ret, i, n_ports = dev2 ? 4 : 2, bars;
        struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL };
 
+       if (d->host_flags & IDE_HFLAG_SINGLE)
+               bars = (1 << 2) - 1;
+       else
+               bars = (1 << 4) - 1;
+
+       if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) {
+               if (d->host_flags & IDE_HFLAG_CS5520)
+                       bars |= (1 << 2);
+               else
+                       bars |= (1 << 4);
+       }
+
        for (i = 0; i < n_ports / 2; i++) {
-               ret = ide_setup_pci_controller(pdev[i], d, !i);
-               if (ret < 0)
+               ret = ide_setup_pci_controller(pdev[i], bars, d, !i);
+               if (ret < 0) {
+                       if (i == 1)
+                               pci_release_selected_regions(pdev[0], bars);
                        goto out;
+               }
 
                ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]);
        }
@@ -554,7 +564,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
        host = ide_host_alloc(d, hws, n_ports);
        if (host == NULL) {
                ret = -ENOMEM;
-               goto out;
+               goto out_free_bars;
        }
 
        host->dev[0] = &dev1->dev;
@@ -576,7 +586,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
                 * do_ide_setup_pci_device() on the first device!
                 */
                if (ret < 0)
-                       goto out;
+                       goto out_free_bars;
 
                /* fixup IRQ */
                if (ide_pci_is_in_compatibility_mode(pdev[i])) {
@@ -589,6 +599,13 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2,
        ret = ide_host_register(host, d, hws);
        if (ret)
                ide_host_free(host);
+       else
+               goto out;
+
+out_free_bars:
+       i = n_ports / 2;
+       while (i--)
+               pci_release_selected_regions(pdev[i], bars);
 out:
        return ret;
 }
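
The setup-pci changes hoist the BAR-mask computation out of ide_pci_enable() into ide_pci_init_two(), so the same mask feeds both pci_request_selected_regions() and the new pci_release_selected_regions() error paths. A standalone sketch of that mask computation; the flag names below are invented stand-ins for the IDE_HFLAG_* bits:

#include <stdio.h>

#define HFLAG_SINGLE  (1 << 0)
#define HFLAG_NO_DMA  (1 << 1)
#define HFLAG_CS5520  (1 << 2)

/* BARs 0-1 for single-channel controllers, BARs 0-3 otherwise, plus the
 * DMA BAR when DMA is used (BAR 2 for CS5520-style chips, BAR 4 for the rest). */
static int compute_bars(unsigned int host_flags)
{
        int bars;

        if (host_flags & HFLAG_SINGLE)
                bars = (1 << 2) - 1;            /* 0b0011 */
        else
                bars = (1 << 4) - 1;            /* 0b1111 */

        if (!(host_flags & HFLAG_NO_DMA)) {
                if (host_flags & HFLAG_CS5520)
                        bars |= (1 << 2);
                else
                        bars |= (1 << 4);
        }
        return bars;
}

int main(void)
{
        printf("%#x %#x\n", compute_bars(0), compute_bars(HFLAG_SINGLE));
        return 0;
}
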
index 30825bb..8861c05 100644 (file)
@@ -100,6 +100,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
        if (ret)
                goto pid_query_error;
 
+       nlmsg_end(skb, nlh);
+
        pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
                __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
 
@@ -170,6 +172,8 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
                                &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
        if (ret)
                goto add_mapping_error;
+
+       nlmsg_end(skb, nlh);
        nlmsg_request->req_buffer = pm_msg;
 
        ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
@@ -246,6 +250,8 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
                                &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
        if (ret)
                goto query_mapping_error;
+
+       nlmsg_end(skb, nlh);
        nlmsg_request->req_buffer = pm_msg;
 
        ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
@@ -308,6 +314,8 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
        if (ret)
                goto remove_mapping_error;
 
+       nlmsg_end(skb, nlh);
+
        ret = rdma_nl_unicast_wait(skb, iwpm_user_pid);
        if (ret) {
                skb = NULL; /* skb is freed in the netlink send-op handling */
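
The iwpm hunks call nlmsg_end() on each message before it is unicast or multicast, so the netlink header's total-length field reflects every attribute that was appended. A toy illustration of the append-then-finalize-length pattern, using plain structs rather than the netlink API:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg_hdr { uint32_t len; };

struct msg {
        struct msg_hdr hdr;
        unsigned char payload[60];
};

int main(void)
{
        struct msg m;
        size_t off = 0;
        const char attr[] = "mapping";

        memcpy(m.payload + off, attr, sizeof(attr));
        off += sizeof(attr);

        /* "nlmsg_end": only now is the total length valid; the hunks above
         * add exactly this finalize step before the message is sent. */
        m.hdr.len = (uint32_t)(sizeof(m.hdr) + off);

        printf("total length %u\n", (unsigned)m.hdr.len);
        return 0;
}
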
index c81c559..3c4faad 100644 (file)
@@ -597,6 +597,9 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
                                &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
        if (ret)
                goto mapinfo_num_error;
+
+       nlmsg_end(skb, nlh);
+
        ret = rdma_nl_unicast(skb, iwpm_pid);
        if (ret) {
                skb = NULL;
@@ -678,6 +681,8 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
                        if (ret)
                                goto send_mapping_info_unlock;
 
+                       nlmsg_end(skb, nlh);
+
                        iwpm_print_sockaddr(&map_info->local_sockaddr,
                                "send_mapping_info: Local sockaddr:");
                        iwpm_print_sockaddr(&map_info->mapped_sockaddr,
index d1f5345..42ca534 100644 (file)
@@ -48,7 +48,7 @@
  * @wqe: cqp wqe for header
  * @header: header for the cqp wqe
  */
-static inline void i40iw_insert_wqe_hdr(u64 *wqe, u64 header)
+void i40iw_insert_wqe_hdr(u64 *wqe, u64 header)
 {
        wmb();            /* make sure WQE is populated before polarity is set */
        set_64bit_val(wqe, 24, header);
index e217a12..5498ad0 100644 (file)
@@ -59,6 +59,8 @@ enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp,
                                                 struct i40iw_fast_reg_stag_info *info,
                                                 bool post_sq);
 
+void i40iw_insert_wqe_hdr(u64 *wqe, u64 header);
+
 /* HMC/FPM functions */
 enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev,
                                            u8 hmc_fn_id);
index c2cab20..59f7067 100644 (file)
@@ -123,12 +123,11 @@ static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx,
                get_64bit_val(wqe, 24, &offset24);
 
        offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID);
-       set_64bit_val(wqe, 24, offset24);
 
        set_64bit_val(wqe, 0, buf->mem.pa);
        set_64bit_val(wqe, 8,
                      LS_64(buf->mem.size, I40IWQPSQ_FRAG_LEN));
-       set_64bit_val(wqe, 24, offset24);
+       i40iw_insert_wqe_hdr(wqe, offset24);
 }
 
 /**
@@ -409,9 +408,7 @@ enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp,
        set_64bit_val(wqe, 8, LS_64(info->len, I40IWQPSQ_FRAG_LEN));
        set_64bit_val(wqe, 16, header[0]);
 
-       /* Ensure all data is written before writing valid bit */
-       wmb();
-       set_64bit_val(wqe, 24, header[1]);
+       i40iw_insert_wqe_hdr(wqe, header[1]);
 
        i40iw_debug_buf(qp->dev, I40IW_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32);
        i40iw_qp_post_wr(&qp->qp_uk);
@@ -539,7 +536,7 @@ static enum i40iw_status_code i40iw_puda_qp_wqe(struct i40iw_sc_dev *dev, struct
                 LS_64(2, I40IW_CQPSQ_QP_NEXTIWSTATE) |
                 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
 
-       set_64bit_val(wqe, 24, header);
+       i40iw_insert_wqe_hdr(wqe, header);
 
        i40iw_debug_buf(cqp->dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, 32);
        i40iw_sc_cqp_post_sq(cqp);
@@ -655,7 +652,7 @@ static enum i40iw_status_code i40iw_puda_cq_wqe(struct i40iw_sc_dev *dev, struct
            LS_64(1, I40IW_CQPSQ_CQ_ENCEQEMASK) |
            LS_64(1, I40IW_CQPSQ_CQ_CEQIDVALID) |
            LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
-       set_64bit_val(wqe, 24, header);
+       i40iw_insert_wqe_hdr(wqe, header);
 
        i40iw_debug_buf(dev, I40IW_DEBUG_PUDA, "PUDA CQE",
                        wqe, I40IW_CQP_WQE_SIZE * 8);
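
The i40iw patches funnel every header write through i40iw_insert_wqe_hdr(), which puts a write barrier between filling the WQE payload and storing the header word that carries the valid bit, so the hardware can never observe a valid header over stale payload. A loose userspace analogue using a C11 release store in place of wmb(); the structure layout and names here are invented:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy "WQE": payload words plus a header word whose top bit is the valid flag. */
struct wqe {
        uint64_t data[3];
        _Atomic uint64_t header;
};

#define WQE_VALID (1ull << 63)

/* Publish pattern: fill the payload first, then release-store the header so
 * a consumer that acquire-loads the header also sees the payload writes. */
static void insert_wqe_hdr(struct wqe *w, uint64_t header)
{
        atomic_store_explicit(&w->header, header | WQE_VALID,
                              memory_order_release);
}

int main(void)
{
        struct wqe w = { .data = {1, 2, 3} };

        insert_wqe_hdr(&w, 0x42);
        printf("header = 0x%llx\n",
               (unsigned long long)atomic_load(&w.header));
        return 0;
}
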
index 28b3d02..62be0a4 100644 (file)
@@ -826,12 +826,14 @@ static int i40iw_query_qp(struct ib_qp *ibqp,
        attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE;
        attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
        attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT;
+       attr->port_num = 1;
        init_attr->event_handler = iwqp->ibqp.event_handler;
        init_attr->qp_context = iwqp->ibqp.qp_context;
        init_attr->send_cq = iwqp->ibqp.send_cq;
        init_attr->recv_cq = iwqp->ibqp.recv_cq;
        init_attr->srq = iwqp->ibqp.srq;
        init_attr->cap = attr->cap;
+       init_attr->port_num = 1;
        return 0;
 }
 
index d6fbad8..552f7bd 100644 (file)
@@ -4174,9 +4174,9 @@ err_bfreg:
 err_uar_page:
        mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 
-err_cnt:
-       mlx5_ib_cleanup_cong_debugfs(dev);
 err_cong:
+       mlx5_ib_cleanup_cong_debugfs(dev);
+err_cnt:
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
                mlx5_ib_dealloc_counters(dev);
 
index b2bb42e..254083b 100644 (file)
@@ -387,7 +387,7 @@ struct qedr_qp {
                u8 wqe_size;
 
                u8 smac[ETH_ALEN];
-               u16 vlan_id;
+               u16 vlan;
                int rc;
        } *rqe_wr_id;
 
index 4689e80..ad89653 100644 (file)
@@ -105,7 +105,7 @@ void qedr_ll2_complete_rx_packet(void *cxt,
 
        qp->rqe_wr_id[qp->rq.gsi_cons].rc = data->u.data_length_error ?
                -EINVAL : 0;
-       qp->rqe_wr_id[qp->rq.gsi_cons].vlan_id = data->vlan;
+       qp->rqe_wr_id[qp->rq.gsi_cons].vlan = data->vlan;
        /* note: length stands for data length i.e. GRH is excluded */
        qp->rqe_wr_id[qp->rq.gsi_cons].sg_list[0].length =
                data->length.data_length;
@@ -694,6 +694,7 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
        struct qedr_cq *cq = get_qedr_cq(ibcq);
        struct qedr_qp *qp = dev->gsi_qp;
        unsigned long flags;
+       u16 vlan_id;
        int i = 0;
 
        spin_lock_irqsave(&cq->cq_lock, flags);
@@ -712,9 +713,14 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
                wc[i].wc_flags |= IB_WC_GRH | IB_WC_IP_CSUM_OK;
                ether_addr_copy(wc[i].smac, qp->rqe_wr_id[qp->rq.cons].smac);
                wc[i].wc_flags |= IB_WC_WITH_SMAC;
-               if (qp->rqe_wr_id[qp->rq.cons].vlan_id) {
+
+               vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan &
+                         VLAN_VID_MASK;
+               if (vlan_id) {
                        wc[i].wc_flags |= IB_WC_WITH_VLAN;
-                       wc[i].vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan_id;
+                       wc[i].vlan_id = vlan_id;
+                       wc[i].sl = (qp->rqe_wr_id[qp->rq.cons].vlan &
+                                   VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
                }
 
                qedr_inc_sw_cons(&qp->rq);
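
The qedr hunks keep the full 16-bit VLAN TCI from the completion and split it into VID and priority when filling the work completion. The masking and shifting is the standard 802.1Q layout; a self-contained example:

#include <stdint.h>
#include <stdio.h>

#define VLAN_VID_MASK   0x0fff
#define VLAN_PRIO_MASK  0xe000
#define VLAN_PRIO_SHIFT 13

int main(void)
{
        uint16_t tci  = 0x6064;         /* priority 3, VID 100 */
        uint16_t vid  = tci & VLAN_VID_MASK;
        uint16_t prio = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;

        printf("vid=%u prio=%u\n", (unsigned)vid, (unsigned)prio);
        return 0;
}
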
index 7d5286b..1841d03 100644 (file)
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(closure_put);
 void __closure_wake_up(struct closure_waitlist *wait_list)
 {
        struct llist_node *list;
-       struct closure *cl;
+       struct closure *cl, *t;
        struct llist_node *reverse = NULL;
 
        list = llist_del_all(&wait_list->list);
@@ -73,7 +73,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
        reverse = llist_reverse_order(list);
 
        /* Then do the wakeups */
-       llist_for_each_entry(cl, reverse, list) {
+       llist_for_each_entry_safe(cl, t, reverse, list) {
                closure_set_waiting(cl, 0);
                closure_sub(cl, CLOSURE_WAITING + 1);
        }
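
The closure fix switches to the _safe list iterator because closure_sub() may free the element while the walk is still in progress. The same hazard, and the fix of caching the next pointer before the loop body runs, shown on a plain singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *next;
        int id;
};

int main(void)
{
        /* Build a small list: 2 -> 1 -> 0. */
        struct node *head = NULL;
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->id = i;
                n->next = head;
                head = n;
        }

        /* Safe traversal: grab ->next before the body, so freeing the current
         * node (what the wakeup callback can trigger) is harmless. */
        for (struct node *n = head, *t; n; n = t) {
                t = n->next;
                printf("waking %d\n", n->id);
                free(n);
        }
        return 0;
}
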
index 5dba23c..dc9bc18 100644 (file)
@@ -219,8 +219,17 @@ int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags)
 
        down_read(&mm->mmap_sem);
 
-       for (dar = addr; dar < addr + size; dar += page_size) {
-               if (!vma || dar < vma->vm_start || dar > vma->vm_end) {
+       vma = find_vma(mm, addr);
+       if (!vma) {
+               pr_err("Can't find vma for addr %016llx\n", addr);
+               rc = -EFAULT;
+               goto out;
+       }
+       /* get the size of the pages allocated */
+       page_size = vma_kernel_pagesize(vma);
+
+       for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) {
+               if (dar < vma->vm_start || dar >= vma->vm_end) {
                        vma = find_vma(mm, addr);
                        if (!vma) {
                                pr_err("Can't find vma for addr %016llx\n", addr);
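
The cxllib change looks up the VMA once, takes its page size, and rounds the start address down to a page boundary before walking the range. The rounding is the usual power-of-two mask trick:

#include <stdint.h>
#include <stdio.h>

/* Round an address down to the start of its page; page_size must be a
 * power of two for the mask trick to be valid. */
static uint64_t page_align_down(uint64_t addr, uint64_t page_size)
{
        return addr & ~(page_size - 1);
}

int main(void)
{
        printf("%#llx\n",
               (unsigned long long)page_align_down(0x12345678, 0x10000));
        return 0;
}
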
index 29fc1e6..2ad7b5c 100644 (file)
@@ -1634,8 +1634,6 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq,
        }
 
        mqrq->areq.mrq = &brq->mrq;
-
-       mmc_queue_bounce_pre(mqrq);
 }
 
 static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
@@ -1829,7 +1827,6 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req)
                brq = &mq_rq->brq;
                old_req = mmc_queue_req_to_req(mq_rq);
                type = rq_data_dir(old_req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE;
-               mmc_queue_bounce_post(mq_rq);
 
                switch (status) {
                case MMC_BLK_SUCCESS:
index a7eb623..36217ad 100644 (file)
@@ -1286,6 +1286,23 @@ out_err:
        return err;
 }
 
+static void mmc_select_driver_type(struct mmc_card *card)
+{
+       int card_drv_type, drive_strength, drv_type;
+
+       card_drv_type = card->ext_csd.raw_driver_strength |
+                       mmc_driver_type_mask(0);
+
+       drive_strength = mmc_select_drive_strength(card,
+                                                  card->ext_csd.hs200_max_dtr,
+                                                  card_drv_type, &drv_type);
+
+       card->drive_strength = drive_strength;
+
+       if (drv_type)
+               mmc_set_driver_type(card->host, drv_type);
+}
+
 static int mmc_select_hs400es(struct mmc_card *card)
 {
        struct mmc_host *host = card->host;
@@ -1341,6 +1358,8 @@ static int mmc_select_hs400es(struct mmc_card *card)
                goto out_err;
        }
 
+       mmc_select_driver_type(card);
+
        /* Switch card to HS400 */
        val = EXT_CSD_TIMING_HS400 |
              card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
@@ -1374,23 +1393,6 @@ out_err:
        return err;
 }
 
-static void mmc_select_driver_type(struct mmc_card *card)
-{
-       int card_drv_type, drive_strength, drv_type;
-
-       card_drv_type = card->ext_csd.raw_driver_strength |
-                       mmc_driver_type_mask(0);
-
-       drive_strength = mmc_select_drive_strength(card,
-                                                  card->ext_csd.hs200_max_dtr,
-                                                  card_drv_type, &drv_type);
-
-       card->drive_strength = drive_strength;
-
-       if (drv_type)
-               mmc_set_driver_type(card->host, drv_type);
-}
-
 /*
  * For device supporting HS200 mode, the following sequence
  * should be done before executing the tuning process.
index 74c663b..0a4e77a 100644 (file)
@@ -23,8 +23,6 @@
 #include "core.h"
 #include "card.h"
 
-#define MMC_QUEUE_BOUNCESZ     65536
-
 /*
  * Prepare a MMC request. This just filters out odd stuff.
  */
@@ -150,26 +148,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
                queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
 }
 
-static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host)
-{
-       unsigned int bouncesz = MMC_QUEUE_BOUNCESZ;
-
-       if (host->max_segs != 1 || (host->caps & MMC_CAP_NO_BOUNCE_BUFF))
-               return 0;
-
-       if (bouncesz > host->max_req_size)
-               bouncesz = host->max_req_size;
-       if (bouncesz > host->max_seg_size)
-               bouncesz = host->max_seg_size;
-       if (bouncesz > host->max_blk_count * 512)
-               bouncesz = host->max_blk_count * 512;
-
-       if (bouncesz <= 512)
-               return 0;
-
-       return bouncesz;
-}
-
 /**
  * mmc_init_request() - initialize the MMC-specific per-request data
  * @q: the request queue
@@ -184,26 +162,9 @@ static int mmc_init_request(struct request_queue *q, struct request *req,
        struct mmc_card *card = mq->card;
        struct mmc_host *host = card->host;
 
-       if (card->bouncesz) {
-               mq_rq->bounce_buf = kmalloc(card->bouncesz, gfp);
-               if (!mq_rq->bounce_buf)
-                       return -ENOMEM;
-               if (card->bouncesz > 512) {
-                       mq_rq->sg = mmc_alloc_sg(1, gfp);
-                       if (!mq_rq->sg)
-                               return -ENOMEM;
-                       mq_rq->bounce_sg = mmc_alloc_sg(card->bouncesz / 512,
-                                                       gfp);
-                       if (!mq_rq->bounce_sg)
-                               return -ENOMEM;
-               }
-       } else {
-               mq_rq->bounce_buf = NULL;
-               mq_rq->bounce_sg = NULL;
-               mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp);
-               if (!mq_rq->sg)
-                       return -ENOMEM;
-       }
+       mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp);
+       if (!mq_rq->sg)
+               return -ENOMEM;
 
        return 0;
 }
@@ -212,13 +173,6 @@ static void mmc_exit_request(struct request_queue *q, struct request *req)
 {
        struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req);
 
-       /* It is OK to kfree(NULL) so this will be smooth */
-       kfree(mq_rq->bounce_sg);
-       mq_rq->bounce_sg = NULL;
-
-       kfree(mq_rq->bounce_buf);
-       mq_rq->bounce_buf = NULL;
-
        kfree(mq_rq->sg);
        mq_rq->sg = NULL;
 }
@@ -242,12 +196,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
        if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
                limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
 
-       /*
-        * mmc_init_request() depends on card->bouncesz so it must be calculated
-        * before blk_init_allocated_queue() starts allocating requests.
-        */
-       card->bouncesz = mmc_queue_calc_bouncesz(host);
-
        mq->card = card;
        mq->queue = blk_alloc_queue(GFP_KERNEL);
        if (!mq->queue)
@@ -271,17 +219,11 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
        if (mmc_can_erase(card))
                mmc_queue_setup_discard(mq->queue, card);
 
-       if (card->bouncesz) {
-               blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512);
-               blk_queue_max_segments(mq->queue, card->bouncesz / 512);
-               blk_queue_max_segment_size(mq->queue, card->bouncesz);
-       } else {
-               blk_queue_bounce_limit(mq->queue, limit);
-               blk_queue_max_hw_sectors(mq->queue,
-                       min(host->max_blk_count, host->max_req_size / 512));
-               blk_queue_max_segments(mq->queue, host->max_segs);
-               blk_queue_max_segment_size(mq->queue, host->max_seg_size);
-       }
+       blk_queue_bounce_limit(mq->queue, limit);
+       blk_queue_max_hw_sectors(mq->queue,
+               min(host->max_blk_count, host->max_req_size / 512));
+       blk_queue_max_segments(mq->queue, host->max_segs);
+       blk_queue_max_segment_size(mq->queue, host->max_seg_size);
 
        sema_init(&mq->thread_sem, 1);
 
@@ -370,56 +312,7 @@ void mmc_queue_resume(struct mmc_queue *mq)
  */
 unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq)
 {
-       unsigned int sg_len;
-       size_t buflen;
-       struct scatterlist *sg;
        struct request *req = mmc_queue_req_to_req(mqrq);
-       int i;
-
-       if (!mqrq->bounce_buf)
-               return blk_rq_map_sg(mq->queue, req, mqrq->sg);
-
-       sg_len = blk_rq_map_sg(mq->queue, req, mqrq->bounce_sg);
-
-       mqrq->bounce_sg_len = sg_len;
-
-       buflen = 0;
-       for_each_sg(mqrq->bounce_sg, sg, sg_len, i)
-               buflen += sg->length;
-
-       sg_init_one(mqrq->sg, mqrq->bounce_buf, buflen);
-
-       return 1;
-}
-
-/*
- * If writing, bounce the data to the buffer before the request
- * is sent to the host driver
- */
-void mmc_queue_bounce_pre(struct mmc_queue_req *mqrq)
-{
-       if (!mqrq->bounce_buf)
-               return;
-
-       if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != WRITE)
-               return;
-
-       sg_copy_to_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len,
-               mqrq->bounce_buf, mqrq->sg[0].length);
-}
-
-/*
- * If reading, bounce the data from the buffer after the request
- * has been handled by the host driver
- */
-void mmc_queue_bounce_post(struct mmc_queue_req *mqrq)
-{
-       if (!mqrq->bounce_buf)
-               return;
-
-       if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != READ)
-               return;
 
-       sg_copy_from_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len,
-               mqrq->bounce_buf, mqrq->sg[0].length);
+       return blk_rq_map_sg(mq->queue, req, mqrq->sg);
 }
index 04fc893..f18d3f6 100644 (file)
@@ -49,9 +49,6 @@ enum mmc_drv_op {
 struct mmc_queue_req {
        struct mmc_blk_request  brq;
        struct scatterlist      *sg;
-       char                    *bounce_buf;
-       struct scatterlist      *bounce_sg;
-       unsigned int            bounce_sg_len;
        struct mmc_async_req    areq;
        enum mmc_drv_op         drv_op;
        int                     drv_op_result;
@@ -81,11 +78,8 @@ extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);
-
 extern unsigned int mmc_queue_map_sg(struct mmc_queue *,
                                     struct mmc_queue_req *);
-extern void mmc_queue_bounce_pre(struct mmc_queue_req *);
-extern void mmc_queue_bounce_post(struct mmc_queue_req *);
 
 extern int mmc_access_rpmb(struct mmc_queue *);
 
index 27fb625..fbd29f0 100644 (file)
@@ -1038,7 +1038,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host)
         */
        mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED |
                     MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD |
-                    MMC_CAP_3_3V_DDR | MMC_CAP_NO_BOUNCE_BUFF;
+                    MMC_CAP_3_3V_DDR;
 
        if (host->use_sg)
                mmc->max_segs = 16;
index c885c2d..85745ef 100644 (file)
@@ -531,8 +531,7 @@ static int meson_mmc_clk_init(struct meson_host *host)
        div->shift = __ffs(CLK_DIV_MASK);
        div->width = __builtin_popcountl(CLK_DIV_MASK);
        div->hw.init = &init;
-       div->flags = (CLK_DIVIDER_ONE_BASED |
-                     CLK_DIVIDER_ROUND_CLOSEST);
+       div->flags = CLK_DIVIDER_ONE_BASED;
 
        clk = devm_clk_register(host->dev, &div->hw);
        if (WARN_ON(IS_ERR(clk)))
@@ -717,6 +716,22 @@ static int meson_mmc_clk_phase_tuning(struct mmc_host *mmc, u32 opcode,
 static int meson_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode)
 {
        struct meson_host *host = mmc_priv(mmc);
+       int ret;
+
+       /*
+        * If this is the initial tuning, try to get a sane Rx starting
+        * phase before doing the actual tuning.
+        */
+       if (!mmc->doing_retune) {
+               ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk);
+
+               if (ret)
+                       return ret;
+       }
+
+       ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->tx_clk);
+       if (ret)
+               return ret;
 
        return meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk);
 }
@@ -746,6 +761,11 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        case MMC_POWER_UP:
                if (!IS_ERR(mmc->supply.vmmc))
                        mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd);
+
+               /* Reset phases */
+               clk_set_phase(host->rx_clk, 0);
+               clk_set_phase(host->tx_clk, 270);
+
                break;
 
        case MMC_POWER_ON:
@@ -759,8 +779,6 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
                                host->vqmmc_enabled = true;
                }
 
-               /* Reset rx phase */
-               clk_set_phase(host->rx_clk, 0);
                break;
        }
 
index 59ab194..c763b40 100644 (file)
@@ -702,11 +702,7 @@ static int pxamci_probe(struct platform_device *pdev)
 
        pxamci_init_ocr(host);
 
-       /*
-        * This architecture used to disable bounce buffers through its
-        * defconfig, now it is done at runtime as a host property.
-        */
-       mmc->caps = MMC_CAP_NO_BOUNCE_BUFF;
+       mmc->caps = 0;
        host->cmdat = 0;
        if (!cpu_is_pxa25x()) {
                mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ;
index 2eec2e6..0842bbc 100644 (file)
@@ -466,6 +466,7 @@ static int xenon_probe(struct platform_device *pdev)
 {
        struct sdhci_pltfm_host *pltfm_host;
        struct sdhci_host *host;
+       struct xenon_priv *priv;
        int err;
 
        host = sdhci_pltfm_init(pdev, &sdhci_xenon_pdata,
@@ -474,6 +475,7 @@ static int xenon_probe(struct platform_device *pdev)
                return PTR_ERR(host);
 
        pltfm_host = sdhci_priv(host);
+       priv = sdhci_pltfm_priv(pltfm_host);
 
        /*
         * Link Xenon specific mmc_host_ops function,
@@ -491,9 +493,20 @@ static int xenon_probe(struct platform_device *pdev)
        if (err)
                goto free_pltfm;
 
+       priv->axi_clk = devm_clk_get(&pdev->dev, "axi");
+       if (IS_ERR(priv->axi_clk)) {
+               err = PTR_ERR(priv->axi_clk);
+               if (err == -EPROBE_DEFER)
+                       goto err_clk;
+       } else {
+               err = clk_prepare_enable(priv->axi_clk);
+               if (err)
+                       goto err_clk;
+       }
+
        err = mmc_of_parse(host->mmc);
        if (err)
-               goto err_clk;
+               goto err_clk_axi;
 
        sdhci_get_of_property(pdev);
 
@@ -502,11 +515,11 @@ static int xenon_probe(struct platform_device *pdev)
        /* Xenon specific dt parse */
        err = xenon_probe_dt(pdev);
        if (err)
-               goto err_clk;
+               goto err_clk_axi;
 
        err = xenon_sdhc_prepare(host);
        if (err)
-               goto err_clk;
+               goto err_clk_axi;
 
        pm_runtime_get_noresume(&pdev->dev);
        pm_runtime_set_active(&pdev->dev);
@@ -527,6 +540,8 @@ remove_sdhc:
        pm_runtime_disable(&pdev->dev);
        pm_runtime_put_noidle(&pdev->dev);
        xenon_sdhc_unprepare(host);
+err_clk_axi:
+       clk_disable_unprepare(priv->axi_clk);
 err_clk:
        clk_disable_unprepare(pltfm_host->clk);
 free_pltfm:
@@ -538,6 +553,7 @@ static int xenon_remove(struct platform_device *pdev)
 {
        struct sdhci_host *host = platform_get_drvdata(pdev);
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+       struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host);
 
        pm_runtime_get_sync(&pdev->dev);
        pm_runtime_disable(&pdev->dev);
@@ -546,7 +562,7 @@ static int xenon_remove(struct platform_device *pdev)
        sdhci_remove_host(host, 0);
 
        xenon_sdhc_unprepare(host);
-
+       clk_disable_unprepare(priv->axi_clk);
        clk_disable_unprepare(pltfm_host->clk);
 
        sdhci_pltfm_free(pdev);
index 2bc0510..9994995 100644 (file)
@@ -83,6 +83,7 @@ struct xenon_priv {
        unsigned char   bus_width;
        unsigned char   timing;
        unsigned int    clock;
+       struct clk      *axi_clk;
 
        int             phy_type;
        /*
index 9ca994d..3591077 100644 (file)
@@ -1074,11 +1074,6 @@ static void bnx2x_vf_set_bars(struct bnx2x *bp, struct bnx2x_virtf *vf)
        }
 }
 
-static int bnx2x_ari_enabled(struct pci_dev *dev)
-{
-       return dev->bus->self && dev->bus->self->ari_enabled;
-}
-
 static int
 bnx2x_get_vf_igu_cam_info(struct bnx2x *bp)
 {
@@ -1212,7 +1207,7 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param,
 
        err = -EIO;
        /* verify ari is enabled */
-       if (!bnx2x_ari_enabled(bp->pdev)) {
+       if (!pci_ari_enabled(bp->pdev->bus)) {
                BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n");
                return 0;
        }
index 4f0cb8e..457201f 100644 (file)
@@ -1,3 +1,4 @@
 obj-$(CONFIG_BNXT) += bnxt_en.o
 
-bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_tc.o
+bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o
+bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o
index 7dd3d13..4730c04 100644 (file)
@@ -23,8 +23,6 @@
 #include "bnxt_tc.h"
 #include "bnxt_vfr.h"
 
-#ifdef CONFIG_BNXT_FLOWER_OFFLOAD
-
 #define BNXT_FID_INVALID                       0xffff
 #define VLAN_TCI(vid, prio)    ((vid) | ((prio) << VLAN_PRIO_SHIFT))
 
@@ -833,6 +831,3 @@ void bnxt_shutdown_tc(struct bnxt *bp)
        rhashtable_destroy(&tc_info->flow_table);
        rhashtable_destroy(&tc_info->l2_table);
 }
-
-#else
-#endif
index d68478a..71989e1 100644 (file)
@@ -566,8 +566,10 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
                return true;
        default:
                bpf_warn_invalid_xdp_action(action);
+               /* fall through */
        case XDP_ABORTED:
                trace_xdp_exception(nic->netdev, prog, action);
+               /* fall through */
        case XDP_DROP:
                /* Check if it's a recycled page, if not
                 * unmap the DMA mapping.
index b65ce26..b3fd1f4 100644 (file)
@@ -8205,7 +8205,7 @@ struct flash_desc {
        u32 size_mb;
 };
 
-static int get_flash_params(struct adapter *adap)
+static int t4_get_flash_params(struct adapter *adap)
 {
        /* Table for non-Numonix supported flash parts.  Numonix parts are left
         * to the preexisting code.  All flash parts have 64KB sectors.
@@ -8214,40 +8214,136 @@ static int get_flash_params(struct adapter *adap)
                { 0x150201, 4 << 20 },       /* Spansion 4MB S25FL032P */
        };
 
+       unsigned int part, manufacturer;
+       unsigned int density, size;
+       u32 flashid = 0;
        int ret;
-       u32 info;
+
+       /* Issue a Read ID Command to the Flash part.  We decode supported
+        * Flash parts and their sizes from this.  There's a newer Query
+        * Command which can retrieve detailed geometry information but many
+        * Flash parts don't support it.
+        */
 
        ret = sf1_write(adap, 1, 1, 0, SF_RD_ID);
        if (!ret)
-               ret = sf1_read(adap, 3, 0, 1, &info);
+               ret = sf1_read(adap, 3, 0, 1, &flashid);
        t4_write_reg(adap, SF_OP_A, 0);                    /* unlock SF */
        if (ret)
                return ret;
 
-       for (ret = 0; ret < ARRAY_SIZE(supported_flash); ++ret)
-               if (supported_flash[ret].vendor_and_model_id == info) {
-                       adap->params.sf_size = supported_flash[ret].size_mb;
+       /* Check to see if it's one of our non-standard supported Flash parts.
+        */
+       for (part = 0; part < ARRAY_SIZE(supported_flash); part++)
+               if (supported_flash[part].vendor_and_model_id == flashid) {
+                       adap->params.sf_size = supported_flash[part].size_mb;
                        adap->params.sf_nsec =
                                adap->params.sf_size / SF_SEC_SIZE;
-                       return 0;
+                       goto found;
                }
 
-       if ((info & 0xff) != 0x20)             /* not a Numonix flash */
+       /* Decode Flash part size.  The code below looks repetitive with
+        * common encodings, but that's not guaranteed in the JEDEC
+        * specification for the Read JEDEC ID command.  The only thing that
+        * we're guaranteed by the JEDEC specification is where the
+        * Manufacturer ID is in the returned result.  After that each
+        * Manufacturer ~could~ encode things completely differently.
+        * Note, all Flash parts must have 64KB sectors.
+        */
+       manufacturer = flashid & 0xff;
+       switch (manufacturer) {
+       case 0x20: { /* Micron/Numonix */
+               /* This Density -> Size decoding table is taken from Micron
+                * Data Sheets.
+                */
+               density = (flashid >> 16) & 0xff;
+               switch (density) {
+               case 0x14: /* 1MB */
+                       size = 1 << 20;
+                       break;
+               case 0x15: /* 2MB */
+                       size = 1 << 21;
+                       break;
+               case 0x16: /* 4MB */
+                       size = 1 << 22;
+                       break;
+               case 0x17: /* 8MB */
+                       size = 1 << 23;
+                       break;
+               case 0x18: /* 16MB */
+                       size = 1 << 24;
+                       break;
+               case 0x19: /* 32MB */
+                       size = 1 << 25;
+                       break;
+               case 0x20: /* 64MB */
+                       size = 1 << 26;
+                       break;
+               case 0x21: /* 128MB */
+                       size = 1 << 27;
+                       break;
+               case 0x22: /* 256MB */
+                       size = 1 << 28;
+                       break;
+
+               default:
+                       dev_err(adap->pdev_dev, "Micron Flash Part has bad size, ID = %#x, Density code = %#x\n",
+                               flashid, density);
                return -EINVAL;
-       info >>= 16;                           /* log2 of size */
-       if (info >= 0x14 && info < 0x18)
-               adap->params.sf_nsec = 1 << (info - 16);
-       else if (info == 0x18)
-               adap->params.sf_nsec = 64;
-       else
+               }
+               break;
+       }
+       case 0xc2: { /* Macronix */
+               /* This Density -> Size decoding table is taken from Macronix
+                * Data Sheets.
+                */
+               density = (flashid >> 16) & 0xff;
+               switch (density) {
+               case 0x17: /* 8MB */
+                       size = 1 << 23;
+                       break;
+               case 0x18: /* 16MB */
+                       size = 1 << 24;
+                       break;
+               default:
+                       dev_err(adap->pdev_dev, "Macronix Flash Part has bad size, ID = %#x, Density code = %#x\n",
+                               flashid, density);
+               return -EINVAL;
+               }
+       }
+       case 0xef: { /* Winbond */
+               /* This Density -> Size decoding table is taken from Winbond
+                * Data Sheets.
+                */
+               density = (flashid >> 16) & 0xff;
+               switch (density) {
+               case 0x17: /* 8MB */
+                       size = 1 << 23;
+                       break;
+               case 0x18: /* 16MB */
+                       size = 1 << 24;
+                       break;
+               default:
+                       dev_err(adap->pdev_dev, "Winbond Flash Part has bad size, ID = %#x, Density code = %#x\n",
+                               flashid, density);
                return -EINVAL;
-       adap->params.sf_size = 1 << info;
-       adap->params.sf_fw_start =
-               t4_read_reg(adap, CIM_BOOT_CFG_A) & BOOTADDR_M;
+               }
+               break;
+       }
+       default:
+               dev_err(adap->pdev_dev, "Unsupported Flash Part, ID = %#x\n",
+                       flashid);
+               return -EINVAL;
+       }
+
+       /* Store decoded Flash size and fall through into vetting code. */
+       adap->params.sf_size = size;
+       adap->params.sf_nsec = size / SF_SEC_SIZE;
 
+found:
        if (adap->params.sf_size < FLASH_MIN_SIZE)
-               dev_warn(adap->pdev_dev, "WARNING!!! FLASH size %#x < %#x!!!\n",
-                        adap->params.sf_size, FLASH_MIN_SIZE);
+               dev_warn(adap->pdev_dev, "WARNING: Flash Part ID %#x, size %#x < %#x\n",
+                        flashid, adap->params.sf_size, FLASH_MIN_SIZE);
        return 0;
 }
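
For the Micron/Macronix/Winbond tables above, the density byte of the JEDEC ID is close to a log2-of-size encoding but diverges per vendor (Micron jumps from 0x19 = 32MB to 0x20 = 64MB), which is why the driver spells each mapping out explicitly instead of shifting by the raw code. A minimal sketch of the decode-and-store step, restricted to the density codes visible in the hunk above; the helper name and return convention are illustrative, not the driver's:

    /* Map a JEDEC density byte to a flash size in bytes and derive the
     * number of 64KB erase sectors, the same arithmetic the driver does
     * with SF_SEC_SIZE once a vendor table matches.
     */
    static int flash_density_to_size(unsigned char density,
                                     unsigned int *size, unsigned int *nsec)
    {
            switch (density) {
            case 0x17: *size = 1u << 23; break;     /*   8MB               */
            case 0x18: *size = 1u << 24; break;     /*  16MB               */
            case 0x19: *size = 1u << 25; break;     /*  32MB, Micron only  */
            case 0x20: *size = 1u << 26; break;     /*  64MB, Micron only  */
            case 0x21: *size = 1u << 27; break;     /* 128MB, Micron only  */
            case 0x22: *size = 1u << 28; break;     /* 256MB, Micron only  */
            default:
                    return -1;                      /* unsupported part    */
            }
            *nsec = *size / (64 * 1024);            /* 64KB erase sectors  */
            return 0;
    }

For an 8MB part this yields sf_size = 0x800000 and sf_nsec = 128, which the vetting code after the found: label then checks against FLASH_MIN_SIZE.
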
 
@@ -8285,7 +8381,7 @@ int t4_prep_adapter(struct adapter *adapter)
        get_pci_mode(adapter, &adapter->params.pci);
        pl_rev = REV_G(t4_read_reg(adapter, PL_REV_A));
 
-       ret = get_flash_params(adapter);
+       ret = t4_get_flash_params(adapter);
        if (ret < 0) {
                dev_err(adapter->pdev_dev, "error %d identifying flash\n", ret);
                return ret;
index 633e975..8c22bb8 100644 (file)
@@ -181,6 +181,8 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN
        CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */
        CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */
        CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */
+       CH_PCI_ID_TABLE_FENTRY(0x50aa), /* Custom T580-CR */
+       CH_PCI_ID_TABLE_FENTRY(0x50ab), /* Custom T520-CR */
 
        /* T6 adapters:
         */
index 9d7cb03..30000b6 100644 (file)
@@ -78,7 +78,7 @@ config HNS_ENET
 
 config HNS3
        tristate "Hisilicon Network Subsystem Support HNS3 (Framework)"
-    depends on PCI
+       depends on PCI
        ---help---
          This selects the framework support for Hisilicon Network Subsystem 3.
          This layer facilitates clients like ENET, RoCE and user-space ethernet
@@ -87,7 +87,7 @@ config HNS3
 
 config HNS3_HCLGE
        tristate "Hisilicon HNS3 HCLGE Acceleration Engine & Compatibility Layer Support"
-    depends on PCI_MSI
+       depends on PCI_MSI
        depends on HNS3
        ---help---
          This selects the HNS3_HCLGE network acceleration engine & its hardware
@@ -96,7 +96,7 @@ config HNS3_HCLGE
 
 config HNS3_ENET
        tristate "Hisilicon HNS3 Ethernet Device Support"
-    depends on 64BIT && PCI
+       depends on 64BIT && PCI
        depends on HNS3 && HNS3_HCLGE
        ---help---
          This selects the Ethernet Driver for Hisilicon Network Subsystem 3 for hip08
index c677530..575f50d 100644 (file)
@@ -339,6 +339,10 @@ struct hnae3_ae_ops {
                       u8 *hfunc);
        int (*set_rss)(struct hnae3_handle *handle, const u32 *indir,
                       const u8 *key, const u8 hfunc);
+       int (*set_rss_tuple)(struct hnae3_handle *handle,
+                            struct ethtool_rxnfc *cmd);
+       int (*get_rss_tuple)(struct hnae3_handle *handle,
+                            struct ethtool_rxnfc *cmd);
 
        int (*get_tc_size)(struct hnae3_handle *handle);
 
index 8b511e6..60960e5 100644 (file)
@@ -85,6 +85,15 @@ static int hclge_init_cmd_queue(struct hclge_dev *hdev, int ring_type)
        return 0;
 }
 
+void hclge_cmd_reuse_desc(struct hclge_desc *desc, bool is_read)
+{
+       desc->flag = cpu_to_le16(HCLGE_CMD_FLAG_NO_INTR | HCLGE_CMD_FLAG_IN);
+       if (is_read)
+               desc->flag |= cpu_to_le16(HCLGE_CMD_FLAG_WR);
+       else
+               desc->flag &= cpu_to_le16(~HCLGE_CMD_FLAG_WR);
+}
+
 void hclge_cmd_setup_basic_desc(struct hclge_desc *desc,
                                enum hclge_opcode_type opcode, bool is_read)
 {
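
hclge_cmd_reuse_desc() recycles a descriptor that has just completed: it rewrites only the direction flags and leaves the opcode and data[] words intact, whereas hclge_cmd_setup_basic_desc() would wipe the whole descriptor. That is what enables the read-modify-write sequence used by hclge_set_rss_tuple() further down in this patch; a trimmed sketch of that usage, with the same function and flag names as the hunks here:

    /* Read the current RSS input-tuple config, patch it, and write it
     * back with the same descriptor (error handling trimmed).
     */
    hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true);
    ret = hclge_cmd_send(&hdev->hw, &desc, 1);      /* fills desc.data[]      */
    if (ret)
            return ret;

    hclge_cmd_reuse_desc(&desc, false);             /* keep data, now a write */
    /* ... modify the tuple fields in desc.data ... */
    ret = hclge_cmd_send(&hdev->hw, &desc, 1);      /* push it back to IMP    */
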
@@ -208,7 +217,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
         * which will be used for hardware to write back
         */
        ntc = hw->cmq.csq.next_to_use;
-       opcode = desc[0].opcode;
+       opcode = le16_to_cpu(desc[0].opcode);
        while (handle < num) {
                desc_to_use = &hw->cmq.csq.desc[hw->cmq.csq.next_to_use];
                *desc_to_use = desc[handle];
@@ -225,7 +234,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
         * If the command is sync, wait for the firmware to write back,
         * if multi descriptors to be sent, use the first one to check
         */
-       if (HCLGE_SEND_SYNC(desc->flag)) {
+       if (HCLGE_SEND_SYNC(le16_to_cpu(desc->flag))) {
                do {
                        if (hclge_cmd_csq_done(hw))
                                break;
@@ -244,9 +253,9 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
                        pr_debug("Get cmd desc:\n");
 
                        if (likely(!hclge_is_special_opcode(opcode)))
-                               desc_ret = desc[handle].retval;
+                               desc_ret = le16_to_cpu(desc[handle].retval);
                        else
-                               desc_ret = desc[0].retval;
+                               desc_ret = le16_to_cpu(desc[0].retval);
 
                        if ((enum hclge_cmd_return_status)desc_ret ==
                            HCLGE_CMD_EXEC_SUCCESS)
@@ -276,15 +285,15 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
        return retval;
 }
 
-enum hclge_cmd_status hclge_cmd_query_firmware_version(struct hclge_hw *hw,
-                                                      u32 *version)
+static enum hclge_cmd_status hclge_cmd_query_firmware_version(
+               struct hclge_hw *hw, u32 *version)
 {
-       struct hclge_query_version *resp;
+       struct hclge_query_version_cmd *resp;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_FW_VER, 1);
-       resp = (struct hclge_query_version *)desc.data;
+       resp = (struct hclge_query_version_cmd *)desc.data;
 
        ret = hclge_cmd_send(hw, &desc, 1);
        if (!ret)
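
The le16_to_cpu()/le32_to_cpu() additions in this file all follow one rule: the descriptor fields (flag, opcode, retval, data[]) are little-endian as exchanged with the IMP firmware, so they are converted exactly once at the boundary and handled in CPU order everywhere else. On little-endian hosts the old code happened to work; on big-endian it would mis-read opcode and retval. A minimal sketch of the pattern, assuming the struct hclge_desc layout used throughout the driver:

    __le32 wire = desc.data[0];             /* raw word as written by firmware */
    u32 val = le32_to_cpu(wire);            /* convert once, on load           */

    val |= BIT(3);                          /* manipulate in CPU byte order    */
    desc.data[0] = cpu_to_le32(val);        /* convert once, on store          */
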
index 6b6d28e..b437334 100644 (file)
@@ -221,12 +221,12 @@ enum hclge_opcode_type {
 #define HCLGE_RCB_INIT_QUERY_TIMEOUT   10
 #define HCLGE_RCB_INIT_FLAG_EN_B       0
 #define HCLGE_RCB_INIT_FLAG_FINI_B     8
-struct hclge_config_rcb_init {
+struct hclge_config_rcb_init_cmd {
        __le16 rcb_init_flag;
        u8 rsv[22];
 };
 
-struct hclge_tqp_map {
+struct hclge_tqp_map_cmd {
        __le16 tqp_id;  /* Absolute tqp id for in this pf */
        u8 tqp_vf;      /* VF id */
 #define HCLGE_TQP_MAP_TYPE_PF          0
@@ -246,15 +246,15 @@ enum hclge_int_type {
        HCLGE_INT_EVENT,
 };
 
-struct hclge_ctrl_vector_chain {
+struct hclge_ctrl_vector_chain_cmd {
        u8 int_vector_id;
        u8 int_cause_num;
 #define HCLGE_INT_TYPE_S       0
-#define HCLGE_INT_TYPE_M       0x3
+#define HCLGE_INT_TYPE_M       GENMASK(1, 0)
 #define HCLGE_TQP_ID_S         2
-#define HCLGE_TQP_ID_M         (0x7ff << HCLGE_TQP_ID_S)
+#define HCLGE_TQP_ID_M         GENMASK(12, 2)
 #define HCLGE_INT_GL_IDX_S     13
-#define HCLGE_INT_GL_IDX_M     (0x3 << HCLGE_INT_GL_IDX_S)
+#define HCLGE_INT_GL_IDX_M     GENMASK(14, 13)
        __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD];
        u8 vfid;
        u8 rsv;
@@ -263,18 +263,18 @@ struct hclge_ctrl_vector_chain {
 #define HCLGE_TC_NUM           8
 #define HCLGE_TC0_PRI_BUF_EN_B 15 /* Bit 15 indicate enable or not */
 #define HCLGE_BUF_UNIT_S       7  /* Buf size is united by 128 bytes */
-struct hclge_tx_buff_alloc {
+struct hclge_tx_buff_alloc_cmd {
        __le16 tx_pkt_buff[HCLGE_TC_NUM];
        u8 tx_buff_rsv[8];
 };
 
-struct hclge_rx_priv_buff {
+struct hclge_rx_priv_buff_cmd {
        __le16 buf_num[HCLGE_TC_NUM];
        __le16 shared_buf;
        u8 rsv[6];
 };
 
-struct hclge_query_version {
+struct hclge_query_version_cmd {
        __le32 firmware;
        __le32 firmware_rsv[5];
 };
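
The mask conversions in this header are mechanical: GENMASK(h, l) produces a contiguous mask over bits l..h, so each new constant is bit-for-bit identical to the shifted literal it replaces while documenting the field width against its matching *_S shift. Spot checks for three of the conversions above, wrapped in a throwaway helper purely for illustration:

    /* Compile-time spot checks; GENMASK(h, l) covers bits l..h inclusive. */
    static inline void hclge_mask_spot_checks(void)
    {
            BUILD_BUG_ON(GENMASK(1, 0)   != 0x3);            /* HCLGE_INT_TYPE_M   */
            BUILD_BUG_ON(GENMASK(12, 2)  != (0x7ff << 2));   /* HCLGE_TQP_ID_M     */
            BUILD_BUG_ON(GENMASK(14, 13) != (0x3 << 13));    /* HCLGE_INT_GL_IDX_M */
    }
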
@@ -328,14 +328,14 @@ struct hclge_pkt_buf_alloc {
 };
 
 #define HCLGE_RX_COM_WL_EN_B   15
-struct hclge_rx_com_wl_buf {
+struct hclge_rx_com_wl_buf_cmd {
        __le16 high_wl;
        __le16 low_wl;
        u8 rsv[20];
 };
 
 #define HCLGE_RX_PKT_EN_B      15
-struct hclge_rx_pkt_buf {
+struct hclge_rx_pkt_buf_cmd {
        __le16 high_pkt;
        __le16 low_pkt;
        u8 rsv[20];
@@ -348,7 +348,7 @@ struct hclge_rx_pkt_buf {
 #define HCLGE_PF_MAC_NUM_MASK  0x3
 #define HCLGE_PF_STATE_MAIN    BIT(HCLGE_PF_STATE_MAIN_B)
 #define HCLGE_PF_STATE_DONE    BIT(HCLGE_PF_STATE_DONE_B)
-struct hclge_func_status {
+struct hclge_func_status_cmd {
        __le32  vf_rst_state[4];
        u8 pf_state;
        u8 mac_id;
@@ -359,7 +359,7 @@ struct hclge_func_status {
        u8 rsv[2];
 };
 
-struct hclge_pf_res {
+struct hclge_pf_res_cmd {
        __le16 tqp_num;
        __le16 buf_size;
        __le16 msixcap_localid_ba_nic;
@@ -372,30 +372,30 @@ struct hclge_pf_res {
 };
 
 #define HCLGE_CFG_OFFSET_S     0
-#define HCLGE_CFG_OFFSET_M     0xfffff /* Byte (8-10.3) */
+#define HCLGE_CFG_OFFSET_M     GENMASK(19, 0)
 #define HCLGE_CFG_RD_LEN_S     24
-#define HCLGE_CFG_RD_LEN_M     (0xf << HCLGE_CFG_RD_LEN_S)
+#define HCLGE_CFG_RD_LEN_M     GENMASK(27, 24)
 #define HCLGE_CFG_RD_LEN_BYTES 16
 #define HCLGE_CFG_RD_LEN_UNIT  4
 
 #define HCLGE_CFG_VMDQ_S       0
-#define HCLGE_CFG_VMDQ_M       (0xff << HCLGE_CFG_VMDQ_S)
+#define HCLGE_CFG_VMDQ_M       GENMASK(7, 0)
 #define HCLGE_CFG_TC_NUM_S     8
-#define HCLGE_CFG_TC_NUM_M     (0xff << HCLGE_CFG_TC_NUM_S)
+#define HCLGE_CFG_TC_NUM_M     GENMASK(15, 8)
 #define HCLGE_CFG_TQP_DESC_N_S 16
-#define HCLGE_CFG_TQP_DESC_N_M (0xffff << HCLGE_CFG_TQP_DESC_N_S)
+#define HCLGE_CFG_TQP_DESC_N_M GENMASK(31, 16)
 #define HCLGE_CFG_PHY_ADDR_S   0
-#define HCLGE_CFG_PHY_ADDR_M   (0x1f << HCLGE_CFG_PHY_ADDR_S)
+#define HCLGE_CFG_PHY_ADDR_M   GENMASK(4, 0)
 #define HCLGE_CFG_MEDIA_TP_S   8
-#define HCLGE_CFG_MEDIA_TP_M   (0xff << HCLGE_CFG_MEDIA_TP_S)
+#define HCLGE_CFG_MEDIA_TP_M   GENMASK(15, 8)
 #define HCLGE_CFG_RX_BUF_LEN_S 16
-#define HCLGE_CFG_RX_BUF_LEN_M (0xffff << HCLGE_CFG_RX_BUF_LEN_S)
+#define HCLGE_CFG_RX_BUF_LEN_M GENMASK(31, 16)
 #define HCLGE_CFG_MAC_ADDR_H_S 0
-#define HCLGE_CFG_MAC_ADDR_H_M (0xffff << HCLGE_CFG_MAC_ADDR_H_S)
+#define HCLGE_CFG_MAC_ADDR_H_M GENMASK(15, 0)
 #define HCLGE_CFG_DEFAULT_SPEED_S      16
-#define HCLGE_CFG_DEFAULT_SPEED_M      (0xff << HCLGE_CFG_DEFAULT_SPEED_S)
+#define HCLGE_CFG_DEFAULT_SPEED_M      GENMASK(23, 16)
 
-struct hclge_cfg_param {
+struct hclge_cfg_param_cmd {
        __le32 offset;
        __le32 rsv;
        __le32 param[4];
@@ -405,7 +405,7 @@ struct hclge_cfg_param {
 #define HCLGE_DESC_NUM         0x40
 
 #define HCLGE_ALLOC_VALID_B    0
-struct hclge_vf_num {
+struct hclge_vf_num_cmd {
        u8 alloc_valid;
        u8 rsv[23];
 };
@@ -413,13 +413,13 @@ struct hclge_vf_num {
 #define HCLGE_RSS_DEFAULT_OUTPORT_B    4
 #define HCLGE_RSS_HASH_KEY_OFFSET_B    4
 #define HCLGE_RSS_HASH_KEY_NUM         16
-struct hclge_rss_config {
+struct hclge_rss_config_cmd {
        u8 hash_config;
        u8 rsv[7];
        u8 hash_key[HCLGE_RSS_HASH_KEY_NUM];
 };
 
-struct hclge_rss_input_tuple {
+struct hclge_rss_input_tuple_cmd {
        u8 ipv4_tcp_en;
        u8 ipv4_udp_en;
        u8 ipv4_sctp_en;
@@ -433,26 +433,26 @@ struct hclge_rss_input_tuple {
 
 #define HCLGE_RSS_CFG_TBL_SIZE 16
 
-struct hclge_rss_indirection_table {
-       u16 start_table_index;
-       u16 rss_set_bitmap;
+struct hclge_rss_indirection_table_cmd {
+       __le16 start_table_index;
+       __le16 rss_set_bitmap;
        u8 rsv[4];
        u8 rss_result[HCLGE_RSS_CFG_TBL_SIZE];
 };
 
 #define HCLGE_RSS_TC_OFFSET_S          0
-#define HCLGE_RSS_TC_OFFSET_M          (0x3ff << HCLGE_RSS_TC_OFFSET_S)
+#define HCLGE_RSS_TC_OFFSET_M          GENMASK(9, 0)
 #define HCLGE_RSS_TC_SIZE_S            12
-#define HCLGE_RSS_TC_SIZE_M            (0x7 << HCLGE_RSS_TC_SIZE_S)
+#define HCLGE_RSS_TC_SIZE_M            GENMASK(14, 12)
 #define HCLGE_RSS_TC_VALID_B           15
-struct hclge_rss_tc_mode {
-       u16 rss_tc_mode[HCLGE_MAX_TC_NUM];
+struct hclge_rss_tc_mode_cmd {
+       __le16 rss_tc_mode[HCLGE_MAX_TC_NUM];
        u8 rsv[8];
 };
 
 #define HCLGE_LINK_STS_B       0
 #define HCLGE_LINK_STATUS      BIT(HCLGE_LINK_STS_B)
-struct hclge_link_status {
+struct hclge_link_status_cmd {
        u8 status;
        u8 rsv[23];
 };
@@ -467,7 +467,7 @@ struct hclge_promisc_param {
 #define HCLGE_PROMISC_EN_UC    0x1
 #define HCLGE_PROMISC_EN_MC    0x2
 #define HCLGE_PROMISC_EN_BC    0x4
-struct hclge_promisc_cfg {
+struct hclge_promisc_cfg_cmd {
        u8 flag;
        u8 vf_id;
        __le16 rsv0;
@@ -495,18 +495,18 @@ enum hclge_promisc_type {
 #define HCLGE_MAC_TX_UNDER_MIN_ERR_B           21
 #define HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B       22
 
-struct hclge_config_mac_mode {
+struct hclge_config_mac_mode_cmd {
        __le32 txrx_pad_fcs_loop_en;
        u8 rsv[20];
 };
 
 #define HCLGE_CFG_SPEED_S              0
-#define HCLGE_CFG_SPEED_M              (0x3f << HCLGE_CFG_SPEED_S)
+#define HCLGE_CFG_SPEED_M              GENMASK(5, 0)
 
 #define HCLGE_CFG_DUPLEX_B             7
 #define HCLGE_CFG_DUPLEX_M             BIT(HCLGE_CFG_DUPLEX_B)
 
-struct hclge_config_mac_speed_dup {
+struct hclge_config_mac_speed_dup_cmd {
        u8 speed_dup;
 
 #define HCLGE_CFG_MAC_SPEED_CHANGE_EN_B        0
@@ -518,17 +518,17 @@ struct hclge_config_mac_speed_dup {
 #define HCLGE_QUERY_AN_B               0
 #define HCLGE_QUERY_DUPLEX_B           2
 
-#define HCLGE_QUERY_SPEED_M            (0x1f << HCLGE_QUERY_SPEED_S)
+#define HCLGE_QUERY_SPEED_M            GENMASK(4, 0)
 #define HCLGE_QUERY_AN_M               BIT(HCLGE_QUERY_AN_B)
 #define HCLGE_QUERY_DUPLEX_M           BIT(HCLGE_QUERY_DUPLEX_B)
 
-struct hclge_query_an_speed_dup {
+struct hclge_query_an_speed_dup_cmd {
        u8 an_syn_dup_speed;
        u8 pause;
        u8 rsv[23];
 };
 
-#define HCLGE_RING_ID_MASK             0x3ff
+#define HCLGE_RING_ID_MASK             GENMASK(9, 0)
 #define HCLGE_TQP_ENABLE_B             0
 
 #define HCLGE_MAC_CFG_AN_EN_B          0
@@ -539,7 +539,7 @@ struct hclge_query_an_speed_dup {
 
 #define HCLGE_MAC_CFG_AN_EN    BIT(HCLGE_MAC_CFG_AN_EN_B)
 
-struct hclge_config_auto_neg {
+struct hclge_config_auto_neg_cmd {
        __le32  cfg_an_cmd_flag;
        u8      rsv[20];
 };
@@ -548,7 +548,7 @@ struct hclge_config_auto_neg {
 #define HCLGE_MAC_MAX_MTU              9728
 #define HCLGE_MAC_UPLINK_PORT          0x100
 
-struct hclge_config_max_frm_size {
+struct hclge_config_max_frm_size_cmd {
        __le16  max_frm_size;
        u8      rsv[22];
 };
@@ -565,10 +565,10 @@ enum hclge_mac_vlan_tbl_opcode {
 #define HCLGE_MAC_EPORT_SW_EN_B                0xc
 #define HCLGE_MAC_EPORT_TYPE_B         0xb
 #define HCLGE_MAC_EPORT_VFID_S         0x3
-#define HCLGE_MAC_EPORT_VFID_M         (0xff << HCLGE_MAC_EPORT_VFID_S)
+#define HCLGE_MAC_EPORT_VFID_M         GENMASK(10, 3)
 #define HCLGE_MAC_EPORT_PFID_S         0x0
-#define HCLGE_MAC_EPORT_PFID_M         (0x7 << HCLGE_MAC_EPORT_PFID_S)
-struct hclge_mac_vlan_tbl_entry {
+#define HCLGE_MAC_EPORT_PFID_M         GENMASK(2, 0)
+struct hclge_mac_vlan_tbl_entry_cmd {
        u8      flags;
        u8      resp_code;
        __le16  vlan_tag;
@@ -583,15 +583,15 @@ struct hclge_mac_vlan_tbl_entry {
 };
 
 #define HCLGE_CFG_MTA_MAC_SEL_S                0x0
-#define HCLGE_CFG_MTA_MAC_SEL_M                (0x3 << HCLGE_CFG_MTA_MAC_SEL_S)
+#define HCLGE_CFG_MTA_MAC_SEL_M                GENMASK(1, 0)
 #define HCLGE_CFG_MTA_MAC_EN_B         0x7
-struct hclge_mta_filter_mode {
+struct hclge_mta_filter_mode_cmd {
        u8      dmac_sel_en; /* Use lowest 2 bit as sel_mode, bit 7 as enable */
        u8      rsv[23];
 };
 
 #define HCLGE_CFG_FUNC_MTA_ACCEPT_B    0x0
-struct hclge_cfg_func_mta_filter {
+struct hclge_cfg_func_mta_filter_cmd {
        u8      accept; /* Only used lowest 1 bit */
        u8      function_id;
        u8      rsv[22];
@@ -599,14 +599,14 @@ struct hclge_cfg_func_mta_filter {
 
 #define HCLGE_CFG_MTA_ITEM_ACCEPT_B    0x0
 #define HCLGE_CFG_MTA_ITEM_IDX_S       0x0
-#define HCLGE_CFG_MTA_ITEM_IDX_M       (0xfff << HCLGE_CFG_MTA_ITEM_IDX_S)
-struct hclge_cfg_func_mta_item {
-       u16     item_idx; /* Only used lowest 12 bit */
+#define HCLGE_CFG_MTA_ITEM_IDX_M       GENMASK(11, 0)
+struct hclge_cfg_func_mta_item_cmd {
+       __le16  item_idx; /* Only used lowest 12 bit */
        u8      accept;   /* Only used lowest 1 bit */
        u8      rsv[21];
 };
 
-struct hclge_mac_vlan_add {
+struct hclge_mac_vlan_add_cmd {
        __le16  flags;
        __le16  mac_addr_hi16;
        __le32  mac_addr_lo32;
@@ -619,7 +619,7 @@ struct hclge_mac_vlan_add {
 };
 
 #define HNS3_MAC_VLAN_CFG_FLAG_BIT 0
-struct hclge_mac_vlan_remove {
+struct hclge_mac_vlan_remove_cmd {
        __le16  flags;
        __le16  mac_addr_hi16;
        __le32  mac_addr_lo32;
@@ -631,21 +631,21 @@ struct hclge_mac_vlan_remove {
        u8      rsv[4];
 };
 
-struct hclge_vlan_filter_ctrl {
+struct hclge_vlan_filter_ctrl_cmd {
        u8 vlan_type;
        u8 vlan_fe;
        u8 rsv[22];
 };
 
-struct hclge_vlan_filter_pf_cfg {
+struct hclge_vlan_filter_pf_cfg_cmd {
        u8 vlan_offset;
        u8 vlan_cfg;
        u8 rsv[2];
        u8 vlan_offset_bitmap[20];
 };
 
-struct hclge_vlan_filter_vf_cfg {
-       u16 vlan_id;
+struct hclge_vlan_filter_vf_cfg_cmd {
+       __le16 vlan_id;
        u8  resp_code;
        u8  rsv;
        u8  vlan_cfg;
@@ -653,14 +653,14 @@ struct hclge_vlan_filter_vf_cfg {
        u8  vf_bitmap[16];
 };
 
-struct hclge_cfg_com_tqp_queue {
+struct hclge_cfg_com_tqp_queue_cmd {
        __le16 tqp_id;
        __le16 stream_id;
        u8 enable;
        u8 rsv[19];
 };
 
-struct hclge_cfg_tx_queue_pointer {
+struct hclge_cfg_tx_queue_pointer_cmd {
        __le16 tqp_id;
        __le16 tx_tail;
        __le16 tx_head;
@@ -670,12 +670,12 @@ struct hclge_cfg_tx_queue_pointer {
 };
 
 #define HCLGE_TSO_MSS_MIN_S    0
-#define HCLGE_TSO_MSS_MIN_M    (0x3FFF << HCLGE_TSO_MSS_MIN_S)
+#define HCLGE_TSO_MSS_MIN_M    GENMASK(13, 0)
 
 #define HCLGE_TSO_MSS_MAX_S    16
-#define HCLGE_TSO_MSS_MAX_M    (0x3FFF << HCLGE_TSO_MSS_MAX_S)
+#define HCLGE_TSO_MSS_MAX_M    GENMASK(29, 16)
 
-struct hclge_cfg_tso_status {
+struct hclge_cfg_tso_status_cmd {
        __le16 tso_mss_min;
        __le16 tso_mss_max;
        u8 rsv[20];
@@ -685,7 +685,7 @@ struct hclge_cfg_tso_status {
 #define HCLGE_TSO_MSS_MAX      9668
 
 #define HCLGE_TQP_RESET_B      0
-struct hclge_reset_tqp_queue {
+struct hclge_reset_tqp_queue_cmd {
        __le16 tqp_id;
        u8 reset_req;
        u8 ready_to_reset;
@@ -739,6 +739,7 @@ struct hclge_hw;
 int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num);
 void hclge_cmd_setup_basic_desc(struct hclge_desc *desc,
                                enum hclge_opcode_type opcode, bool is_read);
+void hclge_cmd_reuse_desc(struct hclge_desc *desc, bool is_read);
 
 int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev,
                               struct hclge_promisc_param *param);
index 1a13614..c322b45 100644 (file)
@@ -362,7 +362,7 @@ static int hclge_64_bit_update_stats(struct hclge_dev *hdev)
 #define HCLGE_64_BIT_RTN_DATANUM 4
        u64 *data = (u64 *)(&hdev->hw_stats.all_64_bit_stats);
        struct hclge_desc desc[HCLGE_64_BIT_CMD_NUM];
-       u64 *desc_data;
+       __le64 *desc_data;
        int i, k, n;
        int ret;
 
@@ -376,14 +376,14 @@ static int hclge_64_bit_update_stats(struct hclge_dev *hdev)
 
        for (i = 0; i < HCLGE_64_BIT_CMD_NUM; i++) {
                if (unlikely(i == 0)) {
-                       desc_data = (u64 *)(&desc[i].data[0]);
+                       desc_data = (__le64 *)(&desc[i].data[0]);
                        n = HCLGE_64_BIT_RTN_DATANUM - 1;
                } else {
-                       desc_data = (u64 *)(&desc[i]);
+                       desc_data = (__le64 *)(&desc[i]);
                        n = HCLGE_64_BIT_RTN_DATANUM;
                }
                for (k = 0; k < n; k++) {
-                       *data++ += cpu_to_le64(*desc_data);
+                       *data++ += le64_to_cpu(*desc_data);
                        desc_data++;
                }
        }
@@ -411,7 +411,7 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev)
 
        struct hclge_desc desc[HCLGE_32_BIT_CMD_NUM];
        struct hclge_32_bit_stats *all_32_bit_stats;
-       u32 *desc_data;
+       __le32 *desc_data;
        int i, k, n;
        u64 *data;
        int ret;
@@ -431,21 +431,27 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev)
        hclge_reset_partial_32bit_counter(all_32_bit_stats);
        for (i = 0; i < HCLGE_32_BIT_CMD_NUM; i++) {
                if (unlikely(i == 0)) {
+                       __le16 *desc_data_16bit;
+
                        all_32_bit_stats->igu_rx_err_pkt +=
-                               cpu_to_le32(desc[i].data[0]);
+                               le32_to_cpu(desc[i].data[0]);
+
+                       desc_data_16bit = (__le16 *)&desc[i].data[1];
                        all_32_bit_stats->igu_rx_no_eof_pkt +=
-                               cpu_to_le32(desc[i].data[1] & 0xffff);
+                               le16_to_cpu(*desc_data_16bit);
+
+                       desc_data_16bit++;
                        all_32_bit_stats->igu_rx_no_sof_pkt +=
-                               cpu_to_le32((desc[i].data[1] >> 16) & 0xffff);
+                               le16_to_cpu(*desc_data_16bit);
 
-                       desc_data = (u32 *)(&desc[i].data[2]);
+                       desc_data = &desc[i].data[2];
                        n = HCLGE_32_BIT_RTN_DATANUM - 4;
                } else {
-                       desc_data = (u32 *)(&desc[i]);
+                       desc_data = (__le32 *)&desc[i];
                        n = HCLGE_32_BIT_RTN_DATANUM;
                }
                for (k = 0; k < n; k++) {
-                       *data++ += cpu_to_le32(*desc_data);
+                       *data++ += le32_to_cpu(*desc_data);
                        desc_data++;
                }
        }
@@ -460,7 +466,7 @@ static int hclge_mac_update_stats(struct hclge_dev *hdev)
 
        u64 *data = (u64 *)(&hdev->hw_stats.mac_stats);
        struct hclge_desc desc[HCLGE_MAC_CMD_NUM];
-       u64 *desc_data;
+       __le64 *desc_data;
        int i, k, n;
        int ret;
 
@@ -475,14 +481,14 @@ static int hclge_mac_update_stats(struct hclge_dev *hdev)
 
        for (i = 0; i < HCLGE_MAC_CMD_NUM; i++) {
                if (unlikely(i == 0)) {
-                       desc_data = (u64 *)(&desc[i].data[0]);
+                       desc_data = (__le64 *)(&desc[i].data[0]);
                        n = HCLGE_RTN_DATA_NUM - 2;
                } else {
-                       desc_data = (u64 *)(&desc[i]);
+                       desc_data = (__le64 *)(&desc[i]);
                        n = HCLGE_RTN_DATA_NUM;
                }
                for (k = 0; k < n; k++) {
-                       *data++ += cpu_to_le64(*desc_data);
+                       *data++ += le64_to_cpu(*desc_data);
                        desc_data++;
                }
        }
@@ -508,7 +514,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                                           HCLGE_OPC_QUERY_RX_STATUS,
                                           true);
 
-               desc[0].data[0] = (tqp->index & 0x1ff);
+               desc[0].data[0] = cpu_to_le32((tqp->index & 0x1ff));
                ret = hclge_cmd_send(&hdev->hw, desc, 1);
                if (ret) {
                        dev_err(&hdev->pdev->dev,
@@ -517,7 +523,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                        return ret;
                }
                tqp->tqp_stats.rcb_rx_ring_pktnum_rcd +=
-                       cpu_to_le32(desc[0].data[4]);
+                       le32_to_cpu(desc[0].data[4]);
        }
 
        for (i = 0; i < kinfo->num_tqps; i++) {
@@ -528,7 +534,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                                           HCLGE_OPC_QUERY_TX_STATUS,
                                           true);
 
-               desc[0].data[0] = (tqp->index & 0x1ff);
+               desc[0].data[0] = cpu_to_le32((tqp->index & 0x1ff));
                ret = hclge_cmd_send(&hdev->hw, desc, 1);
                if (ret) {
                        dev_err(&hdev->pdev->dev,
@@ -537,7 +543,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle)
                        return ret;
                }
                tqp->tqp_stats.rcb_tx_ring_pktnum_rcd +=
-                       cpu_to_le32(desc[0].data[4]);
+                       le32_to_cpu(desc[0].data[4]);
        }
 
        return 0;
@@ -552,12 +558,12 @@ static u64 *hclge_tqps_get_stats(struct hnae3_handle *handle, u64 *data)
 
        for (i = 0; i < kinfo->num_tqps; i++) {
                tqp = container_of(kinfo->tqp[i], struct hclge_tqp, q);
-               *buff++ = cpu_to_le64(tqp->tqp_stats.rcb_tx_ring_pktnum_rcd);
+               *buff++ = tqp->tqp_stats.rcb_tx_ring_pktnum_rcd;
        }
 
        for (i = 0; i < kinfo->num_tqps; i++) {
                tqp = container_of(kinfo->tqp[i], struct hclge_tqp, q);
-               *buff++ = cpu_to_le64(tqp->tqp_stats.rcb_rx_ring_pktnum_rcd);
+               *buff++ = tqp->tqp_stats.rcb_rx_ring_pktnum_rcd;
        }
 
        return buff;
@@ -820,7 +826,7 @@ static void hclge_get_stats(struct hnae3_handle *handle, u64 *data)
 }
 
 static int hclge_parse_func_status(struct hclge_dev *hdev,
-                                  struct hclge_func_status *status)
+                                  struct hclge_func_status_cmd *status)
 {
        if (!(status->pf_state & HCLGE_PF_STATE_DONE))
                return -EINVAL;
@@ -837,13 +843,13 @@ static int hclge_parse_func_status(struct hclge_dev *hdev,
 
 static int hclge_query_function_status(struct hclge_dev *hdev)
 {
-       struct hclge_func_status *req;
+       struct hclge_func_status_cmd *req;
        struct hclge_desc desc;
        int timeout = 0;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_FUNC_STATUS, true);
-       req = (struct hclge_func_status *)desc.data;
+       req = (struct hclge_func_status_cmd *)desc.data;
 
        do {
                ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -868,7 +874,7 @@ static int hclge_query_function_status(struct hclge_dev *hdev)
 
 static int hclge_query_pf_resource(struct hclge_dev *hdev)
 {
-       struct hclge_pf_res *req;
+       struct hclge_pf_res_cmd *req;
        struct hclge_desc desc;
        int ret;
 
@@ -880,7 +886,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev)
                return ret;
        }
 
-       req = (struct hclge_pf_res *)desc.data;
+       req = (struct hclge_pf_res_cmd *)desc.data;
        hdev->num_tqps = __le16_to_cpu(req->tqp_num);
        hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S;
 
@@ -938,12 +944,12 @@ static int hclge_parse_speed(int speed_cmd, int *speed)
 
 static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc)
 {
-       struct hclge_cfg_param *req;
+       struct hclge_cfg_param_cmd *req;
        u64 mac_addr_tmp_high;
        u64 mac_addr_tmp;
        int i;
 
-       req = (struct hclge_cfg_param *)desc[0].data;
+       req = (struct hclge_cfg_param_cmd *)desc[0].data;
 
        /* get the configuration */
        cfg->vmdq_vport_num = hnae_get_field(__le32_to_cpu(req->param[0]),
@@ -978,7 +984,7 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc)
        for (i = 0; i < ETH_ALEN; i++)
                cfg->mac_addr[i] = (mac_addr_tmp >> (8 * i)) & 0xff;
 
-       req = (struct hclge_cfg_param *)desc[1].data;
+       req = (struct hclge_cfg_param_cmd *)desc[1].data;
        cfg->numa_node_map = __le32_to_cpu(req->param[0]);
 }
 
@@ -989,20 +995,21 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc)
 static int hclge_get_cfg(struct hclge_dev *hdev, struct hclge_cfg *hcfg)
 {
        struct hclge_desc desc[HCLGE_PF_CFG_DESC_NUM];
-       struct hclge_cfg_param *req;
+       struct hclge_cfg_param_cmd *req;
        int i, ret;
 
        for (i = 0; i < HCLGE_PF_CFG_DESC_NUM; i++) {
-               req = (struct hclge_cfg_param *)desc[i].data;
+               u32 offset = 0;
+
+               req = (struct hclge_cfg_param_cmd *)desc[i].data;
                hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_GET_CFG_PARAM,
                                           true);
-               hnae_set_field(req->offset, HCLGE_CFG_OFFSET_M,
+               hnae_set_field(offset, HCLGE_CFG_OFFSET_M,
                               HCLGE_CFG_OFFSET_S, i * HCLGE_CFG_RD_LEN_BYTES);
                /* Len should be in units of 4 bytes when sent to hardware */
-               hnae_set_field(req->offset, HCLGE_CFG_RD_LEN_M,
-                              HCLGE_CFG_RD_LEN_S,
+               hnae_set_field(offset, HCLGE_CFG_RD_LEN_M, HCLGE_CFG_RD_LEN_S,
                               HCLGE_CFG_RD_LEN_BYTES / HCLGE_CFG_RD_LEN_UNIT);
-               req->offset = cpu_to_le32(req->offset);
+               req->offset = cpu_to_le32(offset);
        }
 
        ret = hclge_cmd_send(&hdev->hw, desc, HCLGE_PF_CFG_DESC_NUM);
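
The local offset variable above is the same endianness discipline applied to the bit-field helpers: hnae_set_field() edits its first argument in place in CPU byte order (roughly "clear the mask, then or in the shifted value"), so it must never be pointed at a __le32 member directly. The rewritten loop builds the word in a CPU-order temporary and converts it once on store. A paraphrased sketch of the idea; the macro body below is illustrative rather than the driver's exact definition, and byte_off is a made-up variable:

    /* Roughly what a set-field helper does: clear the field, or in the value. */
    #define set_field(origin, mask, shift, val)                     \
            do {                                                    \
                    (origin) &= ~(mask);                            \
                    (origin) |= ((val) << (shift)) & (mask);        \
            } while (0)

    u32 offset = 0;                                 /* CPU-order scratch word     */

    set_field(offset, HCLGE_CFG_OFFSET_M, HCLGE_CFG_OFFSET_S, byte_off);
    req->offset = cpu_to_le32(offset);              /* single conversion on store */
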
@@ -1099,16 +1106,23 @@ static int hclge_configure(struct hclge_dev *hdev)
 static int hclge_config_tso(struct hclge_dev *hdev, int tso_mss_min,
                            int tso_mss_max)
 {
-       struct hclge_cfg_tso_status *req;
+       struct hclge_cfg_tso_status_cmd *req;
        struct hclge_desc desc;
+       u16 tso_mss;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TSO_GENERIC_CONFIG, false);
 
-       req = (struct hclge_cfg_tso_status *)desc.data;
-       hnae_set_field(req->tso_mss_min, HCLGE_TSO_MSS_MIN_M,
+       req = (struct hclge_cfg_tso_status_cmd *)desc.data;
+
+       tso_mss = 0;
+       hnae_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M,
                       HCLGE_TSO_MSS_MIN_S, tso_mss_min);
-       hnae_set_field(req->tso_mss_max, HCLGE_TSO_MSS_MIN_M,
+       req->tso_mss_min = cpu_to_le16(tso_mss);
+
+       tso_mss = 0;
+       hnae_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M,
                       HCLGE_TSO_MSS_MIN_S, tso_mss_max);
+       req->tso_mss_max = cpu_to_le16(tso_mss);
 
        return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
@@ -1144,15 +1158,15 @@ static int hclge_alloc_tqps(struct hclge_dev *hdev)
 static int hclge_map_tqps_to_func(struct hclge_dev *hdev, u16 func_id,
                                  u16 tqp_pid, u16 tqp_vid, bool is_pf)
 {
-       struct hclge_tqp_map *req;
+       struct hclge_tqp_map_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_SET_TQP_MAP, false);
 
-       req = (struct hclge_tqp_map *)desc.data;
+       req = (struct hclge_tqp_map_cmd *)desc.data;
        req->tqp_id = cpu_to_le16(tqp_pid);
-       req->tqp_vf = cpu_to_le16(func_id);
+       req->tqp_vf = func_id;
        req->tqp_flag = !is_pf << HCLGE_TQP_MAP_TYPE_B |
                        1 << HCLGE_TQP_MAP_EN_B;
        req->tqp_vid = cpu_to_le16(tqp_vid);
@@ -1340,12 +1354,12 @@ static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev,
 /* TX buffer size is unit by 128 byte */
 #define HCLGE_BUF_SIZE_UNIT_SHIFT      7
 #define HCLGE_BUF_SIZE_UPDATE_EN_MSK   BIT(15)
-       struct hclge_tx_buff_alloc *req;
+       struct hclge_tx_buff_alloc_cmd *req;
        struct hclge_desc desc;
        int ret;
        u8 i;
 
-       req = (struct hclge_tx_buff_alloc *)desc.data;
+       req = (struct hclge_tx_buff_alloc_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
        for (i = 0; i < HCLGE_TC_NUM; i++) {
@@ -1536,8 +1550,8 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev,
  * @buf_alloc: pointer to buffer calculation data
  * @return: 0: calculation successful, negative: fail
  */
-int hclge_rx_buffer_calc(struct hclge_dev *hdev,
-                        struct hclge_pkt_buf_alloc *buf_alloc)
+static int hclge_rx_buffer_calc(struct hclge_dev *hdev,
+                               struct hclge_pkt_buf_alloc *buf_alloc)
 {
        u32 rx_all = hdev->pkt_buf_size;
        int no_pfc_priv_num, pfc_priv_num;
@@ -1672,13 +1686,13 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev,
 static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
                                   struct hclge_pkt_buf_alloc *buf_alloc)
 {
-       struct hclge_rx_priv_buff *req;
+       struct hclge_rx_priv_buff_cmd *req;
        struct hclge_desc desc;
        int ret;
        int i;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RX_PRIV_BUFF_ALLOC, false);
-       req = (struct hclge_rx_priv_buff *)desc.data;
+       req = (struct hclge_rx_priv_buff_cmd *)desc.data;
 
        /* Alloc private buffer TCs */
        for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
@@ -1687,7 +1701,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev,
                req->buf_num[i] =
                        cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S);
                req->buf_num[i] |=
-                       cpu_to_le16(true << HCLGE_TC0_PRI_BUF_EN_B);
+                       cpu_to_le16(1 << HCLGE_TC0_PRI_BUF_EN_B);
        }
 
        req->shared_buf =
@@ -2000,11 +2014,11 @@ static void hclge_check_speed_dup(struct hclge_dev *hdev, int duplex, int speed)
 
 int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex)
 {
-       struct hclge_config_mac_speed_dup *req;
+       struct hclge_config_mac_speed_dup_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_config_mac_speed_dup *)desc.data;
+       req = (struct hclge_config_mac_speed_dup_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_SPEED_DUP, false);
 
@@ -2075,12 +2089,12 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed,
 static int hclge_query_mac_an_speed_dup(struct hclge_dev *hdev, int *speed,
                                        u8 *duplex)
 {
-       struct hclge_query_an_speed_dup *req;
+       struct hclge_query_an_speed_dup_cmd *req;
        struct hclge_desc desc;
        int speed_tmp;
        int ret;
 
-       req = (struct hclge_query_an_speed_dup *)desc.data;
+       req = (struct hclge_query_an_speed_dup_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_AN_RESULT, true);
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -2108,11 +2122,11 @@ static int hclge_query_mac_an_speed_dup(struct hclge_dev *hdev, int *speed,
 static int hclge_query_autoneg_result(struct hclge_dev *hdev)
 {
        struct hclge_mac *mac = &hdev->hw.mac;
-       struct hclge_query_an_speed_dup *req;
+       struct hclge_query_an_speed_dup_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_query_an_speed_dup *)desc.data;
+       req = (struct hclge_query_an_speed_dup_cmd *)desc.data;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_AN_RESULT, true);
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -2129,14 +2143,16 @@ static int hclge_query_autoneg_result(struct hclge_dev *hdev)
 
 static int hclge_set_autoneg_en(struct hclge_dev *hdev, bool enable)
 {
-       struct hclge_config_auto_neg *req;
+       struct hclge_config_auto_neg_cmd *req;
        struct hclge_desc desc;
+       u32 flag = 0;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_AN_MODE, false);
 
-       req = (struct hclge_config_auto_neg *)desc.data;
-       hnae_set_bit(req->cfg_an_cmd_flag, HCLGE_MAC_CFG_AN_EN_B, !!enable);
+       req = (struct hclge_config_auto_neg_cmd *)desc.data;
+       hnae_set_bit(flag, HCLGE_MAC_CFG_AN_EN_B, !!enable);
+       req->cfg_an_cmd_flag = cpu_to_le32(flag);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret) {
@@ -2214,7 +2230,7 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
 
 static int hclge_get_mac_link_status(struct hclge_dev *hdev)
 {
-       struct hclge_link_status *req;
+       struct hclge_link_status_cmd *req;
        struct hclge_desc desc;
        int link_status;
        int ret;
@@ -2227,7 +2243,7 @@ static int hclge_get_mac_link_status(struct hclge_dev *hdev)
                return ret;
        }
 
-       req = (struct hclge_link_status *)desc.data;
+       req = (struct hclge_link_status_cmd *)desc.data;
        link_status = req->status & HCLGE_LINK_STATUS;
 
        return !!link_status;
@@ -2451,7 +2467,7 @@ static u32 hclge_get_rss_indir_size(struct hnae3_handle *handle)
 
 static int hclge_get_rss_algo(struct hclge_dev *hdev)
 {
-       struct hclge_rss_config *req;
+       struct hclge_rss_config_cmd *req;
        struct hclge_desc desc;
        int rss_hash_algo;
        int ret;
@@ -2465,7 +2481,7 @@ static int hclge_get_rss_algo(struct hclge_dev *hdev)
                return ret;
        }
 
-       req = (struct hclge_rss_config *)desc.data;
+       req = (struct hclge_rss_config_cmd *)desc.data;
        rss_hash_algo = (req->hash_config & HCLGE_RSS_HASH_ALGO_MASK);
 
        if (rss_hash_algo == HCLGE_RSS_HASH_ALGO_TOEPLITZ)
@@ -2477,13 +2493,13 @@ static int hclge_get_rss_algo(struct hclge_dev *hdev)
 static int hclge_set_rss_algo_key(struct hclge_dev *hdev,
                                  const u8 hfunc, const u8 *key)
 {
-       struct hclge_rss_config *req;
+       struct hclge_rss_config_cmd *req;
        struct hclge_desc desc;
        int key_offset;
        int key_size;
        int ret;
 
-       req = (struct hclge_rss_config *)desc.data;
+       req = (struct hclge_rss_config_cmd *)desc.data;
 
        for (key_offset = 0; key_offset < 3; key_offset++) {
                hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_GENERIC_CONFIG,
@@ -2514,19 +2530,20 @@ static int hclge_set_rss_algo_key(struct hclge_dev *hdev,
 
 static int hclge_set_rss_indir_table(struct hclge_dev *hdev, const u32 *indir)
 {
-       struct hclge_rss_indirection_table *req;
+       struct hclge_rss_indirection_table_cmd *req;
        struct hclge_desc desc;
        int i, j;
        int ret;
 
-       req = (struct hclge_rss_indirection_table *)desc.data;
+       req = (struct hclge_rss_indirection_table_cmd *)desc.data;
 
        for (i = 0; i < HCLGE_RSS_CFG_TBL_NUM; i++) {
                hclge_cmd_setup_basic_desc
                        (&desc, HCLGE_OPC_RSS_INDIR_TABLE, false);
 
-               req->start_table_index = i * HCLGE_RSS_CFG_TBL_SIZE;
-               req->rss_set_bitmap = HCLGE_RSS_SET_BITMAP_MSK;
+               req->start_table_index =
+                       cpu_to_le16(i * HCLGE_RSS_CFG_TBL_SIZE);
+               req->rss_set_bitmap = cpu_to_le16(HCLGE_RSS_SET_BITMAP_MSK);
 
                for (j = 0; j < HCLGE_RSS_CFG_TBL_SIZE; j++)
                        req->rss_result[j] =
@@ -2546,21 +2563,24 @@ static int hclge_set_rss_indir_table(struct hclge_dev *hdev, const u32 *indir)
 static int hclge_set_rss_tc_mode(struct hclge_dev *hdev, u16 *tc_valid,
                                 u16 *tc_size, u16 *tc_offset)
 {
-       struct hclge_rss_tc_mode *req;
+       struct hclge_rss_tc_mode_cmd *req;
        struct hclge_desc desc;
        int ret;
        int i;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_TC_MODE, false);
-       req = (struct hclge_rss_tc_mode *)desc.data;
+       req = (struct hclge_rss_tc_mode_cmd *)desc.data;
 
        for (i = 0; i < HCLGE_MAX_TC_NUM; i++) {
-               hnae_set_bit(req->rss_tc_mode[i], HCLGE_RSS_TC_VALID_B,
-                            (tc_valid[i] & 0x1));
-               hnae_set_field(req->rss_tc_mode[i], HCLGE_RSS_TC_SIZE_M,
+               u16 mode = 0;
+
+               hnae_set_bit(mode, HCLGE_RSS_TC_VALID_B, (tc_valid[i] & 0x1));
+               hnae_set_field(mode, HCLGE_RSS_TC_SIZE_M,
                               HCLGE_RSS_TC_SIZE_S, tc_size[i]);
-               hnae_set_field(req->rss_tc_mode[i], HCLGE_RSS_TC_OFFSET_M,
+               hnae_set_field(mode, HCLGE_RSS_TC_OFFSET_M,
                               HCLGE_RSS_TC_OFFSET_S, tc_offset[i]);
+
+               req->rss_tc_mode[i] = cpu_to_le16(mode);
        }
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -2575,15 +2595,13 @@ static int hclge_set_rss_tc_mode(struct hclge_dev *hdev, u16 *tc_valid,
 
 static int hclge_set_rss_input_tuple(struct hclge_dev *hdev)
 {
-#define HCLGE_RSS_INPUT_TUPLE_OTHER            0xf
-#define HCLGE_RSS_INPUT_TUPLE_SCTP             0x1f
-       struct hclge_rss_input_tuple *req;
+       struct hclge_rss_input_tuple_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, false);
 
-       req = (struct hclge_rss_input_tuple *)desc.data;
+       req = (struct hclge_rss_input_tuple_cmd *)desc.data;
        req->ipv4_tcp_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
        req->ipv4_udp_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
        req->ipv4_sctp_en = HCLGE_RSS_INPUT_TUPLE_SCTP;
@@ -2657,6 +2675,161 @@ static int hclge_set_rss(struct hnae3_handle *handle, const u32 *indir,
        return ret;
 }
 
+static u8 hclge_get_rss_hash_bits(struct ethtool_rxnfc *nfc)
+{
+       u8 hash_sets = nfc->data & RXH_L4_B_0_1 ? HCLGE_S_PORT_BIT : 0;
+
+       if (nfc->data & RXH_L4_B_2_3)
+               hash_sets |= HCLGE_D_PORT_BIT;
+       else
+               hash_sets &= ~HCLGE_D_PORT_BIT;
+
+       if (nfc->data & RXH_IP_SRC)
+               hash_sets |= HCLGE_S_IP_BIT;
+       else
+               hash_sets &= ~HCLGE_S_IP_BIT;
+
+       if (nfc->data & RXH_IP_DST)
+               hash_sets |= HCLGE_D_IP_BIT;
+       else
+               hash_sets &= ~HCLGE_D_IP_BIT;
+
+       if (nfc->flow_type == SCTP_V4_FLOW || nfc->flow_type == SCTP_V6_FLOW)
+               hash_sets |= HCLGE_V_TAG_BIT;
+
+       return hash_sets;
+}
+
+static int hclge_set_rss_tuple(struct hnae3_handle *handle,
+                              struct ethtool_rxnfc *nfc)
+{
+       struct hclge_vport *vport = hclge_get_vport(handle);
+       struct hclge_dev *hdev = vport->back;
+       struct hclge_rss_input_tuple_cmd *req;
+       struct hclge_desc desc;
+       u8 tuple_sets;
+       int ret;
+
+       if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST |
+                         RXH_L4_B_0_1 | RXH_L4_B_2_3))
+               return -EINVAL;
+
+       req = (struct hclge_rss_input_tuple_cmd *)desc.data;
+       hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true);
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret) {
+               dev_err(&hdev->pdev->dev,
+                       "Read rss tuple fail, status = %d\n", ret);
+               return ret;
+       }
+
+       hclge_cmd_reuse_desc(&desc, false);
+
+       tuple_sets = hclge_get_rss_hash_bits(nfc);
+       switch (nfc->flow_type) {
+       case TCP_V4_FLOW:
+               req->ipv4_tcp_en = tuple_sets;
+               break;
+       case TCP_V6_FLOW:
+               req->ipv6_tcp_en = tuple_sets;
+               break;
+       case UDP_V4_FLOW:
+               req->ipv4_udp_en = tuple_sets;
+               break;
+       case UDP_V6_FLOW:
+               req->ipv6_udp_en = tuple_sets;
+               break;
+       case SCTP_V4_FLOW:
+               req->ipv4_sctp_en = tuple_sets;
+               break;
+       case SCTP_V6_FLOW:
+               if ((nfc->data & RXH_L4_B_0_1) ||
+                   (nfc->data & RXH_L4_B_2_3))
+                       return -EINVAL;
+
+               req->ipv6_sctp_en = tuple_sets;
+               break;
+       case IPV4_FLOW:
+               req->ipv4_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
+               break;
+       case IPV6_FLOW:
+               req->ipv6_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret)
+               dev_err(&hdev->pdev->dev,
+                       "Set rss tuple fail, status = %d\n", ret);
+
+       return ret;
+}
+
+static int hclge_get_rss_tuple(struct hnae3_handle *handle,
+                              struct ethtool_rxnfc *nfc)
+{
+       struct hclge_vport *vport = hclge_get_vport(handle);
+       struct hclge_dev *hdev = vport->back;
+       struct hclge_rss_input_tuple_cmd *req;
+       struct hclge_desc desc;
+       u8 tuple_sets;
+       int ret;
+
+       nfc->data = 0;
+
+       req = (struct hclge_rss_input_tuple_cmd *)desc.data;
+       hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true);
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret) {
+               dev_err(&hdev->pdev->dev,
+                       "Read rss tuple fail, status = %d\n", ret);
+               return ret;
+       }
+
+       switch (nfc->flow_type) {
+       case TCP_V4_FLOW:
+               tuple_sets = req->ipv4_tcp_en;
+               break;
+       case UDP_V4_FLOW:
+               tuple_sets = req->ipv4_udp_en;
+               break;
+       case TCP_V6_FLOW:
+               tuple_sets = req->ipv6_tcp_en;
+               break;
+       case UDP_V6_FLOW:
+               tuple_sets = req->ipv6_udp_en;
+               break;
+       case SCTP_V4_FLOW:
+               tuple_sets = req->ipv4_sctp_en;
+               break;
+       case SCTP_V6_FLOW:
+               tuple_sets = req->ipv6_sctp_en;
+               break;
+       case IPV4_FLOW:
+       case IPV6_FLOW:
+               tuple_sets = HCLGE_S_IP_BIT | HCLGE_D_IP_BIT;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (!tuple_sets)
+               return 0;
+
+       if (tuple_sets & HCLGE_D_PORT_BIT)
+               nfc->data |= RXH_L4_B_2_3;
+       if (tuple_sets & HCLGE_S_PORT_BIT)
+               nfc->data |= RXH_L4_B_0_1;
+       if (tuple_sets & HCLGE_D_IP_BIT)
+               nfc->data |= RXH_IP_DST;
+       if (tuple_sets & HCLGE_S_IP_BIT)
+               nfc->data |= RXH_IP_SRC;
+
+       return 0;
+}
+
 static int hclge_get_tc_size(struct hnae3_handle *handle)
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
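
hclge_get_rss_hash_bits() translates ethtool's RXH_* flag mask into the per-flow-type tuple bits the firmware understands (source/destination IP, source/destination L4 port, plus the VLAN-tag bit that is always set for SCTP flows), and the two new functions wire that into the set_rss_tuple/get_rss_tuple ops declared earlier. From user space this is driven by something like `ethtool -N eth0 rx-flow-hash tcp4 sdfn`; a sketch of the in-kernel request such a command produces, where the device name and exact flag set are only an example:

    /* Hash TCP/IPv4 flows on source/destination IP and L4 ports; this is
     * the ethtool_rxnfc that ends up in hclge_set_rss_tuple().
     */
    struct ethtool_rxnfc cmd = {
            .cmd       = ETHTOOL_SRXFH,
            .flow_type = TCP_V4_FLOW,
            .data      = RXH_IP_SRC | RXH_IP_DST |
                         RXH_L4_B_0_1 | RXH_L4_B_2_3,
    };
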
@@ -2750,7 +2923,7 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id,
                                   struct hnae3_ring_chain_node *ring_chain)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_ctrl_vector_chain *req;
+       struct hclge_ctrl_vector_chain_cmd *req;
        struct hnae3_ring_chain_node *node;
        struct hclge_desc desc;
        int ret;
@@ -2758,20 +2931,21 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id,
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_ADD_RING_TO_VECTOR, false);
 
-       req = (struct hclge_ctrl_vector_chain *)desc.data;
+       req = (struct hclge_ctrl_vector_chain_cmd *)desc.data;
        req->int_vector_id = vector_id;
 
        i = 0;
        for (node = ring_chain; node; node = node->next) {
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_TYPE_M,
-                              HCLGE_INT_TYPE_S,
+               u16 type_and_id = 0;
+
+               hnae_set_field(type_and_id, HCLGE_INT_TYPE_M, HCLGE_INT_TYPE_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M,
-                              HCLGE_TQP_ID_S,  node->tqp_index);
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M,
+               hnae_set_field(type_and_id, HCLGE_TQP_ID_M, HCLGE_TQP_ID_S,
+                              node->tqp_index);
+               hnae_set_field(type_and_id, HCLGE_INT_GL_IDX_M,
                               HCLGE_INT_GL_IDX_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
-               req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]);
+               req->tqp_type_and_id[i] = cpu_to_le16(type_and_id);
                req->vfid = vport->vport_id;
 
                if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) {
@@ -2807,9 +2981,9 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id,
        return 0;
 }
 
-int hclge_map_handle_ring_to_vector(struct hnae3_handle *handle,
-                                   int vector,
-                                   struct hnae3_ring_chain_node *ring_chain)
+static int hclge_map_handle_ring_to_vector(
+               struct hnae3_handle *handle, int vector,
+               struct hnae3_ring_chain_node *ring_chain)
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
        struct hclge_dev *hdev = vport->back;
@@ -2831,7 +3005,7 @@ static int hclge_unmap_ring_from_vector(
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
        struct hclge_dev *hdev = vport->back;
-       struct hclge_ctrl_vector_chain *req;
+       struct hclge_ctrl_vector_chain_cmd *req;
        struct hnae3_ring_chain_node *node;
        struct hclge_desc desc;
        int i, vector_id;
@@ -2846,21 +3020,22 @@ static int hclge_unmap_ring_from_vector(
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_DEL_RING_TO_VECTOR, false);
 
-       req = (struct hclge_ctrl_vector_chain *)desc.data;
+       req = (struct hclge_ctrl_vector_chain_cmd *)desc.data;
        req->int_vector_id = vector_id;
 
        i = 0;
        for (node = ring_chain; node; node = node->next) {
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_TYPE_M,
-                              HCLGE_INT_TYPE_S,
+               u16 type_and_id = 0;
+
+               hnae_set_field(type_and_id, HCLGE_INT_TYPE_M, HCLGE_INT_TYPE_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M,
-                              HCLGE_TQP_ID_S,  node->tqp_index);
-               hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M,
+               hnae_set_field(type_and_id, HCLGE_TQP_ID_M, HCLGE_TQP_ID_S,
+                              node->tqp_index);
+               hnae_set_field(type_and_id, HCLGE_INT_GL_IDX_M,
                               HCLGE_INT_GL_IDX_S,
                               hnae_get_bit(node->flag, HNAE3_RING_TYPE_B));
 
-               req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]);
+               req->tqp_type_and_id[i] = cpu_to_le16(type_and_id);
                req->vfid = vport->vport_id;
 
                if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) {
@@ -2898,13 +3073,13 @@ static int hclge_unmap_ring_from_vector(
 int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev,
                               struct hclge_promisc_param *param)
 {
-       struct hclge_promisc_cfg *req;
+       struct hclge_promisc_cfg_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PROMISC_MODE, false);
 
-       req = (struct hclge_promisc_cfg *)desc.data;
+       req = (struct hclge_promisc_cfg_cmd *)desc.data;
        req->vf_id = param->vf_id;
        req->flag = (param->enable << HCLGE_PROMISC_EN_B);
 
@@ -2946,29 +3121,27 @@ static void hclge_set_promisc_mode(struct hnae3_handle *handle, u32 en)
 static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable)
 {
        struct hclge_desc desc;
-       struct hclge_config_mac_mode *req =
-               (struct hclge_config_mac_mode *)desc.data;
+       struct hclge_config_mac_mode_cmd *req =
+               (struct hclge_config_mac_mode_cmd *)desc.data;
+       u32 loop_en = 0;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAC_MODE, false);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_TX_EN_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_RX_EN_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_PAD_TX_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_PAD_RX_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_1588_TX_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_1588_RX_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_APP_LP_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_LINE_LP_B, 0);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_FCS_TX_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_RX_FCS_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_RX_FCS_STRIP_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, enable);
-       hnae_set_bit(req->txrx_pad_fcs_loop_en,
-                    HCLGE_MAC_TX_UNDER_MIN_ERR_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_TX_EN_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_EN_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_PAD_TX_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_PAD_RX_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_1588_TX_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_1588_RX_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_APP_LP_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_LINE_LP_B, 0);
+       hnae_set_bit(loop_en, HCLGE_MAC_FCS_TX_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_FCS_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_FCS_STRIP_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, enable);
+       hnae_set_bit(loop_en, HCLGE_MAC_TX_UNDER_MIN_ERR_B, enable);
+       req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret)
@@ -2980,8 +3153,8 @@ static int hclge_tqp_enable(struct hclge_dev *hdev, int tqp_id,
                            int stream_id, bool enable)
 {
        struct hclge_desc desc;
-       struct hclge_cfg_com_tqp_queue *req =
-               (struct hclge_cfg_com_tqp_queue *)desc.data;
+       struct hclge_cfg_com_tqp_queue_cmd *req =
+               (struct hclge_cfg_com_tqp_queue_cmd *)desc.data;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_COM_TQP_QUEUE, false);
@@ -3145,16 +3318,16 @@ static int hclge_update_desc_vfid(struct hclge_desc *desc, int vfid, bool clr)
                word_num = vfid / 32;
                bit_num  = vfid % 32;
                if (clr)
-                       desc[1].data[word_num] &= ~(1 << bit_num);
+                       desc[1].data[word_num] &= cpu_to_le32(~(1 << bit_num));
                else
-                       desc[1].data[word_num] |= (1 << bit_num);
+                       desc[1].data[word_num] |= cpu_to_le32(1 << bit_num);
        } else {
                word_num = (vfid - 192) / 32;
                bit_num  = vfid % 32;
                if (clr)
-                       desc[2].data[word_num] &= ~(1 << bit_num);
+                       desc[2].data[word_num] &= cpu_to_le32(~(1 << bit_num));
                else
-                       desc[2].data[word_num] |= (1 << bit_num);
+                       desc[2].data[word_num] |= cpu_to_le32(1 << bit_num);
        }
 
        return 0;
@@ -3174,7 +3347,7 @@ static bool hclge_is_all_function_id_zero(struct hclge_desc *desc)
        return true;
 }
 
-static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry *new_req,
+static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry_cmd *new_req,
                                   const u8 *addr)
 {
        const unsigned char *mac_addr = addr;
@@ -3186,8 +3359,8 @@ static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry *new_req,
        new_req->mac_addr_lo16 = cpu_to_le16(low_val & 0xffff);
 }
 
-u16 hclge_get_mac_addr_to_mta_index(struct hclge_vport *vport,
-                                   const u8 *addr)
+static u16 hclge_get_mac_addr_to_mta_index(struct hclge_vport *vport,
+                                          const u8 *addr)
 {
        u16 high_val = addr[1] | (addr[0] << 8);
        struct hclge_dev *hdev = vport->back;
@@ -3201,11 +3374,11 @@ static int hclge_set_mta_filter_mode(struct hclge_dev *hdev,
                                     enum hclge_mta_dmac_sel_type mta_mac_sel,
                                     bool enable)
 {
-       struct hclge_mta_filter_mode *req;
+       struct hclge_mta_filter_mode_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_mta_filter_mode *)desc.data;
+       req = (struct hclge_mta_filter_mode_cmd *)desc.data;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_MAC_MODE_CFG, false);
 
        hnae_set_bit(req->dmac_sel_en, HCLGE_CFG_MTA_MAC_EN_B,
@@ -3228,11 +3401,11 @@ int hclge_cfg_func_mta_filter(struct hclge_dev *hdev,
                              u8 func_id,
                              bool enable)
 {
-       struct hclge_cfg_func_mta_filter *req;
+       struct hclge_cfg_func_mta_filter_cmd *req;
        struct hclge_desc desc;
        int ret;
 
-       req = (struct hclge_cfg_func_mta_filter *)desc.data;
+       req = (struct hclge_cfg_func_mta_filter_cmd *)desc.data;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_MAC_FUNC_CFG, false);
 
        hnae_set_bit(req->accept, HCLGE_CFG_FUNC_MTA_ACCEPT_B,
@@ -3255,17 +3428,18 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport,
                                    bool enable)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_cfg_func_mta_item *req;
+       struct hclge_cfg_func_mta_item_cmd *req;
        struct hclge_desc desc;
+       u16 item_idx = 0;
        int ret;
 
-       req = (struct hclge_cfg_func_mta_item *)desc.data;
+       req = (struct hclge_cfg_func_mta_item_cmd *)desc.data;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_TBL_ITEM_CFG, false);
        hnae_set_bit(req->accept, HCLGE_CFG_MTA_ITEM_ACCEPT_B, enable);
 
-       hnae_set_field(req->item_idx, HCLGE_CFG_MTA_ITEM_IDX_M,
+       hnae_set_field(item_idx, HCLGE_CFG_MTA_ITEM_IDX_M,
                       HCLGE_CFG_MTA_ITEM_IDX_S, idx);
-       req->item_idx = cpu_to_le16(req->item_idx);
+       req->item_idx = cpu_to_le16(item_idx);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret) {
@@ -3279,16 +3453,17 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport,
 }
 
 static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport,
-                                    struct hclge_mac_vlan_tbl_entry *req)
+                                    struct hclge_mac_vlan_tbl_entry_cmd *req)
 {
        struct hclge_dev *hdev = vport->back;
        struct hclge_desc desc;
        u8 resp_code;
+       u16 retval;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MAC_VLAN_REMOVE, false);
 
-       memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry));
+       memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
        if (ret) {
@@ -3297,19 +3472,21 @@ static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport,
                        ret);
                return ret;
        }
-       resp_code = (desc.data[0] >> 8) & 0xff;
+       resp_code = (le32_to_cpu(desc.data[0]) >> 8) & 0xff;
+       retval = le16_to_cpu(desc.retval);
 
-       return hclge_get_mac_vlan_cmd_status(vport, desc.retval, resp_code,
+       return hclge_get_mac_vlan_cmd_status(vport, retval, resp_code,
                                             HCLGE_MAC_VLAN_REMOVE);
 }
 
 static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
-                                    struct hclge_mac_vlan_tbl_entry *req,
+                                    struct hclge_mac_vlan_tbl_entry_cmd *req,
                                     struct hclge_desc *desc,
                                     bool is_mc)
 {
        struct hclge_dev *hdev = vport->back;
        u8 resp_code;
+       u16 retval;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_MAC_VLAN_ADD, true);
@@ -3317,7 +3494,7 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
                desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
                memcpy(desc[0].data,
                       req,
-                      sizeof(struct hclge_mac_vlan_tbl_entry));
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                hclge_cmd_setup_basic_desc(&desc[1],
                                           HCLGE_OPC_MAC_VLAN_ADD,
                                           true);
@@ -3329,7 +3506,7 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
        } else {
                memcpy(desc[0].data,
                       req,
-                      sizeof(struct hclge_mac_vlan_tbl_entry));
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                ret = hclge_cmd_send(&hdev->hw, desc, 1);
        }
        if (ret) {
@@ -3338,19 +3515,21 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport,
                        ret);
                return ret;
        }
-       resp_code = (desc[0].data[0] >> 8) & 0xff;
+       resp_code = (le32_to_cpu(desc[0].data[0]) >> 8) & 0xff;
+       retval = le16_to_cpu(desc[0].retval);
 
-       return hclge_get_mac_vlan_cmd_status(vport, desc[0].retval, resp_code,
+       return hclge_get_mac_vlan_cmd_status(vport, retval, resp_code,
                                             HCLGE_MAC_VLAN_LKUP);
 }
 
 static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport,
-                                 struct hclge_mac_vlan_tbl_entry *req,
+                                 struct hclge_mac_vlan_tbl_entry_cmd *req,
                                  struct hclge_desc *mc_desc)
 {
        struct hclge_dev *hdev = vport->back;
        int cfg_status;
        u8 resp_code;
+       u16 retval;
        int ret;
 
        if (!mc_desc) {
@@ -3359,10 +3538,13 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport,
                hclge_cmd_setup_basic_desc(&desc,
                                           HCLGE_OPC_MAC_VLAN_ADD,
                                           false);
-               memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry));
+               memcpy(desc.data, req,
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                ret = hclge_cmd_send(&hdev->hw, &desc, 1);
-               resp_code = (desc.data[0] >> 8) & 0xff;
-               cfg_status = hclge_get_mac_vlan_cmd_status(vport, desc.retval,
+               resp_code = (le32_to_cpu(desc.data[0]) >> 8) & 0xff;
+               retval = le16_to_cpu(desc.retval);
+
+               cfg_status = hclge_get_mac_vlan_cmd_status(vport, retval,
                                                           resp_code,
                                                           HCLGE_MAC_VLAN_ADD);
        } else {
@@ -3373,11 +3555,12 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport,
                mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_WR);
                mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_NEXT);
                memcpy(mc_desc[0].data, req,
-                      sizeof(struct hclge_mac_vlan_tbl_entry));
+                      sizeof(struct hclge_mac_vlan_tbl_entry_cmd));
                ret = hclge_cmd_send(&hdev->hw, mc_desc, 3);
-               resp_code = (mc_desc[0].data[0] >> 8) & 0xff;
-               cfg_status = hclge_get_mac_vlan_cmd_status(vport,
-                                                          mc_desc[0].retval,
+               resp_code = (le32_to_cpu(mc_desc[0].data[0]) >> 8) & 0xff;
+               retval = le16_to_cpu(mc_desc[0].retval);
+
+               cfg_status = hclge_get_mac_vlan_cmd_status(vport, retval,
                                                           resp_code,
                                                           HCLGE_MAC_VLAN_ADD);
        }
@@ -3404,8 +3587,9 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport,
                             const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        enum hclge_cmd_status status;
+       u16 egress_port = 0;
 
        /* mac addr check */
        if (is_zero_ether_addr(addr) ||
@@ -3425,15 +3609,15 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport,
        hnae_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT0_EN_B, 0);
        hnae_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT1_EN_B, 0);
        hnae_set_bit(req.mc_mac_en, HCLGE_MAC_VLAN_BIT0_EN_B, 0);
-       hnae_set_bit(req.egress_port,
-                    HCLGE_MAC_EPORT_SW_EN_B, 0);
-       hnae_set_bit(req.egress_port,
-                    HCLGE_MAC_EPORT_TYPE_B, 0);
-       hnae_set_field(req.egress_port, HCLGE_MAC_EPORT_VFID_M,
+
+       hnae_set_bit(egress_port, HCLGE_MAC_EPORT_SW_EN_B, 0);
+       hnae_set_bit(egress_port, HCLGE_MAC_EPORT_TYPE_B, 0);
+       hnae_set_field(egress_port, HCLGE_MAC_EPORT_VFID_M,
                       HCLGE_MAC_EPORT_VFID_S, vport->vport_id);
-       hnae_set_field(req.egress_port, HCLGE_MAC_EPORT_PFID_M,
+       hnae_set_field(egress_port, HCLGE_MAC_EPORT_PFID_M,
                       HCLGE_MAC_EPORT_PFID_S, 0);
-       req.egress_port = cpu_to_le16(req.egress_port);
+
+       req.egress_port = cpu_to_le16(egress_port);
 
        hclge_prepare_mac_addr(&req, addr);
 
@@ -3454,7 +3638,7 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport,
                            const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        enum hclge_cmd_status status;
 
        /* mac addr check */
@@ -3488,7 +3672,7 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport,
                             const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        struct hclge_desc desc[3];
        u16 tbl_idx;
        int status;
@@ -3539,7 +3723,7 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport,
                            const unsigned char *addr)
 {
        struct hclge_dev *hdev = vport->back;
-       struct hclge_mac_vlan_tbl_entry req;
+       struct hclge_mac_vlan_tbl_entry_cmd req;
        enum hclge_cmd_status status;
        struct hclge_desc desc[3];
        u16 tbl_idx;
@@ -3622,13 +3806,13 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p)
 static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type,
                                      bool filter_en)
 {
-       struct hclge_vlan_filter_ctrl *req;
+       struct hclge_vlan_filter_ctrl_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, false);
 
-       req = (struct hclge_vlan_filter_ctrl *)desc.data;
+       req = (struct hclge_vlan_filter_ctrl_cmd *)desc.data;
        req->vlan_type = vlan_type;
        req->vlan_fe = filter_en;
 
@@ -3646,8 +3830,8 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
                             bool is_kill, u16 vlan, u8 qos, __be16 proto)
 {
 #define HCLGE_MAX_VF_BYTES  16
-       struct hclge_vlan_filter_vf_cfg *req0;
-       struct hclge_vlan_filter_vf_cfg *req1;
+       struct hclge_vlan_filter_vf_cfg_cmd *req0;
+       struct hclge_vlan_filter_vf_cfg_cmd *req1;
        struct hclge_desc desc[2];
        u8 vf_byte_val;
        u8 vf_byte_off;
@@ -3663,10 +3847,10 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid,
        vf_byte_off = vfid / 8;
        vf_byte_val = 1 << (vfid % 8);
 
-       req0 = (struct hclge_vlan_filter_vf_cfg *)desc[0].data;
-       req1 = (struct hclge_vlan_filter_vf_cfg *)desc[1].data;
+       req0 = (struct hclge_vlan_filter_vf_cfg_cmd *)desc[0].data;
+       req1 = (struct hclge_vlan_filter_vf_cfg_cmd *)desc[1].data;
 
-       req0->vlan_id  = vlan;
+       req0->vlan_id  = cpu_to_le16(vlan);
        req0->vlan_cfg = is_kill;
 
        if (vf_byte_off < HCLGE_MAX_VF_BYTES)
@@ -3707,7 +3891,7 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle,
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
        struct hclge_dev *hdev = vport->back;
-       struct hclge_vlan_filter_pf_cfg *req;
+       struct hclge_vlan_filter_pf_cfg_cmd *req;
        struct hclge_desc desc;
        u8 vlan_offset_byte_val;
        u8 vlan_offset_byte;
@@ -3720,7 +3904,7 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle,
        vlan_offset_byte = (vlan_id % 160) / 8;
        vlan_offset_byte_val = 1 << (vlan_id % 8);
 
-       req = (struct hclge_vlan_filter_pf_cfg *)desc.data;
+       req = (struct hclge_vlan_filter_pf_cfg_cmd *)desc.data;
        req->vlan_offset = vlan_offset_160;
        req->vlan_cfg = is_kill;
        req->vlan_offset_bitmap[vlan_offset_byte] = vlan_offset_byte_val;
@@ -3782,7 +3966,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev)
 static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
 {
        struct hclge_vport *vport = hclge_get_vport(handle);
-       struct hclge_config_max_frm_size *req;
+       struct hclge_config_max_frm_size_cmd *req;
        struct hclge_dev *hdev = vport->back;
        struct hclge_desc desc;
        int ret;
@@ -3793,7 +3977,7 @@ static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
        hdev->mps = new_mtu;
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAX_FRM_SIZE, false);
 
-       req = (struct hclge_config_max_frm_size *)desc.data;
+       req = (struct hclge_config_max_frm_size_cmd *)desc.data;
        req->max_frm_size = cpu_to_le16(new_mtu);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -3808,13 +3992,13 @@ static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu)
 static int hclge_send_reset_tqp_cmd(struct hclge_dev *hdev, u16 queue_id,
                                    bool enable)
 {
-       struct hclge_reset_tqp_queue *req;
+       struct hclge_reset_tqp_queue_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RESET_TQP_QUEUE, false);
 
-       req = (struct hclge_reset_tqp_queue *)desc.data;
+       req = (struct hclge_reset_tqp_queue_cmd *)desc.data;
        req->tqp_id = cpu_to_le16(queue_id & HCLGE_RING_ID_MASK);
        hnae_set_bit(req->reset_req, HCLGE_TQP_RESET_B, enable);
 
@@ -3830,13 +4014,13 @@ static int hclge_send_reset_tqp_cmd(struct hclge_dev *hdev, u16 queue_id,
 
 static int hclge_get_reset_status(struct hclge_dev *hdev, u16 queue_id)
 {
-       struct hclge_reset_tqp_queue *req;
+       struct hclge_reset_tqp_queue_cmd *req;
        struct hclge_desc desc;
        int ret;
 
        hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RESET_TQP_QUEUE, true);
 
-       req = (struct hclge_reset_tqp_queue *)desc.data;
+       req = (struct hclge_reset_tqp_queue_cmd *)desc.data;
        req->tqp_id = cpu_to_le16(queue_id & HCLGE_RING_ID_MASK);
 
        ret = hclge_cmd_send(&hdev->hw, &desc, 1);
@@ -4313,6 +4497,8 @@ static const struct hnae3_ae_ops hclge_ops = {
        .get_rss_indir_size = hclge_get_rss_indir_size,
        .get_rss = hclge_get_rss,
        .set_rss = hclge_set_rss,
+       .set_rss_tuple = hclge_set_rss_tuple,
+       .get_rss_tuple = hclge_get_rss_tuple,
        .get_tc_size = hclge_get_tc_size,
        .get_mac_addr = hclge_get_mac_addr,
        .set_mac_addr = hclge_set_mac_addr,
index 7c66c00..a7c018c 100644 (file)
@@ -32,7 +32,7 @@
 #define HCLGE_VECTOR_VF_OFFSET         0x100000
 
 #define HCLGE_RSS_IND_TBL_SIZE         512
-#define HCLGE_RSS_SET_BITMAP_MSK       0xffff
+#define HCLGE_RSS_SET_BITMAP_MSK       GENMASK(15, 0)
 #define HCLGE_RSS_KEY_SIZE             40
 #define HCLGE_RSS_HASH_ALGO_TOEPLITZ   0
 #define HCLGE_RSS_HASH_ALGO_SIMPLE     1
 #define HCLGE_RSS_CFG_TBL_NUM \
        (HCLGE_RSS_IND_TBL_SIZE / HCLGE_RSS_CFG_TBL_SIZE)
 
+#define HCLGE_RSS_INPUT_TUPLE_OTHER    GENMASK(3, 0)
+#define HCLGE_RSS_INPUT_TUPLE_SCTP     GENMASK(4, 0)
+#define HCLGE_D_PORT_BIT               BIT(0)
+#define HCLGE_S_PORT_BIT               BIT(1)
+#define HCLGE_D_IP_BIT                 BIT(2)
+#define HCLGE_S_IP_BIT                 BIT(3)
+#define HCLGE_V_TAG_BIT                        BIT(4)
+
 #define HCLGE_RSS_TC_SIZE_0            1
 #define HCLGE_RSS_TC_SIZE_1            2
 #define HCLGE_RSS_TC_SIZE_2            4
@@ -65,7 +73,7 @@
 #define HCLGE_PHY_CSS_REG              17
 
 #define HCLGE_PHY_MDIX_CTRL_S          (5)
-#define HCLGE_PHY_MDIX_CTRL_M          (3 << HCLGE_PHY_MDIX_CTRL_S)
+#define HCLGE_PHY_MDIX_CTRL_M          GENMASK(6, 5)
 
 #define HCLGE_PHY_MDIX_STATUS_B        (6)
 #define HCLGE_PHY_SPEED_DUP_RESOLVE_B  (11)
index 359ee67..1ae6eae 100644 (file)
@@ -283,6 +283,7 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev,
        struct hclge_pg_shapping_cmd *shap_cfg_cmd;
        enum hclge_opcode_type opcode;
        struct hclge_desc desc;
+       u32 shapping_para = 0;
 
        opcode = bucket ? HCLGE_OPC_TM_PG_P_SHAPPING :
                HCLGE_OPC_TM_PG_C_SHAPPING;
@@ -292,11 +293,13 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev,
 
        shap_cfg_cmd->pg_id = pg_id;
 
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b);
-       hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s);
+       hclge_tm_set_field(shapping_para, IR_B, ir_b);
+       hclge_tm_set_field(shapping_para, IR_U, ir_u);
+       hclge_tm_set_field(shapping_para, IR_S, ir_s);
+       hclge_tm_set_field(shapping_para, BS_B, bs_b);
+       hclge_tm_set_field(shapping_para, BS_S, bs_s);
+
+       shap_cfg_cmd->pg_shapping_para = cpu_to_le32(shapping_para);
 
        return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
@@ -337,6 +340,7 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev,
        struct hclge_pri_shapping_cmd *shap_cfg_cmd;
        enum hclge_opcode_type opcode;
        struct hclge_desc desc;
+       u32 shapping_para = 0;
 
        opcode = bucket ? HCLGE_OPC_TM_PRI_P_SHAPPING :
                HCLGE_OPC_TM_PRI_C_SHAPPING;
@@ -347,11 +351,13 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev,
 
        shap_cfg_cmd->pri_id = pri_id;
 
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b);
-       hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s);
+       hclge_tm_set_field(shapping_para, IR_B, ir_b);
+       hclge_tm_set_field(shapping_para, IR_U, ir_u);
+       hclge_tm_set_field(shapping_para, IR_S, ir_s);
+       hclge_tm_set_field(shapping_para, BS_B, bs_b);
+       hclge_tm_set_field(shapping_para, BS_S, bs_s);
+
+       shap_cfg_cmd->pri_shapping_para = cpu_to_le32(shapping_para);
 
        return hclge_cmd_send(&hdev->hw, &desc, 1);
 }
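
These two shaper hunks apply the same pattern as the hclge_main.c changes above: the bit-fields are assembled in a host-order local (shapping_para) and the __le32 descriptor field is written once via cpu_to_le32(), instead of read-modify-writing the little-endian field directly. A standalone sketch of that build-locally, convert-on-store pattern (set_field() and the shift/width values are hypothetical stand-ins for hclge_tm_set_field() and its field definitions):

#include <stdint.h>
#include <stdio.h>

/* Place 'val' into bits [shift, shift + width) of *para, host byte order. */
static void set_field(uint32_t *para, unsigned int shift, unsigned int width,
                      uint32_t val)
{
        uint32_t mask = ((1u << width) - 1) << shift;

        *para = (*para & ~mask) | ((val << shift) & mask);
}

int main(void)
{
        uint32_t shapping_para = 0;     /* host-order scratch value */

        set_field(&shapping_para, 0, 8, 0x12);  /* e.g. an IR_B-like field */
        set_field(&shapping_para, 8, 4, 0x3);   /* e.g. an IR_U-like field */

        /* In the driver the single conversion happens on store:
         * shap_cfg_cmd->pg_shapping_para = cpu_to_le32(shapping_para);
         */
        printf("host-order value: 0x%08x\n", (unsigned int)shapping_para);
        return 0;
}
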
index 9832172..925619a 100644 (file)
@@ -13,8 +13,7 @@
 static
 int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_getets)
                return h->kinfo.dcb_ops->ieee_getets(h, ets);
@@ -25,8 +24,7 @@ int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets)
 static
 int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_setets)
                return h->kinfo.dcb_ops->ieee_setets(h, ets);
@@ -37,8 +35,7 @@ int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets)
 static
 int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_getpfc)
                return h->kinfo.dcb_ops->ieee_getpfc(h, pfc);
@@ -49,8 +46,7 @@ int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 static
 int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->ieee_setpfc)
                return h->kinfo.dcb_ops->ieee_setpfc(h, pfc);
@@ -61,8 +57,7 @@ int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc)
 /* DCBX configuration */
 static u8 hns3_dcbnl_getdcbx(struct net_device *ndev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->getdcbx)
                return h->kinfo.dcb_ops->getdcbx(h);
@@ -73,8 +68,7 @@ static u8 hns3_dcbnl_getdcbx(struct net_device *ndev)
 /* return 0 if successful, otherwise fail */
 static u8 hns3_dcbnl_setdcbx(struct net_device *ndev, u8 mode)
 {
-       struct hns3_nic_priv *priv = netdev_priv(ndev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(ndev);
 
        if (h->kinfo.dcb_ops->setdcbx)
                return h->kinfo.dcb_ops->setdcbx(h, mode);
index c315065..ba550c1 100644 (file)
@@ -24,7 +24,7 @@
 #include "hnae3.h"
 #include "hns3_enet.h"
 
-const char hns3_driver_name[] = "hns3";
+static const char hns3_driver_name[] = "hns3";
 const char hns3_driver_version[] = VERMAGIC_STRING;
 static const char hns3_driver_string[] =
                        "Hisilicon Ethernet Network Driver for Hip08 Family";
@@ -198,8 +198,7 @@ static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector)
 
 static int hns3_nic_set_real_num_queue(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        struct hnae3_knic_private_info *kinfo = &h->kinfo;
        unsigned int queue_size = kinfo->rss_size * kinfo->num_tc;
        int ret;
@@ -305,24 +304,10 @@ static int hns3_nic_net_stop(struct net_device *netdev)
        return 0;
 }
 
-void hns3_set_multicast_list(struct net_device *netdev)
-{
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
-       struct netdev_hw_addr *ha = NULL;
-
-       if (h->ae_algo->ops->set_mc_addr) {
-               netdev_for_each_mc_addr(ha, netdev)
-                       if (h->ae_algo->ops->set_mc_addr(h, ha->addr))
-                               netdev_err(netdev, "set multicast fail\n");
-       }
-}
-
 static int hns3_nic_uc_sync(struct net_device *netdev,
                            const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->add_uc_addr)
                return h->ae_algo->ops->add_uc_addr(h, addr);
@@ -333,8 +318,7 @@ static int hns3_nic_uc_sync(struct net_device *netdev,
 static int hns3_nic_uc_unsync(struct net_device *netdev,
                              const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->rm_uc_addr)
                return h->ae_algo->ops->rm_uc_addr(h, addr);
@@ -345,8 +329,7 @@ static int hns3_nic_uc_unsync(struct net_device *netdev,
 static int hns3_nic_mc_sync(struct net_device *netdev,
                            const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->add_mc_addr)
                return h->ae_algo->ops->add_mc_addr(h, addr);
@@ -357,8 +340,7 @@ static int hns3_nic_mc_sync(struct net_device *netdev,
 static int hns3_nic_mc_unsync(struct net_device *netdev,
                              const unsigned char *addr)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->rm_mc_addr)
                return h->ae_algo->ops->rm_mc_addr(h, addr);
@@ -366,10 +348,9 @@ static int hns3_nic_mc_unsync(struct net_device *netdev,
        return 0;
 }
 
-void hns3_nic_set_rx_mode(struct net_device *netdev)
+static void hns3_nic_set_rx_mode(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo->ops->set_promisc_mode) {
                if (netdev->flags & IFF_PROMISC)
@@ -768,7 +749,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv,
 
        if (type == DESC_TYPE_SKB) {
                skb = (struct sk_buff *)priv;
-               paylen = cpu_to_le16(skb->len);
+               paylen = skb->len;
 
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        skb_reset_mac_len(skb);
@@ -802,7 +783,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv,
                        cpu_to_le32(ol_type_vlan_len_msec);
                desc->tx.type_cs_vlan_tso_len =
                        cpu_to_le32(type_cs_vlan_tso);
-               desc->tx.paylen = cpu_to_le16(paylen);
+               desc->tx.paylen = cpu_to_le32(paylen);
                desc->tx.mss = cpu_to_le16(mss);
        }
 
@@ -1025,8 +1006,7 @@ out_net_tx_busy:
 
 static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        struct sockaddr *mac_addr = p;
        int ret;
 
@@ -1208,8 +1188,7 @@ static void hns3_nic_udp_tunnel_del(struct net_device *netdev,
 
 static int hns3_setup_tc(struct net_device *netdev, u8 tc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        struct hnae3_knic_private_info *kinfo = &h->kinfo;
        unsigned int i;
        int ret;
@@ -1259,8 +1238,7 @@ static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type,
 static int hns3_vlan_rx_add_vid(struct net_device *netdev,
                                __be16 proto, u16 vid)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        int ret = -EIO;
 
        if (h->ae_algo->ops->set_vlan_filter)
@@ -1272,8 +1250,7 @@ static int hns3_vlan_rx_add_vid(struct net_device *netdev,
 static int hns3_vlan_rx_kill_vid(struct net_device *netdev,
                                 __be16 proto, u16 vid)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        int ret = -EIO;
 
        if (h->ae_algo->ops->set_vlan_filter)
@@ -1285,8 +1262,7 @@ static int hns3_vlan_rx_kill_vid(struct net_device *netdev,
 static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
                                u8 qos, __be16 vlan_proto)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        int ret = -EIO;
 
        if (h->ae_algo->ops->set_vf_vlan_filter)
@@ -1298,8 +1274,7 @@ static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
 
 static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        bool if_running = netif_running(netdev);
        int ret;
 
@@ -2609,7 +2584,7 @@ static void hns3_fini_ring(struct hns3_enet_ring *ring)
        ring->next_to_use = 0;
 }
 
-int hns3_buf_size2type(u32 buf_size)
+static int hns3_buf_size2type(u32 buf_size)
 {
        int bd_size_type;
 
@@ -2662,7 +2637,7 @@ static void hns3_init_ring_hw(struct hns3_enet_ring *ring)
        }
 }
 
-static int hns3_init_all_ring(struct hns3_nic_priv *priv)
+int hns3_init_all_ring(struct hns3_nic_priv *priv)
 {
        struct hnae3_handle *h = priv->ae_handle;
        int ring_num = h->kinfo.num_tqps * 2;
@@ -2686,12 +2661,12 @@ static int hns3_init_all_ring(struct hns3_nic_priv *priv)
 
 out_when_alloc_ring_memory:
        for (j = i - 1; j >= 0; j--)
-               hns3_fini_ring(priv->ring_data[i].ring);
+               hns3_fini_ring(priv->ring_data[j].ring);
 
        return -ENOMEM;
 }
 
-static int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
+int hns3_uninit_all_ring(struct hns3_nic_priv *priv)
 {
        struct hnae3_handle *h = priv->ae_handle;
        int i;
@@ -2921,7 +2896,7 @@ err_out:
        return ret;
 }
 
-const struct hnae3_client_ops client_ops = {
+static const struct hnae3_client_ops client_ops = {
        .init_instance = hns3_client_init,
        .uninit_instance = hns3_client_uninit,
        .link_status_change = hns3_link_status_change,
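
Besides the hns3_get_handle() conversions, one hunk above fixes the error unwind in hns3_init_all_ring(): when ring i fails to set up, the cleanup loop now walks back over j = i - 1 .. 0, where the old code freed ring i repeatedly. A standalone sketch of that unwind pattern (plain malloc/free stand-ins, not the ring code):

#include <stdio.h>
#include <stdlib.h>

/* Initialise n objects; on failure, release only the ones already set up. */
static int init_all(void **objs, int n)
{
        int i, j;

        for (i = 0; i < n; i++) {
                objs[i] = malloc(16);
                if (!objs[i])
                        goto unwind;
        }
        return 0;

unwind:
        for (j = i - 1; j >= 0; j--) {  /* walk back over what succeeded */
                free(objs[j]);
                objs[j] = NULL;
        }
        return -1;
}

int main(void)
{
        void *objs[4] = { NULL };
        int k;

        if (!init_all(objs, 4))
                for (k = 0; k < 4; k++)
                        free(objs[k]);
        return 0;
}
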
index 481eada..6659989 100644 (file)
@@ -76,6 +76,8 @@ enum hns3_nic_state {
 #define HNS3_RING_NAME_LEN                     16
 #define HNS3_BUFFER_SIZE_2048                  2048
 #define HNS3_RING_MAX_PENDING                  32768
+#define HNS3_RING_MIN_PENDING                  8
+#define HNS3_RING_BD_MULTIPLE                  8
 #define HNS3_MAX_MTU                           9728
 
 #define HNS3_BD_SIZE_512_TYPE                  0
@@ -587,9 +589,14 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value)
 #define hns3_for_each_ring(pos, head) \
        for (pos = (head).ring; pos; pos = pos->next)
 
+#define hns3_get_handle(ndev) \
+       (((struct hns3_nic_priv *)netdev_priv(ndev))->ae_handle)
+
 void hns3_ethtool_set_ops(struct net_device *netdev);
 
 int hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget);
+int hns3_init_all_ring(struct hns3_nic_priv *priv);
+int hns3_uninit_all_ring(struct hns3_nic_priv *priv);
 
 #ifdef CONFIG_HNS3_DCB
 void hns3_dcbnl_setup(struct hnae3_handle *handle);
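
Most of the churn in hns3_enet.c, hns3_dcbnl.c and hns3_ethtool.c is this macro replacing the repeated two-step netdev_priv()-then-ae_handle dereference. A standalone sketch of the accessor-macro pattern, with hypothetical stand-in types rather than the driver's structures:

#include <stdio.h>

struct handle { int id; };
struct priv   { struct handle *ae_handle; };

/* One macro hides the cast plus member access, like hns3_get_handle(). */
#define get_handle(p) (((struct priv *)(p))->ae_handle)

int main(void)
{
        struct handle h = { .id = 42 };
        struct priv pr = { .ae_handle = &h };
        void *opaque = &pr;             /* plays the role of netdev_priv() */

        printf("handle id = %d\n", get_handle(opaque)->id);
        return 0;
}
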
index d636399..9b36ce0 100644 (file)
@@ -102,8 +102,7 @@ static void hns3_driv_to_eth_caps(u32 caps, struct ethtool_link_ksettings *cmd,
 
 static int hns3_get_sset_count(struct net_device *netdev, int stringset)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        const struct hnae3_ae_ops *ops = h->ae_algo->ops;
 
        if (!ops->get_sset_count)
@@ -164,8 +163,7 @@ static u8 *hns3_get_strings_tqps(struct hnae3_handle *handle, u8 *data)
 
 static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        const struct hnae3_ae_ops *ops = h->ae_algo->ops;
        char *buff = (char *)data;
 
@@ -217,11 +215,10 @@ static u64 *hns3_get_stats_tqps(struct hnae3_handle *handle, u64 *data)
  * @stats: statistics info.
  * @data: statistics data.
  */
-void hns3_get_stats(struct net_device *netdev, struct ethtool_stats *stats,
-                   u64 *data)
+static void hns3_get_stats(struct net_device *netdev,
+                          struct ethtool_stats *stats, u64 *data)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        u64 *p = data;
 
        if (!h->ae_algo->ops->get_stats || !h->ae_algo->ops->update_stats) {
@@ -262,10 +259,7 @@ static void hns3_get_drvinfo(struct net_device *netdev,
 
 static u32 hns3_get_link(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h;
-
-       h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo && h->ae_algo->ops && h->ae_algo->ops->get_status)
                return h->ae_algo->ops->get_status(h);
@@ -277,7 +271,8 @@ static void hns3_get_ringparam(struct net_device *netdev,
                               struct ethtool_ringparam *param)
 {
        struct hns3_nic_priv *priv = netdev_priv(netdev);
-       int queue_num = priv->ae_handle->kinfo.num_tqps;
+       struct hnae3_handle *h = priv->ae_handle;
+       int queue_num = h->kinfo.num_tqps;
 
        param->tx_max_pending = HNS3_RING_MAX_PENDING;
        param->rx_max_pending = HNS3_RING_MAX_PENDING;
@@ -289,8 +284,7 @@ static void hns3_get_ringparam(struct net_device *netdev,
 static void hns3_get_pauseparam(struct net_device *netdev,
                                struct ethtool_pauseparam *param)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (h->ae_algo && h->ae_algo->ops && h->ae_algo->ops->get_pauseparam)
                h->ae_algo->ops->get_pauseparam(h, &param->autoneg,
@@ -300,8 +294,7 @@ static void hns3_get_pauseparam(struct net_device *netdev,
 static int hns3_get_link_ksettings(struct net_device *netdev,
                                   struct ethtool_link_ksettings *cmd)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
        u32 supported_caps;
        u32 advertised_caps;
        u8 media_type = HNAE3_MEDIA_TYPE_UNKNOWN;
@@ -392,8 +385,7 @@ static int hns3_get_link_ksettings(struct net_device *netdev,
 
 static u32 hns3_get_rss_key_size(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops ||
            !h->ae_algo->ops->get_rss_key_size)
@@ -404,8 +396,7 @@ static u32 hns3_get_rss_key_size(struct net_device *netdev)
 
 static u32 hns3_get_rss_indir_size(struct net_device *netdev)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops ||
            !h->ae_algo->ops->get_rss_indir_size)
@@ -417,8 +408,7 @@ static u32 hns3_get_rss_indir_size(struct net_device *netdev)
 static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key,
                        u8 *hfunc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_rss)
                return -EOPNOTSUPP;
@@ -429,8 +419,7 @@ static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key,
 static int hns3_set_rss(struct net_device *netdev, const u32 *indir,
                        const u8 *key, const u8 hfunc)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->set_rss)
                return -EOPNOTSUPP;
@@ -454,16 +443,17 @@ static int hns3_get_rxnfc(struct net_device *netdev,
                          struct ethtool_rxnfc *cmd,
                          u32 *rule_locs)
 {
-       struct hns3_nic_priv *priv = netdev_priv(netdev);
-       struct hnae3_handle *h = priv->ae_handle;
+       struct hnae3_handle *h = hns3_get_handle(netdev);
 
        if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_tc_size)
                return -EOPNOTSUPP;
 
        switch (cmd->cmd) {
        case ETHTOOL_GRXRINGS:
-               cmd->data = h->ae_algo->ops->get_tc_size(h);
+               cmd->data = h->kinfo.num_tc * h->kinfo.rss_size;
                break;
+       case ETHTOOL_GRXFH:
+               return h->ae_algo->ops->get_rss_tuple(h, cmd);
        default:
                return -EOPNOTSUPP;
        }
@@ -471,15 +461,106 @@ static int hns3_get_rxnfc(struct net_device *netdev,
        return 0;
 }
 
+int hns3_change_all_ring_bd_num(struct hns3_nic_priv *priv, u32 new_desc_num)
+{
+       struct hnae3_handle *h = priv->ae_handle;
+       int i;
+
+       h->kinfo.num_desc = new_desc_num;
+
+       for (i = 0; i < h->kinfo.num_tqps * 2; i++)
+               priv->ring_data[i].ring->desc_num = new_desc_num;
+
+       return hns3_init_all_ring(priv);
+}
+
+int hns3_set_ringparam(struct net_device *ndev, struct ethtool_ringparam *param)
+{
+       struct hns3_nic_priv *priv = netdev_priv(ndev);
+       struct hnae3_handle *h = priv->ae_handle;
+       bool if_running = netif_running(ndev);
+       u32 old_desc_num, new_desc_num;
+       int ret;
+
+       if (param->rx_mini_pending || param->rx_jumbo_pending)
+               return -EINVAL;
+
+       if (param->tx_pending != param->rx_pending) {
+               netdev_err(ndev,
+                          "Descriptors of tx and rx must be equal\n");
+               return -EINVAL;
+       }
+
+       if (param->tx_pending > HNS3_RING_MAX_PENDING ||
+           param->tx_pending < HNS3_RING_MIN_PENDING) {
+               netdev_err(ndev,
+                          "Descriptors requested (Tx/Rx: %d) out of range [%d-%d]\n",
+                          param->tx_pending, HNS3_RING_MIN_PENDING,
+                          HNS3_RING_MAX_PENDING);
+               return -EINVAL;
+       }
+
+       new_desc_num = param->tx_pending;
+
+       /* Hardware requires the descriptor count to be a multiple of eight */
+       new_desc_num = ALIGN(new_desc_num, HNS3_RING_BD_MULTIPLE);
+       old_desc_num = h->kinfo.num_desc;
+       if (old_desc_num == new_desc_num)
+               return 0;
+
+       netdev_info(ndev,
+                   "Changing descriptor count from %d to %d.\n",
+                   old_desc_num, new_desc_num);
+
+       if (if_running)
+               dev_close(ndev);
+
+       ret = hns3_uninit_all_ring(priv);
+       if (ret)
+               return ret;
+
+       ret = hns3_change_all_ring_bd_num(priv, new_desc_num);
+       if (ret) {
+               ret = hns3_change_all_ring_bd_num(priv, old_desc_num);
+               if (ret) {
+                       netdev_err(ndev,
+                                  "Reverting to old bd num failed, ret=%d.\n", ret);
+                       return ret;
+               }
+       }
+
+       if (if_running)
+               ret = dev_open(ndev);
+
+       return ret;
+}
+
+static int hns3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
+{
+       struct hnae3_handle *h = hns3_get_handle(netdev);
+
+       if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->set_rss_tuple)
+               return -EOPNOTSUPP;
+
+       switch (cmd->cmd) {
+       case ETHTOOL_SRXFH:
+               return h->ae_algo->ops->set_rss_tuple(h, cmd);
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
 static const struct ethtool_ops hns3_ethtool_ops = {
        .get_drvinfo = hns3_get_drvinfo,
        .get_link = hns3_get_link,
        .get_ringparam = hns3_get_ringparam,
+       .set_ringparam = hns3_set_ringparam,
        .get_pauseparam = hns3_get_pauseparam,
        .get_strings = hns3_get_strings,
        .get_ethtool_stats = hns3_get_stats,
        .get_sset_count = hns3_get_sset_count,
        .get_rxnfc = hns3_get_rxnfc,
+       .set_rxnfc = hns3_set_rxnfc,
        .get_rxfh_key_size = hns3_get_rss_key_size,
        .get_rxfh_indir_size = hns3_get_rss_indir_size,
        .get_rxfh = hns3_get_rss,
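
hns3_set_ringparam() above validates the requested descriptor count, rounds it up to a multiple of HNS3_RING_BD_MULTIPLE with ALIGN(), and only then tears down and rebuilds the rings (reverting to the old count if the rebuild fails). A standalone sketch of the rounding step (align_up() is a hypothetical helper that assumes a power-of-two multiple, like the kernel's ALIGN() macro):

#include <stdio.h>
#include <stdint.h>

/* Round v up to the next multiple of a (a must be a power of two). */
static uint32_t align_up(uint32_t v, uint32_t a)
{
        return (v + a - 1) & ~(a - 1);
}

int main(void)
{
        printf("%u -> %u\n", 1000u, (unsigned int)align_up(1000, 8)); /* 1000 */
        printf("%u -> %u\n", 1021u, (unsigned int)align_up(1021, 8)); /* 1024 */
        return 0;
}
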
index 0641c00..afb7ebe 100644 (file)
 #define E1000_ICR_LSC           0x00000004 /* Link Status Change */
 #define E1000_ICR_RXSEQ         0x00000008 /* Rx sequence error */
 #define E1000_ICR_RXDMT0        0x00000010 /* Rx desc min. threshold (0) */
+#define E1000_ICR_RXO           0x00000040 /* Receiver Overrun */
 #define E1000_ICR_RXT0          0x00000080 /* Rx timer intr (ring 0) */
 #define E1000_ICR_ECCER         0x00400000 /* Uncorrectable ECC Error */
 /* If this bit asserted, the driver should claim the interrupt */
index 98e6888..2311b31 100644 (file)
@@ -94,10 +94,6 @@ struct e1000_info;
  */
 #define E1000_CHECK_RESET_COUNT                25
 
-#define DEFAULT_RDTR                   0
-#define DEFAULT_RADV                   8
-#define BURST_RDTR                     0x20
-#define BURST_RADV                     0x20
 #define PCICFG_DESC_RING_STATUS                0xe4
 #define FLUSH_DESC_REQUIRED            0x100
 
index b322011..f457c57 100644 (file)
@@ -410,6 +410,9 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw)
 *  Checks to see if the link status of the hardware has changed.  If a
  *  change in link status has been detected, then we read the PHY registers
  *  to get the current speed/duplex if link exists.
+ *
+ *  Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link
+ *  up).
  **/
 s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
 {
@@ -423,7 +426,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
         * Change or Rx Sequence Error interrupt.
         */
        if (!mac->get_link_status)
-               return 0;
+               return 1;
 
        /* First we want to see if the MII Status Register reports
         * link.  If so, then we want to get the current speed/duplex
@@ -461,10 +464,12 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
         * different link partner.
         */
        ret_val = e1000e_config_fc_after_link_up(hw);
-       if (ret_val)
+       if (ret_val) {
                e_dbg("Error configuring flow control\n");
+               return ret_val;
+       }
 
-       return ret_val;
+       return 1;
 }
 
 /**
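
With this hunk e1000e_check_for_copper_link() gains a three-way contract: a negative -E1000_ERR_* code on failure, 0 when there is nothing to do or link is down, and 1 when link is up; e1000e_has_link() in netdev.c correspondingly switches to testing ret_val > 0. A standalone sketch of interpreting such a return value (the error constant and check_for_link() body are hypothetical stand-ins):

#include <stdio.h>
#include <stdbool.h>

#define DEMO_ERR_PHY 2                  /* stand-in for an E1000_ERR_* value */

/* Pretend link check: < 0 on error, 0 for link down, 1 for link up. */
static int check_for_link(bool phy_ok, bool link_up)
{
        if (!phy_ok)
                return -DEMO_ERR_PHY;
        return link_up ? 1 : 0;
}

int main(void)
{
        int ret = check_for_link(true, true);

        if (ret < 0)
                printf("link check failed: %d\n", ret);
        else
                printf("link_active = %d\n", ret > 0);  /* as e1000e_has_link() */
        return 0;
}
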
index 8436c5f..bf8f38f 100644 (file)
@@ -1071,7 +1071,8 @@ next_desc:
 }
 
 static void e1000_put_txbuf(struct e1000_ring *tx_ring,
-                           struct e1000_buffer *buffer_info)
+                           struct e1000_buffer *buffer_info,
+                           bool drop)
 {
        struct e1000_adapter *adapter = tx_ring->adapter;
 
@@ -1085,7 +1086,10 @@ static void e1000_put_txbuf(struct e1000_ring *tx_ring,
                buffer_info->dma = 0;
        }
        if (buffer_info->skb) {
-               dev_kfree_skb_any(buffer_info->skb);
+               if (drop)
+                       dev_kfree_skb_any(buffer_info->skb);
+               else
+                       dev_consume_skb_any(buffer_info->skb);
                buffer_info->skb = NULL;
        }
        buffer_info->time_stamp = 0;
@@ -1199,7 +1203,7 @@ static void e1000e_tx_hwtstamp_work(struct work_struct *work)
                wmb(); /* force write prior to skb_tstamp_tx */
 
                skb_tstamp_tx(skb, &shhwtstamps);
-               dev_kfree_skb_any(skb);
+               dev_consume_skb_any(skb);
        } else if (time_after(jiffies, adapter->tx_hwtstamp_start
                              + adapter->tx_timeout_factor * HZ)) {
                dev_kfree_skb_any(adapter->tx_hwtstamp_skb);
@@ -1254,7 +1258,7 @@ static bool e1000_clean_tx_irq(struct e1000_ring *tx_ring)
                                }
                        }
 
-                       e1000_put_txbuf(tx_ring, buffer_info);
+                       e1000_put_txbuf(tx_ring, buffer_info, false);
                        tx_desc->upper.data = 0;
 
                        i++;
@@ -1910,14 +1914,30 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data)
        struct net_device *netdev = data;
        struct e1000_adapter *adapter = netdev_priv(netdev);
        struct e1000_hw *hw = &adapter->hw;
+       u32 icr;
+       bool enable = true;
+
+       icr = er32(ICR);
+       if (icr & E1000_ICR_RXO) {
+               ew32(ICR, E1000_ICR_RXO);
+               enable = false;
+               /* napi poll will re-enable Other, make sure it runs */
+               if (napi_schedule_prep(&adapter->napi)) {
+                       adapter->total_rx_bytes = 0;
+                       adapter->total_rx_packets = 0;
+                       __napi_schedule(&adapter->napi);
+               }
+       }
+       if (icr & E1000_ICR_LSC) {
+               ew32(ICR, E1000_ICR_LSC);
+               hw->mac.get_link_status = true;
+               /* guard against interrupt when we're going down */
+               if (!test_bit(__E1000_DOWN, &adapter->state))
+                       mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       }
 
-       hw->mac.get_link_status = true;
-
-       /* guard against interrupt when we're going down */
-       if (!test_bit(__E1000_DOWN, &adapter->state)) {
-               mod_timer(&adapter->watchdog_timer, jiffies + 1);
+       if (enable && !test_bit(__E1000_DOWN, &adapter->state))
                ew32(IMS, E1000_IMS_OTHER);
-       }
 
        return IRQ_HANDLED;
 }
@@ -2421,7 +2441,7 @@ static void e1000_clean_tx_ring(struct e1000_ring *tx_ring)
 
        for (i = 0; i < tx_ring->count; i++) {
                buffer_info = &tx_ring->buffer_info[i];
-               e1000_put_txbuf(tx_ring, buffer_info);
+               e1000_put_txbuf(tx_ring, buffer_info, false);
        }
 
        netdev_reset_queue(adapter->netdev);
@@ -2687,7 +2707,8 @@ static int e1000e_poll(struct napi_struct *napi, int weight)
                napi_complete_done(napi, work_done);
                if (!test_bit(__E1000_DOWN, &adapter->state)) {
                        if (adapter->msix_entries)
-                               ew32(IMS, adapter->rx_ring->ims_val);
+                               ew32(IMS, adapter->rx_ring->ims_val |
+                                    E1000_IMS_OTHER);
                        else
                                e1000_irq_enable(adapter);
                }
@@ -3004,8 +3025,8 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
 
        hw->mac.ops.config_collision_dist(hw);
 
-       /* SPT and CNP Si errata workaround to avoid data corruption */
-       if (hw->mac.type >= e1000_pch_spt) {
+       /* SPT and KBL Si errata workaround to avoid data corruption */
+       if (hw->mac.type == e1000_pch_spt) {
                u32 reg_val;
 
                reg_val = er32(IOSFPC);
@@ -3013,7 +3034,9 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
                ew32(IOSFPC, reg_val);
 
                reg_val = er32(TARC(0));
-               reg_val |= E1000_TARC0_CB_MULTIQ_3_REQ;
+               /* SPT and KBL Si errata workaround to avoid Tx hang */
+               reg_val &= ~BIT(28);
+               reg_val |= BIT(29);
                ew32(TARC(0), reg_val);
        }
 }
@@ -3223,14 +3246,6 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
                 */
                ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
                ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
-
-               /* override the delay timers for enabling bursting, only if
-                * the value was not set by the user via module options
-                */
-               if (adapter->rx_int_delay == DEFAULT_RDTR)
-                       adapter->rx_int_delay = BURST_RDTR;
-               if (adapter->rx_abs_int_delay == DEFAULT_RADV)
-                       adapter->rx_abs_int_delay = BURST_RADV;
        }
 
        /* set the Receive Delay Timer Register */
@@ -4204,7 +4219,7 @@ static void e1000e_trigger_lsc(struct e1000_adapter *adapter)
        struct e1000_hw *hw = &adapter->hw;
 
        if (adapter->msix_entries)
-               ew32(ICS, E1000_ICS_OTHER);
+               ew32(ICS, E1000_ICS_LSC | E1000_ICS_OTHER);
        else
                ew32(ICS, E1000_ICS_LSC);
 }
@@ -5074,14 +5089,14 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
 
        /* get_link_status is set on LSC (link status) interrupt or
         * Rx sequence error interrupt.  get_link_status will stay
-        * false until the check_for_link establishes link
+        * true until the check_for_link establishes link
         * for copper adapters ONLY
         */
        switch (hw->phy.media_type) {
        case e1000_media_type_copper:
                if (hw->mac.get_link_status) {
                        ret_val = hw->mac.ops.check_for_link(hw);
-                       link_active = !hw->mac.get_link_status;
+                       link_active = ret_val > 0;
                } else {
                        link_active = true;
                }
@@ -5092,14 +5107,14 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
                break;
        case e1000_media_type_internal_serdes:
                ret_val = hw->mac.ops.check_for_link(hw);
-               link_active = adapter->hw.mac.serdes_has_link;
+               link_active = hw->mac.serdes_has_link;
                break;
        default:
        case e1000_media_type_unknown:
                break;
        }
 
-       if ((ret_val == E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) &&
+       if ((ret_val == -E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) &&
            (er32(CTRL) & E1000_PHY_CTRL_GBE_DISABLE)) {
                /* See e1000_kmrn_lock_loss_workaround_ich8lan() */
                e_info("Gigabit has been disabled, downgrading speed\n");
@@ -5614,7 +5629,7 @@ dma_error:
                        i += tx_ring->count;
                i--;
                buffer_info = &tx_ring->buffer_info[i];
-               e1000_put_txbuf(tx_ring, buffer_info);
+               e1000_put_txbuf(tx_ring, buffer_info, true);
        }
 
        return 0;
@@ -7408,7 +7423,7 @@ static void e1000_remove(struct pci_dev *pdev)
        if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) {
                cancel_work_sync(&adapter->tx_hwtstamp_work);
                if (adapter->tx_hwtstamp_skb) {
-                       dev_kfree_skb_any(adapter->tx_hwtstamp_skb);
+                       dev_consume_skb_any(adapter->tx_hwtstamp_skb);
                        adapter->tx_hwtstamp_skb = NULL;
                }
        }
index 6d8c39a..47da518 100644 (file)
@@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute Interrupt Delay");
 /* Receive Interrupt Delay in units of 1.024 microseconds
  * hardware will likely hang if you set this to anything but zero.
  *
+ * Burst variant is used as default if device has FLAG2_DMA_BURST.
+ *
  * Valid Range: 0-65535
  */
 E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
+#define DEFAULT_RDTR   0
+#define BURST_RDTR     0x20
 #define MAX_RXDELAY 0xFFFF
 #define MIN_RXDELAY 0
 
 /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
  *
+ * Burst variant is used as default if device has FLAG2_DMA_BURST.
+ *
  * Valid Range: 0-65535
  */
 E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
+#define DEFAULT_RADV   8
+#define BURST_RADV     0x20
 #define MAX_RXABSDELAY 0xFFFF
 #define MIN_RXABSDELAY 0
 
@@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
                                         .max = MAX_RXDELAY } }
                };
 
+               if (adapter->flags2 & FLAG2_DMA_BURST)
+                       opt.def = BURST_RDTR;
+
                if (num_RxIntDelay > bd) {
                        adapter->rx_int_delay = RxIntDelay[bd];
                        e1000_validate_option(&adapter->rx_int_delay, &opt,
@@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter *adapter)
        }
        /* Receive Absolute Interrupt Delay */
        {
-               static const struct e1000_option opt = {
+               static struct e1000_option opt = {
                        .type = range_option,
                        .name = "Receive Absolute Interrupt Delay",
                        .err  = "using default of "
@@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
                                         .max = MAX_RXABSDELAY } }
                };
 
+               if (adapter->flags2 & FLAG2_DMA_BURST)
+                       opt.def = BURST_RADV;
+
                if (num_RxAbsIntDelay > bd) {
                        adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
                        e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
index d78d47b..86ff096 100644 (file)
@@ -1744,6 +1744,7 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations,
        s32 ret_val = 0;
        u16 i, phy_status;
 
+       *success = false;
        for (i = 0; i < iterations; i++) {
                /* Some PHYs require the MII_BMSR register to be read
                 * twice due to the link bit being sticky.  No harm doing
@@ -1763,16 +1764,16 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations,
                ret_val = e1e_rphy(hw, MII_BMSR, &phy_status);
                if (ret_val)
                        break;
-               if (phy_status & BMSR_LSTATUS)
+               if (phy_status & BMSR_LSTATUS) {
+                       *success = true;
                        break;
+               }
                if (usec_interval >= 1000)
                        msleep(usec_interval / 1000);
                else
                        udelay(usec_interval);
        }
 
-       *success = (i < iterations);
-
        return ret_val;
 }
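
The phy.c hunk initialises *success to false up front and sets it to true only when BMSR actually reports link, instead of inferring success from the loop counter after the fact. A standalone sketch of that output-parameter pattern (the polling body is a stand-in, not the PHY access):

#include <stdio.h>
#include <stdbool.h>

/* Poll up to 'iterations' times; report link via *success, errors via return. */
static int wait_for_link(int iterations, bool *success)
{
        int i;

        *success = false;               /* pessimistic default, as in the hunk */

        for (i = 0; i < iterations; i++) {
                bool link_up = (i == 2);        /* stand-in for reading BMSR */

                if (link_up) {
                        *success = true;
                        break;
                }
        }
        return 0;
}

int main(void)
{
        bool up;

        wait_for_link(5, &up);
        printf("link %s\n", up ? "up" : "down");
        return 0;
}
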
 
index 439c63c..8139b4e 100644 (file)
@@ -350,7 +350,7 @@ struct i40e_pf {
        u16 num_vmdq_vsis;         /* num vmdq vsis this PF has set up */
        u16 num_vmdq_qps;          /* num queue pairs per vmdq pool */
        u16 num_vmdq_msix;         /* num queue vectors per vmdq pool */
-       u16 num_req_vfs;           /* num VFs requested for this VF */
+       u16 num_req_vfs;           /* num VFs requested for this PF */
        u16 num_vf_qps;            /* num queue pairs per VF */
        u16 num_lan_qps;           /* num lan queues this PF has set up */
        u16 num_lan_msix;          /* num queue vectors for the base PF vsi */
@@ -403,55 +403,57 @@ struct i40e_pf {
        struct timer_list service_timer;
        struct work_struct service_task;
 
-       u64 hw_features;
-#define I40E_HW_RSS_AQ_CAPABLE                 BIT_ULL(0)
-#define I40E_HW_128_QP_RSS_CAPABLE             BIT_ULL(1)
-#define I40E_HW_ATR_EVICT_CAPABLE              BIT_ULL(2)
-#define I40E_HW_WB_ON_ITR_CAPABLE              BIT_ULL(3)
-#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE    BIT_ULL(4)
-#define I40E_HW_NO_PCI_LINK_CHECK              BIT_ULL(5)
-#define I40E_HW_100M_SGMII_CAPABLE             BIT_ULL(6)
-#define I40E_HW_NO_DCB_SUPPORT                 BIT_ULL(7)
-#define I40E_HW_USE_SET_LLDP_MIB               BIT_ULL(8)
-#define I40E_HW_GENEVE_OFFLOAD_CAPABLE         BIT_ULL(9)
-#define I40E_HW_PTP_L4_CAPABLE                 BIT_ULL(10)
-#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE          BIT_ULL(11)
-#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE       BIT_ULL(12)
-#define I40E_HW_HAVE_CRT_RETIMER               BIT_ULL(13)
-#define I40E_HW_OUTER_UDP_CSUM_CAPABLE         BIT_ULL(14)
-#define I40E_HW_PHY_CONTROLS_LEDS              BIT_ULL(15)
-#define I40E_HW_STOP_FW_LLDP                   BIT_ULL(16)
-#define I40E_HW_PORT_ID_VALID                  BIT_ULL(17)
-#define I40E_HW_RESTART_AUTONEG                        BIT_ULL(18)
-
-       u64 flags;
-#define I40E_FLAG_RX_CSUM_ENABLED              BIT_ULL(1)
-#define I40E_FLAG_MSI_ENABLED                  BIT_ULL(2)
-#define I40E_FLAG_MSIX_ENABLED                 BIT_ULL(3)
-#define I40E_FLAG_HW_ATR_EVICT_ENABLED         BIT_ULL(4)
-#define I40E_FLAG_RSS_ENABLED                  BIT_ULL(6)
-#define I40E_FLAG_VMDQ_ENABLED                 BIT_ULL(7)
-#define I40E_FLAG_IWARP_ENABLED                        BIT_ULL(10)
-#define I40E_FLAG_FILTER_SYNC                  BIT_ULL(15)
-#define I40E_FLAG_SERVICE_CLIENT_REQUESTED     BIT_ULL(16)
-#define I40E_FLAG_SRIOV_ENABLED                        BIT_ULL(19)
-#define I40E_FLAG_DCB_ENABLED                  BIT_ULL(20)
-#define I40E_FLAG_FD_SB_ENABLED                        BIT_ULL(21)
-#define I40E_FLAG_FD_ATR_ENABLED               BIT_ULL(22)
-#define I40E_FLAG_FD_SB_AUTO_DISABLED          BIT_ULL(23)
-#define I40E_FLAG_FD_ATR_AUTO_DISABLED         BIT_ULL(24)
-#define I40E_FLAG_PTP                          BIT_ULL(25)
-#define I40E_FLAG_MFP_ENABLED                  BIT_ULL(26)
-#define I40E_FLAG_UDP_FILTER_SYNC              BIT_ULL(27)
-#define I40E_FLAG_DCB_CAPABLE                  BIT_ULL(29)
-#define I40E_FLAG_VEB_STATS_ENABLED            BIT_ULL(37)
-#define I40E_FLAG_LINK_POLLING_ENABLED         BIT_ULL(39)
-#define I40E_FLAG_VEB_MODE_ENABLED             BIT_ULL(40)
-#define I40E_FLAG_TRUE_PROMISC_SUPPORT         BIT_ULL(51)
-#define I40E_FLAG_CLIENT_RESET                 BIT_ULL(54)
-#define I40E_FLAG_TEMP_LINK_POLLING            BIT_ULL(55)
-#define I40E_FLAG_CLIENT_L2_CHANGE             BIT_ULL(56)
-#define I40E_FLAG_LEGACY_RX                    BIT_ULL(58)
+       u32 hw_features;
+#define I40E_HW_RSS_AQ_CAPABLE                 BIT(0)
+#define I40E_HW_128_QP_RSS_CAPABLE             BIT(1)
+#define I40E_HW_ATR_EVICT_CAPABLE              BIT(2)
+#define I40E_HW_WB_ON_ITR_CAPABLE              BIT(3)
+#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE    BIT(4)
+#define I40E_HW_NO_PCI_LINK_CHECK              BIT(5)
+#define I40E_HW_100M_SGMII_CAPABLE             BIT(6)
+#define I40E_HW_NO_DCB_SUPPORT                 BIT(7)
+#define I40E_HW_USE_SET_LLDP_MIB               BIT(8)
+#define I40E_HW_GENEVE_OFFLOAD_CAPABLE         BIT(9)
+#define I40E_HW_PTP_L4_CAPABLE                 BIT(10)
+#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE          BIT(11)
+#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE       BIT(12)
+#define I40E_HW_HAVE_CRT_RETIMER               BIT(13)
+#define I40E_HW_OUTER_UDP_CSUM_CAPABLE         BIT(14)
+#define I40E_HW_PHY_CONTROLS_LEDS              BIT(15)
+#define I40E_HW_STOP_FW_LLDP                   BIT(16)
+#define I40E_HW_PORT_ID_VALID                  BIT(17)
+#define I40E_HW_RESTART_AUTONEG                        BIT(18)
+
+       u32 flags;
+#define I40E_FLAG_RX_CSUM_ENABLED              BIT(0)
+#define I40E_FLAG_MSI_ENABLED                  BIT(1)
+#define I40E_FLAG_MSIX_ENABLED                 BIT(2)
+#define I40E_FLAG_RSS_ENABLED                  BIT(3)
+#define I40E_FLAG_VMDQ_ENABLED                 BIT(4)
+#define I40E_FLAG_FILTER_SYNC                  BIT(5)
+#define I40E_FLAG_SRIOV_ENABLED                        BIT(6)
+#define I40E_FLAG_DCB_CAPABLE                  BIT(7)
+#define I40E_FLAG_DCB_ENABLED                  BIT(8)
+#define I40E_FLAG_FD_SB_ENABLED                        BIT(9)
+#define I40E_FLAG_FD_ATR_ENABLED               BIT(10)
+#define I40E_FLAG_FD_SB_AUTO_DISABLED          BIT(11)
+#define I40E_FLAG_FD_ATR_AUTO_DISABLED         BIT(12)
+#define I40E_FLAG_MFP_ENABLED                  BIT(13)
+#define I40E_FLAG_UDP_FILTER_SYNC              BIT(14)
+#define I40E_FLAG_HW_ATR_EVICT_ENABLED         BIT(15)
+#define I40E_FLAG_VEB_MODE_ENABLED             BIT(16)
+#define I40E_FLAG_VEB_STATS_ENABLED            BIT(17)
+#define I40E_FLAG_LINK_POLLING_ENABLED         BIT(18)
+#define I40E_FLAG_TRUE_PROMISC_SUPPORT         BIT(19)
+#define I40E_FLAG_TEMP_LINK_POLLING            BIT(20)
+#define I40E_FLAG_LEGACY_RX                    BIT(21)
+#define I40E_FLAG_PTP                          BIT(22)
+#define I40E_FLAG_IWARP_ENABLED                        BIT(23)
+#define I40E_FLAG_SERVICE_CLIENT_REQUESTED     BIT(24)
+#define I40E_FLAG_CLIENT_L2_CHANGE             BIT(25)
+#define I40E_FLAG_CLIENT_RESET                 BIT(26)
+#define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED   BIT(27)
+#define I40E_FLAG_SOURCE_PRUNING_DISABLED      BIT(28)
 
        struct i40e_client_instance *cinst;
        bool stat_offsets_loaded;
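The hunk above shrinks pf->flags from a u64 to a u32 and renumbers its bits densely with BIT() instead of BIT_ULL(), keeping the read-only capability bits in the separate hw_features word. The narrower type lets runtime flag updates use plain cmpxchg(), which is more widely usable than cmpxchg64() (see the ethtool private-flags hunk further down). A minimal sketch of such an update, assuming the i40e headers are in scope; the helper name is hypothetical, not part of the driver:

/* Atomically set a runtime flag in the 32-bit pf->flags word. */
static void i40e_set_flag_atomic(struct i40e_pf *pf, u32 flag)
{
        u32 old, new;

        do {
                old = READ_ONCE(pf->flags);
                new = old | flag;
        } while (cmpxchg(&pf->flags, old, new) != old);
}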
@@ -947,9 +949,6 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector)
        struct i40e_hw *hw = &pf->hw;
        u32 val;
 
-       /* definitely clear the PBA here, as this function is meant to
-        * clean out all previous interrupts AND enable the interrupt
-        */
        val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
              I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
              (I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT);
@@ -958,7 +957,7 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector)
 }
 
 void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf);
-void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba);
+void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf);
 int i40e_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd);
 int i40e_open(struct net_device *netdev);
 int i40e_close(struct net_device *netdev);
index 4c85ea9..a8f65ae 100644
@@ -1771,9 +1771,10 @@ enum i40e_aq_phy_type {
        I40E_PHY_TYPE_25GBASE_CR                = 0x20,
        I40E_PHY_TYPE_25GBASE_SR                = 0x21,
        I40E_PHY_TYPE_25GBASE_LR                = 0x22,
+       I40E_PHY_TYPE_MAX,
+       I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP   = 0xFD,
        I40E_PHY_TYPE_EMPTY                     = 0xFE,
        I40E_PHY_TYPE_DEFAULT                   = 0xFF,
-       I40E_PHY_TYPE_MAX
 };
 
 #define I40E_LINK_SPEED_100MB_SHIFT    0x1
index 60542be..53aad37 100644
@@ -1567,30 +1567,46 @@ i40e_status i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
        struct i40e_aq_desc desc;
        i40e_status status;
        u16 abilities_size = sizeof(struct i40e_aq_get_phy_abilities_resp);
+       u16 max_delay = I40E_MAX_PHY_TIMEOUT, total_delay = 0;
 
        if (!abilities)
                return I40E_ERR_PARAM;
 
-       i40e_fill_default_direct_cmd_desc(&desc,
-                                         i40e_aqc_opc_get_phy_abilities);
+       do {
+               i40e_fill_default_direct_cmd_desc(&desc,
+                                              i40e_aqc_opc_get_phy_abilities);
 
-       desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
-       if (abilities_size > I40E_AQ_LARGE_BUF)
-               desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+               desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+               if (abilities_size > I40E_AQ_LARGE_BUF)
+                       desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
 
-       if (qualified_modules)
-               desc.params.external.param0 |=
+               if (qualified_modules)
+                       desc.params.external.param0 |=
                        cpu_to_le32(I40E_AQ_PHY_REPORT_QUALIFIED_MODULES);
 
-       if (report_init)
-               desc.params.external.param0 |=
+               if (report_init)
+                       desc.params.external.param0 |=
                        cpu_to_le32(I40E_AQ_PHY_REPORT_INITIAL_VALUES);
 
-       status = i40e_asq_send_command(hw, &desc, abilities, abilities_size,
-                                      cmd_details);
+               status = i40e_asq_send_command(hw, &desc, abilities,
+                                              abilities_size, cmd_details);
 
-       if (hw->aq.asq_last_status == I40E_AQ_RC_EIO)
-               status = I40E_ERR_UNKNOWN_PHY;
+               if (status)
+                       break;
+
+               if (hw->aq.asq_last_status == I40E_AQ_RC_EIO) {
+                       status = I40E_ERR_UNKNOWN_PHY;
+                       break;
+               } else if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN) {
+                       usleep_range(1000, 2000);
+                       total_delay++;
+                       status = I40E_ERR_TIMEOUT;
+               }
+       } while ((hw->aq.asq_last_status != I40E_AQ_RC_OK) &&
+                (total_delay < max_delay));
+
+       if (status)
+               return status;
 
        if (report_init) {
                if (hw->mac.type ==  I40E_MAC_XL710 &&
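The loop above bounds the retries: I40E_MAX_PHY_TIMEOUT is 500, and each I40E_AQ_RC_EAGAIN response adds a 1-2 ms sleep, so the command is abandoned with I40E_ERR_TIMEOUT after roughly half a second to a second at worst. The retry shape, as a hedged sketch (issue_aq_command() is a placeholder, not a driver function):

static i40e_status i40e_aq_retry_eagain(struct i40e_hw *hw)
{
        i40e_status status;
        u16 retries = 0;

        do {
                status = issue_aq_command(hw);  /* placeholder for the real send */
                if (status)
                        break;                  /* transport error: give up */
                if (hw->aq.asq_last_status != I40E_AQ_RC_EAGAIN)
                        break;                  /* OK or a hard error code */
                usleep_range(1000, 2000);       /* let the firmware settle */
        } while (++retries < I40E_MAX_PHY_TIMEOUT);

        return status;
}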
index 8f326f8..6f2725f 100644
@@ -278,8 +278,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
                         rx_ring->netdev,
                         rx_ring->rx_bi);
                dev_info(&pf->pdev->dev,
-                        "    rx_rings[%i]: state = %li, queue_index = %d, reg_idx = %d\n",
-                        i, rx_ring->state,
+                        "    rx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n",
+                        i, *rx_ring->state,
                         rx_ring->queue_index,
                         rx_ring->reg_idx);
                dev_info(&pf->pdev->dev,
@@ -334,8 +334,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
                         tx_ring->netdev,
                         tx_ring->tx_bi);
                dev_info(&pf->pdev->dev,
-                        "    tx_rings[%i]: state = %li, queue_index = %d, reg_idx = %d\n",
-                        i, tx_ring->state,
+                        "    tx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n",
+                        i, *tx_ring->state,
                         tx_ring->queue_index,
                         tx_ring->reg_idx);
                dev_info(&pf->pdev->dev,
index 1136d02..afd3ca8 100644
@@ -227,6 +227,8 @@ static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = {
        I40E_PRIV_FLAG("veb-stats", I40E_FLAG_VEB_STATS_ENABLED, 0),
        I40E_PRIV_FLAG("hw-atr-eviction", I40E_FLAG_HW_ATR_EVICT_ENABLED, 0),
        I40E_PRIV_FLAG("legacy-rx", I40E_FLAG_LEGACY_RX, 0),
+       I40E_PRIV_FLAG("disable-source-pruning",
+                      I40E_FLAG_SOURCE_PRUNING_DISABLED, 0),
 };
 
 #define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags)
@@ -2008,7 +2010,9 @@ static int i40e_set_phys_id(struct net_device *netdev,
                if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) {
                        pf->led_status = i40e_led_get(hw);
                } else {
-                       i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, NULL);
+                       if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE))
+                               i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL,
+                                                     NULL);
                        ret = i40e_led_get_phy(hw, &temp_status,
                                               &pf->phy_led_val);
                        pf->led_status = temp_status;
@@ -2033,7 +2037,8 @@ static int i40e_set_phys_id(struct net_device *netdev,
                        ret = i40e_led_set_phy(hw, false, pf->led_status,
                                               (pf->phy_led_val |
                                               I40E_PHY_LED_MODE_ORIG));
-                       i40e_aq_set_phy_debug(hw, 0, NULL);
+                       if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE))
+                               i40e_aq_set_phy_debug(hw, 0, NULL);
                }
                break;
        default:
@@ -4090,7 +4095,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags)
        struct i40e_netdev_priv *np = netdev_priv(dev);
        struct i40e_vsi *vsi = np->vsi;
        struct i40e_pf *pf = vsi->back;
-       u64 orig_flags, new_flags, changed_flags;
+       u32 orig_flags, new_flags, changed_flags;
        u32 i, j;
 
        orig_flags = READ_ONCE(pf->flags);
@@ -4142,12 +4147,12 @@ flags_complete:
                return -EOPNOTSUPP;
 
        /* Compare and exchange the new flags into place. If we failed, that
-        * is if cmpxchg64 returns anything but the old value, this means that
+        * is if cmpxchg returns anything but the old value, this means that
         * something else has modified the flags variable since we copied it
         * originally. We'll just punt with an error and log something in the
         * message buffer.
         */
-       if (cmpxchg64(&pf->flags, orig_flags, new_flags) != orig_flags) {
+       if (cmpxchg(&pf->flags, orig_flags, new_flags) != orig_flags) {
                dev_warn(&pf->pdev->dev,
                         "Unable to update pf->flags as it was modified by another thread...\n");
                return -EAGAIN;
@@ -4189,8 +4194,9 @@ flags_complete:
        /* Issue reset to cause things to take effect, as additional bits
         * are added we will need to create a mask of bits requiring reset
         */
-       if ((changed_flags & I40E_FLAG_VEB_STATS_ENABLED) ||
-           ((changed_flags & I40E_FLAG_LEGACY_RX) && netif_running(dev)))
+       if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED |
+                            I40E_FLAG_LEGACY_RX |
+                            I40E_FLAG_SOURCE_PRUNING_DISABLED))
                i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true);
 
        return 0;
index 3f9e89b..4de5200 100644
@@ -1776,11 +1776,6 @@ static void i40e_set_rx_mode(struct net_device *netdev)
                vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED;
                vsi->back->flags |= I40E_FLAG_FILTER_SYNC;
        }
-
-       /* schedule our worker thread which will take care of
-        * applying the new filter changes
-        */
-       i40e_service_event_schedule(vsi->back);
 }
 
 /**
@@ -2884,22 +2879,18 @@ static void i40e_vsi_free_rx_resources(struct i40e_vsi *vsi)
  **/
 static void i40e_config_xps_tx_ring(struct i40e_ring *ring)
 {
-       struct i40e_vsi *vsi = ring->vsi;
+       int cpu;
 
        if (!ring->q_vector || !ring->netdev)
                return;
 
-       if ((vsi->tc_config.numtc <= 1) &&
-           !test_and_set_bit(__I40E_TX_XPS_INIT_DONE, &ring->state)) {
-               netif_set_xps_queue(ring->netdev,
-                                   get_cpu_mask(ring->q_vector->v_idx),
-                                   ring->queue_index);
-       }
+       /* We only initialize XPS once, so as not to overwrite user settings */
+       if (test_and_set_bit(__I40E_TX_XPS_INIT_DONE, ring->state))
+               return;
 
-       /* schedule our worker thread which will take care of
-        * applying the new filter changes
-        */
-       i40e_service_event_schedule(vsi->back);
+       cpu = cpumask_local_spread(ring->q_vector->v_idx, -1);
+       netif_set_xps_queue(ring->netdev, get_cpu_mask(cpu),
+                           ring->queue_index);
 }
 
 /**
@@ -3009,7 +3000,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        struct i40e_hmc_obj_rxq rx_ctx;
        i40e_status err = 0;
 
-       ring->state = 0;
+       bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
 
        /* clear the context structure first */
        memset(&rx_ctx, 0, sizeof(rx_ctx));
@@ -3034,7 +3025,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
        if (hw->revision_id == 0)
                rx_ctx.lrxqthresh = 0;
        else
-               rx_ctx.lrxqthresh = 2;
+               rx_ctx.lrxqthresh = 1;
        rx_ctx.crcstrip = 1;
        rx_ctx.l2tsel = 1;
        /* this controls whether VLAN is stripped from inner headers */
@@ -3407,15 +3398,14 @@ void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf)
 /**
  * i40e_irq_dynamic_enable_icr0 - Enable default interrupt generation for icr0
  * @pf: board private structure
- * @clearpba: true when all pending interrupt events should be cleared
  **/
-void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba)
+void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf)
 {
        struct i40e_hw *hw = &pf->hw;
        u32 val;
 
        val = I40E_PFINT_DYN_CTL0_INTENA_MASK   |
-             (clearpba ? I40E_PFINT_DYN_CTL0_CLEARPBA_MASK : 0) |
+             I40E_PFINT_DYN_CTL0_CLEARPBA_MASK |
              (I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT);
 
        wr32(hw, I40E_PFINT_DYN_CTL0, val);
@@ -3482,6 +3472,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename)
        int tx_int_idx = 0;
        int vector, err;
        int irq_num;
+       int cpu;
 
        for (vector = 0; vector < q_vectors; vector++) {
                struct i40e_q_vector *q_vector = vsi->q_vectors[vector];
@@ -3517,10 +3508,14 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename)
                q_vector->affinity_notify.notify = i40e_irq_affinity_notify;
                q_vector->affinity_notify.release = i40e_irq_affinity_release;
                irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify);
-               /* get_cpu_mask returns a static constant mask with
-                * a permanent lifetime so it's ok to use here.
+               /* Spread affinity hints out across online CPUs.
+                *
+                * get_cpu_mask returns a static constant mask with
+                * a permanent lifetime so it's ok to pass to
+                * irq_set_affinity_hint without making a copy.
                 */
-               irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx));
+               cpu = cpumask_local_spread(q_vector->v_idx, -1);
+               irq_set_affinity_hint(irq_num, get_cpu_mask(cpu));
        }
 
        vsi->irqs_ready = true;
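cpumask_local_spread() is what makes the hint spreading above work: it maps the vector index to the i-th online CPU, preferring CPUs on the given NUMA node (-1 means no preference), rather than assuming CPU numbering matches vector numbering. A small sketch with a hypothetical helper name:

static void i40e_hint_vector_affinity(int irq_num, int v_idx)
{
        int cpu = cpumask_local_spread(v_idx, -1);

        /* get_cpu_mask() returns a static constant mask, so it can be
         * handed to irq_set_affinity_hint() without copying.
         */
        irq_set_affinity_hint(irq_num, get_cpu_mask(cpu));
}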
@@ -3596,7 +3591,7 @@ static int i40e_vsi_enable_irq(struct i40e_vsi *vsi)
                for (i = 0; i < vsi->num_q_vectors; i++)
                        i40e_irq_dynamic_enable(vsi, i);
        } else {
-               i40e_irq_dynamic_enable_icr0(pf, true);
+               i40e_irq_dynamic_enable_icr0(pf);
        }
 
        i40e_flush(&pf->hw);
@@ -3745,7 +3740,7 @@ enable_intr:
        wr32(hw, I40E_PFINT_ICR0_ENA, ena_mask);
        if (!test_bit(__I40E_DOWN, pf->state)) {
                i40e_service_event_schedule(pf);
-               i40e_irq_dynamic_enable_icr0(pf, false);
+               i40e_irq_dynamic_enable_icr0(pf);
        }
 
        return ret;
@@ -6231,6 +6226,7 @@ void i40e_fdir_check_and_reenable(struct i40e_pf *pf)
                                hlist_del(&filter->fdir_node);
                                kfree(filter);
                                pf->fdir_pf_active_filters--;
+                               pf->fd_inv = 0;
                        }
                }
        }
@@ -6557,12 +6553,26 @@ static void i40e_handle_link_event(struct i40e_pf *pf,
         */
        i40e_link_event(pf);
 
-       /* check for unqualified module, if link is down */
-       if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) &&
-           (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) &&
-           (!(status->link_info & I40E_AQ_LINK_UP)))
+       /* Check if module meets thermal requirements */
+       if (status->phy_type == I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP) {
+               dev_err(&pf->pdev->dev,
+                       "Rx/Tx is disabled on this device because the module does not meet thermal requirements.\n");
                dev_err(&pf->pdev->dev,
-                       "The driver failed to link because an unqualified module was detected.\n");
+                       "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n");
+       } else {
+               /* check for unqualified module, if link is down, suppress
+                * the message if link was forced to be down.
+                */
+               if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) &&
+                   (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) &&
+                   (!(status->link_info & I40E_AQ_LINK_UP)) &&
+                   (!(pf->flags & I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED))) {
+                       dev_err(&pf->pdev->dev,
+                               "Rx/Tx is disabled on this device because an unsupported SFP module type was detected.\n");
+                       dev_err(&pf->pdev->dev,
+                               "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n");
+               }
+       }
 }
 
 /**
@@ -7678,7 +7688,7 @@ static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi)
 
 /**
  * i40e_vsi_alloc_arrays - Allocate queue and vector pointer arrays for the vsi
- * @type: VSI pointer
+ * @vsi: VSI pointer
  * @alloc_qvectors: a bool to specify if q_vectors need to be allocated.
  *
  * On error: returns error code (negative)
@@ -8439,7 +8449,7 @@ static int i40e_setup_misc_vector(struct i40e_pf *pf)
 
        i40e_flush(hw);
 
-       i40e_irq_dynamic_enable_icr0(pf, true);
+       i40e_irq_dynamic_enable_icr0(pf);
 
        return err;
 }
@@ -8967,8 +8977,8 @@ static int i40e_sw_init(struct i40e_pf *pf)
                    I40E_FLAG_MSIX_ENABLED;
 
        /* Set default ITR */
-       pf->rx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF;
-       pf->tx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF;
+       pf->rx_itr_default = I40E_ITR_RX_DEF;
+       pf->tx_itr_default = I40E_ITR_TX_DEF;
 
        /* Depending on PF configurations, it is possible that the RSS
         * maximum might end up larger than the available queues
@@ -9068,6 +9078,11 @@ static int i40e_sw_init(struct i40e_pf *pf)
            (pf->hw.aq.fw_maj_ver >= 5)))
                pf->hw_features |= I40E_HW_USE_SET_LLDP_MIB;
 
+       /* Enable PTP L4 if FW > v6.0 */
+       if (pf->hw.mac.type == I40E_MAC_XL710 &&
+           pf->hw.aq.fw_maj_ver >= 6)
+               pf->hw_features |= I40E_HW_PTP_L4_CAPABLE;
+
        if (pf->hw.func_caps.vmdq) {
                pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI;
                pf->flags |= I40E_FLAG_VMDQ_ENABLED;
@@ -9903,6 +9918,31 @@ static int i40e_add_vsi(struct i40e_vsi *vsi)
 
                enabled_tc = i40e_pf_get_tc_map(pf);
 
+               /* Source pruning is enabled by default, so the flag is
+                * negative logic - if it's set, we need to fiddle with
+                * the VSI to disable source pruning.
+                */
+               if (pf->flags & I40E_FLAG_SOURCE_PRUNING_DISABLED) {
+                       memset(&ctxt, 0, sizeof(ctxt));
+                       ctxt.seid = pf->main_vsi_seid;
+                       ctxt.pf_num = pf->hw.pf_id;
+                       ctxt.vf_num = 0;
+                       ctxt.info.valid_sections |=
+                                    cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID);
+                       ctxt.info.switch_id =
+                                  cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB);
+                       ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL);
+                       if (ret) {
+                               dev_info(&pf->pdev->dev,
+                                        "update vsi failed, err %s aq_err %s\n",
+                                        i40e_stat_str(&pf->hw, ret),
+                                        i40e_aq_str(&pf->hw,
+                                                    pf->hw.aq.asq_last_status));
+                               ret = -ENOENT;
+                               goto err;
+                       }
+               }
+
                /* MFP mode setup queue map and update VSI */
                if ((pf->flags & I40E_FLAG_MFP_ENABLED) &&
                    !(pf->hw.func_caps.iscsi)) { /* NIC type PF */
@@ -12000,6 +12040,28 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev)
 }
 
 /**
+ * i40e_pci_error_reset_prepare - prepare device driver for pci reset
+ * @pdev: PCI device information struct
+ */
+static void i40e_pci_error_reset_prepare(struct pci_dev *pdev)
+{
+       struct i40e_pf *pf = pci_get_drvdata(pdev);
+
+       i40e_prep_for_reset(pf, false);
+}
+
+/**
+ * i40e_pci_error_reset_done - pci reset done, device driver reset can begin
+ * @pdev: PCI device information struct
+ */
+static void i40e_pci_error_reset_done(struct pci_dev *pdev)
+{
+       struct i40e_pf *pf = pci_get_drvdata(pdev);
+
+       i40e_reset_and_rebuild(pf, false, false);
+}
+
+/**
  * i40e_pci_error_resume - restart operations after PCI error recovery
  * @pdev: PCI device information struct
  *
@@ -12189,6 +12251,8 @@ static int i40e_resume(struct device *dev)
 static const struct pci_error_handlers i40e_err_handler = {
        .error_detected = i40e_pci_error_detected,
        .slot_reset = i40e_pci_error_slot_reset,
+       .reset_prepare = i40e_pci_error_reset_prepare,
+       .reset_done = i40e_pci_error_reset_done,
        .resume = i40e_pci_error_resume,
 };
 
index 57505b1..151d9cf 100644
@@ -311,13 +311,10 @@ static i40e_status i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset,
 static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw,
                                        u16 offset, u16 *data)
 {
-       i40e_status ret_code = 0;
-
        if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE)
-               ret_code = i40e_read_nvm_word_aq(hw, offset, data);
-       else
-               ret_code = i40e_read_nvm_word_srctl(hw, offset, data);
-       return ret_code;
+               return i40e_read_nvm_word_aq(hw, offset, data);
+
+       return i40e_read_nvm_word_srctl(hw, offset, data);
 }
 
 /**
@@ -331,7 +328,7 @@ static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw,
 i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset,
                               u16 *data)
 {
-       i40e_status ret_code = 0;
+       i40e_status ret_code;
 
        ret_code = i40e_acquire_nvm(hw, I40E_RESOURCE_READ);
        if (ret_code)
@@ -446,13 +443,10 @@ static i40e_status __i40e_read_nvm_buffer(struct i40e_hw *hw,
                                          u16 offset, u16 *words,
                                          u16 *data)
 {
-       i40e_status ret_code = 0;
-
        if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE)
-               ret_code = i40e_read_nvm_buffer_aq(hw, offset, words, data);
-       else
-               ret_code = i40e_read_nvm_buffer_srctl(hw, offset, words, data);
-       return ret_code;
+               return i40e_read_nvm_buffer_aq(hw, offset, words, data);
+
+       return i40e_read_nvm_buffer_srctl(hw, offset, words, data);
 }
 
 /**
index 86ca27f..c234758 100644
 #define I40E_GLV_RUPP_MAX_INDEX 383
 #define I40E_GLV_RUPP_RUPP_SHIFT 0
 #define I40E_GLV_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RUPP_RUPP_SHIFT)
-#define I40E_GLV_TEPC(_VSI) (0x00344000 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */
+#define I40E_GLV_TEPC(_i) (0x00344000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */
 #define I40E_GLV_TEPC_MAX_INDEX 383
 #define I40E_GLV_TEPC_TEPC_SHIFT 0
 #define I40E_GLV_TEPC_TEPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_TEPC_TEPC_SHIFT)
index d9fdf69..a23306f 100644
@@ -1372,6 +1372,15 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
        union i40e_rx_desc *rx_desc;
        struct i40e_rx_buffer *bi;
 
+       /* Hardware only fetches new descriptors in cache lines of 8,
+        * essentially ignoring the lower 3 bits of the tail register. We want
+        * to ensure our tail writes are aligned to avoid unnecessary work. We
+        * can't simply round down the cleaned count, since we might fail to
+        * allocate some buffers. What we really want is to ensure that
+        * next_to_use + cleaned_count produces an aligned value.
+        */
+       cleaned_count -= (ntu + cleaned_count) & 0x7;
+
        /* do nothing if no valid netdev defined */
        if (!rx_ring->netdev || !cleaned_count)
                return false;
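A worked example of the adjustment above, with made-up values: if next_to_use is 13 and 20 buffers are to be refilled, (13 + 20) & 0x7 is 1, so one buffer is held back; the refill then ends at descriptor 32, a multiple of the 8-descriptor granularity the hardware fetches, and the withheld buffer is simply picked up on the next pass. As a tiny helper (hypothetical, the real code is inline):

static u16 i40e_align_refill(u16 ntu, u16 cleaned_count)
{
        /* trim so (ntu + cleaned_count) becomes a multiple of 8 */
        return cleaned_count - ((ntu + cleaned_count) & 0x7);
}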
@@ -2202,9 +2211,7 @@ static u32 i40e_buildreg_itr(const int type, const u16 itr)
        u32 val;
 
        val = I40E_PFINT_DYN_CTLN_INTENA_MASK |
-             /* Don't clear PBA because that can cause lost interrupts that
-              * came in while we were cleaning/polling
-              */
+             I40E_PFINT_DYN_CTLN_CLEARPBA_MASK |
              (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) |
              (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT);
 
@@ -2241,7 +2248,7 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi,
 
        /* If we don't have MSIX, then we only need to re-enable icr0 */
        if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED)) {
-               i40e_irq_dynamic_enable_icr0(vsi->back, false);
+               i40e_irq_dynamic_enable_icr0(vsi->back);
                return;
        }
 
@@ -3167,38 +3174,12 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
        /* write last descriptor with EOP bit */
        td_cmd |= I40E_TX_DESC_CMD_EOP;
 
-       /* We can OR these values together as they both are checked against
-        * 4 below and at this point desc_count will be used as a boolean value
-        * after this if/else block.
+       /* We OR these values together to check both against 4 (WB_STRIDE)
+        * below. This is safe since we don't re-use desc_count afterwards.
         */
        desc_count |= ++tx_ring->packet_stride;
 
-       /* Algorithm to optimize tail and RS bit setting:
-        * if queue is stopped
-        *      mark RS bit
-        *      reset packet counter
-        * else if xmit_more is supported and is true
-        *      advance packet counter to 4
-        *      reset desc_count to 0
-        *
-        * if desc_count >= 4
-        *      mark RS bit
-        *      reset packet counter
-        * if desc_count > 0
-        *      update tail
-        *
-        * Note: If there are less than 4 descriptors
-        * pending and interrupts were disabled the service task will
-        * trigger a force WB.
-        */
-       if (netif_xmit_stopped(txring_txq(tx_ring))) {
-               goto do_rs;
-       } else if (skb->xmit_more) {
-               /* set stride to arm on next packet and reset desc_count */
-               tx_ring->packet_stride = WB_STRIDE;
-               desc_count = 0;
-       } else if (desc_count >= WB_STRIDE) {
-do_rs:
+       if (desc_count >= WB_STRIDE) {
                /* write last descriptor with RS bit set */
                td_cmd |= I40E_TX_DESC_CMD_RS;
                tx_ring->packet_stride = 0;
@@ -3219,7 +3200,7 @@ do_rs:
        first->next_to_watch = tx_desc;
 
        /* notify HW of packet */
-       if (desc_count) {
+       if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
                writel(i, tx_ring->tail);
 
                /* we need this if more than one processor can write to our tail
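The simplified doorbell rule above reads: keep batching tail writes while the stack promises more packets via skb->xmit_more, but always flush when the queue has been stopped so pending descriptors are not stranded. As a sketch (hypothetical helper; the real check is open-coded):

static bool i40e_should_write_tail(struct i40e_ring *tx_ring,
                                   const struct sk_buff *skb)
{
        return netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more;
}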
index 2f848bc..ff57ae4 100644
 #define I40E_ITR_8K                0x003E
 #define I40E_ITR_4K                0x007A
 #define I40E_MAX_INTRL             0x3B    /* reg uses 4 usec resolution */
-#define I40E_ITR_RX_DEF            I40E_ITR_20K
-#define I40E_ITR_TX_DEF            I40E_ITR_20K
+#define I40E_ITR_RX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
+#define I40E_ITR_TX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
 #define I40E_ITR_DYNAMIC           0x8000  /* use top bit as a flag */
 #define I40E_MIN_INT_RATE          250     /* ~= 1000000 / (I40E_MAX_ITR * 2) */
 #define I40E_MAX_INT_RATE          500000  /* == 1000000 / (I40E_MIN_ITR * 2) */
@@ -206,7 +208,7 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc,
 }
 
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
-#define I40E_RX_BUFFER_WRITE   16      /* Must be power of 2 */
+#define I40E_RX_BUFFER_WRITE   32      /* Must be power of 2 */
 #define I40E_RX_INCREMENT(r, i) \
        do {                                    \
                (i)++;                          \
@@ -342,6 +344,7 @@ struct i40e_rx_queue_stats {
 enum i40e_ring_state_t {
        __I40E_TX_FDIR_INIT_DONE,
        __I40E_TX_XPS_INIT_DONE,
+       __I40E_RING_STATE_NBITS /* must be last */
 };
 
 /* some useful defines for virtchannel interface, which
@@ -366,7 +369,7 @@ struct i40e_ring {
                struct i40e_tx_buffer *tx_bi;
                struct i40e_rx_buffer *rx_bi;
        };
-       unsigned long state;
+       DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
        u16 queue_index;                /* Queue number of ring */
        u8 dcb_tc;                      /* Traffic class of ring */
        u8 __iomem *tail;
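With the conversion above, ring->state is a DECLARE_BITMAP() array rather than a bare unsigned long, so the bit helpers take it directly (the array decays to unsigned long *), and the debugfs dump earlier in this series prints the raw first word via *ring->state. A minimal sketch (hypothetical helper):

static bool i40e_xps_init_once(struct i40e_ring *ring)
{
        /* true only for the first caller; later calls see the bit set */
        return !test_and_set_bit(__I40E_TX_XPS_INIT_DONE, ring->state);
}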
index 4b32b1d..0410fcb 100644
@@ -46,6 +46,9 @@
 /* Max default timeout in ms, */
 #define I40E_MAX_NVM_TIMEOUT           18000
 
+/* Max timeout in ms for the phy to respond */
+#define I40E_MAX_PHY_TIMEOUT           500
+
 /* Switch from ms to the 1usec global time (this is the GTIME resolution) */
 #define I40E_MS_TO_GTIME(time)         ((time) * 1000)
 
index 0456813..0c4fa22 100644
@@ -273,7 +273,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
        struct i40e_hw *hw = &pf->hw;
        u16 vsi_queue_id, pf_queue_id;
        enum i40e_queue_type qtype;
-       u16 next_q, vector_id;
+       u16 next_q, vector_id, size;
        u32 reg, reg_idx;
        u16 itr_idx = 0;
 
@@ -303,9 +303,11 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
                                     vsi_queue_id + 1));
        }
 
-       next_q = find_first_bit(&linklistmap,
-                               (I40E_MAX_VSI_QP *
-                                I40E_VIRTCHNL_SUPPORTED_QTYPES));
+       size = I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES;
+       next_q = find_first_bit(&linklistmap, size);
+       if (unlikely(next_q == size))
+               goto irq_list_done;
+
        vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES;
        qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES;
        pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id);
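The range check added above guards against an empty bitmap: find_first_bit() and find_next_bit() return the bitmap size when no bit is set, so the result must be validated before it is used as a queue index. The idiom, with placeholder names:

static bool first_set_queue(const unsigned long *map, unsigned int size,
                            unsigned int *idx)
{
        *idx = find_first_bit(map, size);

        /* find_first_bit() returns @size when @map has no bits set */
        return *idx < size;
}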
@@ -313,7 +315,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
 
        wr32(hw, reg_idx, reg);
 
-       while (next_q < (I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES)) {
+       while (next_q < size) {
                switch (qtype) {
                case I40E_QUEUE_TYPE_RX:
                        reg_idx = I40E_QINT_RQCTL(pf_queue_id);
@@ -327,12 +329,8 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id,
                        break;
                }
 
-               next_q = find_next_bit(&linklistmap,
-                                      (I40E_MAX_VSI_QP *
-                                       I40E_VIRTCHNL_SUPPORTED_QTYPES),
-                                      next_q + 1);
-               if (next_q <
-                   (I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES)) {
+               next_q = find_next_bit(&linklistmap, size, next_q + 1);
+               if (next_q < size) {
                        vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES;
                        qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES;
                        pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id,
@@ -639,7 +637,7 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id,
        rx_ctx.dsize = 1;
 
        /* default values */
-       rx_ctx.lrxqthresh = 2;
+       rx_ctx.lrxqthresh = 1;
        rx_ctx.crcstrip = 1;
        rx_ctx.prefena = 1;
        rx_ctx.l2tsel = 1;
@@ -1358,7 +1356,7 @@ err_alloc:
                i40e_free_vfs(pf);
 err_iov:
        /* Re-enable interrupt 0. */
-       i40e_irq_dynamic_enable_icr0(pf, false);
+       i40e_irq_dynamic_enable_icr0(pf);
        return ret;
 }
 
@@ -2883,6 +2881,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
        struct i40e_mac_filter *f;
        struct i40e_vf *vf;
        int ret = 0;
+       struct hlist_node *h;
        int bkt;
 
        /* validate the request */
@@ -2921,7 +2920,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
        /* Delete all the filters for this VSI - we're going to kill it
         * anyway.
         */
-       hash_for_each(vsi->mac_filter_hash, bkt, f, hlist)
+       hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist)
                __i40e_del_filter(vsi, f);
 
        spin_unlock_bh(&vsi->mac_filter_hash_lock);
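hash_for_each_safe() is needed above because the loop body frees the current entry: the _safe variant stashes the next node in the extra hlist_node cursor before the body runs, so the walk never follows a freed pointer. A self-contained sketch of the pattern (types and names are illustrative only):

#include <linux/hashtable.h>
#include <linux/slab.h>

struct item {
        struct hlist_node node;
        int key;
};

static DEFINE_HASHTABLE(items, 4);

static void drop_all_items(void)
{
        struct hlist_node *tmp;
        struct item *it;
        int bkt;

        hash_for_each_safe(items, bkt, tmp, it, node) {
                hash_del(&it->node);
                kfree(it);
        }
}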
index ed5602f..60c892f 100644
@@ -1767,9 +1767,10 @@ enum i40e_aq_phy_type {
        I40E_PHY_TYPE_25GBASE_CR                = 0x20,
        I40E_PHY_TYPE_25GBASE_SR                = 0x21,
        I40E_PHY_TYPE_25GBASE_LR                = 0x22,
+       I40E_PHY_TYPE_MAX,
+       I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP   = 0xFD,
        I40E_PHY_TYPE_EMPTY                     = 0xFE,
        I40E_PHY_TYPE_DEFAULT                   = 0xFF,
-       I40E_PHY_TYPE_MAX
 };
 
 #define I40E_LINK_SPEED_100MB_SHIFT    0x1
index 37e1de8..6806ada 100644
@@ -711,6 +711,15 @@ bool i40evf_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
        union i40e_rx_desc *rx_desc;
        struct i40e_rx_buffer *bi;
 
+       /* Hardware only fetches new descriptors in cache lines of 8,
+        * essentially ignoring the lower 3 bits of the tail register. We want
+        * to ensure our tail writes are aligned to avoid unnecessary work. We
+        * can't simply round down the cleaned count, since we might fail to
+        * allocate some buffers. What we really want is to ensure that
+        * next_to_use + cleaned_count produces an aligned value.
+        */
+       cleaned_count -= (ntu + cleaned_count) & 0x7;
+
        /* do nothing if no valid netdev defined */
        if (!rx_ring->netdev || !cleaned_count)
                return false;
@@ -1409,9 +1418,7 @@ static u32 i40e_buildreg_itr(const int type, const u16 itr)
        u32 val;
 
        val = I40E_VFINT_DYN_CTLN1_INTENA_MASK |
-             /* Don't clear PBA because that can cause lost interrupts that
-              * came in while we were cleaning/polling
-              */
+             I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK |
              (type << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) |
              (itr << I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT);
 
index 0d9f98b..8d26c85 100644
 #define I40E_ITR_8K                0x003E
 #define I40E_ITR_4K                0x007A
 #define I40E_MAX_INTRL             0x3B    /* reg uses 4 usec resolution */
-#define I40E_ITR_RX_DEF            I40E_ITR_20K
-#define I40E_ITR_TX_DEF            I40E_ITR_20K
+#define I40E_ITR_RX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
+#define I40E_ITR_TX_DEF            (ITR_REG_TO_USEC(I40E_ITR_20K) | \
+                                   I40E_ITR_DYNAMIC)
 #define I40E_ITR_DYNAMIC           0x8000  /* use top bit as a flag */
 #define I40E_MIN_INT_RATE          250     /* ~= 1000000 / (I40E_MAX_ITR * 2) */
 #define I40E_MAX_INT_RATE          500000  /* == 1000000 / (I40E_MIN_ITR * 2) */
@@ -189,7 +191,7 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc,
 }
 
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
-#define I40E_RX_BUFFER_WRITE   16      /* Must be power of 2 */
+#define I40E_RX_BUFFER_WRITE   32      /* Must be power of 2 */
 #define I40E_RX_INCREMENT(r, i) \
        do {                                    \
                (i)++;                          \
@@ -325,6 +327,7 @@ struct i40e_rx_queue_stats {
 enum i40e_ring_state_t {
        __I40E_TX_FDIR_INIT_DONE,
        __I40E_TX_XPS_INIT_DONE,
+       __I40E_RING_STATE_NBITS /* must be last */
 };
 
 /* some useful defines for virtchannel interface, which
@@ -348,7 +351,7 @@ struct i40e_ring {
                struct i40e_tx_buffer *tx_bi;
                struct i40e_rx_buffer *rx_bi;
        };
-       unsigned long state;
+       DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
        u16 queue_index;                /* Queue number of ring */
        u8 dcb_tc;                      /* Traffic class of ring */
        u8 __iomem *tail;
index 9364b67..213b773 100644
@@ -46,6 +46,9 @@
 /* Max default timeout in ms, */
 #define I40E_MAX_NVM_TIMEOUT           18000
 
+/* Max timeout in ms for the phy to respond */
+#define I40E_MAX_PHY_TIMEOUT           500
+
 /* Switch from ms to the 1usec global time (this is the GTIME resolution) */
 #define I40E_MS_TO_GTIME(time)         ((time) * 1000)
 
index 5982362..de0af52 100644
@@ -222,22 +222,22 @@ struct i40evf_adapter {
 
        u32 flags;
 #define I40EVF_FLAG_RX_CSUM_ENABLED            BIT(0)
-#define I40EVF_FLAG_IMIR_ENABLED               BIT(5)
-#define I40EVF_FLAG_MQ_CAPABLE                 BIT(6)
-#define I40EVF_FLAG_PF_COMMS_FAILED            BIT(8)
-#define I40EVF_FLAG_RESET_PENDING              BIT(9)
-#define I40EVF_FLAG_RESET_NEEDED               BIT(10)
-#define I40EVF_FLAG_WB_ON_ITR_CAPABLE          BIT(11)
-#define I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE     BIT(12)
-#define I40EVF_FLAG_ADDR_SET_BY_PF             BIT(13)
-#define I40EVF_FLAG_SERVICE_CLIENT_REQUESTED   BIT(14)
-#define I40EVF_FLAG_CLIENT_NEEDS_OPEN          BIT(15)
-#define I40EVF_FLAG_CLIENT_NEEDS_CLOSE         BIT(16)
-#define I40EVF_FLAG_CLIENT_NEEDS_L2_PARAMS     BIT(17)
-#define I40EVF_FLAG_PROMISC_ON                 BIT(18)
-#define I40EVF_FLAG_ALLMULTI_ON                        BIT(19)
-#define I40EVF_FLAG_LEGACY_RX                  BIT(20)
-#define I40EVF_FLAG_REINIT_ITR_NEEDED          BIT(21)
+#define I40EVF_FLAG_IMIR_ENABLED               BIT(1)
+#define I40EVF_FLAG_MQ_CAPABLE                 BIT(2)
+#define I40EVF_FLAG_PF_COMMS_FAILED            BIT(3)
+#define I40EVF_FLAG_RESET_PENDING              BIT(4)
+#define I40EVF_FLAG_RESET_NEEDED               BIT(5)
+#define I40EVF_FLAG_WB_ON_ITR_CAPABLE          BIT(6)
+#define I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE     BIT(7)
+#define I40EVF_FLAG_ADDR_SET_BY_PF             BIT(8)
+#define I40EVF_FLAG_SERVICE_CLIENT_REQUESTED   BIT(9)
+#define I40EVF_FLAG_CLIENT_NEEDS_OPEN          BIT(10)
+#define I40EVF_FLAG_CLIENT_NEEDS_CLOSE         BIT(11)
+#define I40EVF_FLAG_CLIENT_NEEDS_L2_PARAMS     BIT(12)
+#define I40EVF_FLAG_PROMISC_ON                 BIT(13)
+#define I40EVF_FLAG_ALLMULTI_ON                        BIT(14)
+#define I40EVF_FLAG_LEGACY_RX                  BIT(15)
+#define I40EVF_FLAG_REINIT_ITR_NEEDED          BIT(16)
 /* duplicates for common code */
 #define I40E_FLAG_DCB_ENABLED                  0
 #define I40E_FLAG_RX_CSUM_ENABLED              I40EVF_FLAG_RX_CSUM_ENABLED
index f2f1e75..5bcbd46 100644
@@ -515,6 +515,7 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename)
        unsigned int vector, q_vectors;
        unsigned int rx_int_idx = 0, tx_int_idx = 0;
        int irq_num, err;
+       int cpu;
 
        i40evf_irq_disable(adapter);
        /* Decrement for Other and TCP Timer vectors */
@@ -553,10 +554,12 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename)
                q_vector->affinity_notify.release =
                                                   i40evf_irq_affinity_release;
                irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify);
-               /* get_cpu_mask returns a static constant mask with
-                * a permanent lifetime so it's ok to use here.
+               /* Spread the IRQ affinity hints across online CPUs. Note that
+                * get_cpu_mask returns a mask with a permanent lifetime so
+                * it's safe to use as a hint for irq_set_affinity_hint.
                 */
-               irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx));
+               cpu = cpumask_local_spread(q_vector->v_idx, -1);
+               irq_set_affinity_hint(irq_num, get_cpu_mask(cpu));
        }
 
        return 0;
@@ -877,6 +880,8 @@ i40evf_mac_filter *i40evf_add_filter(struct i40evf_adapter *adapter,
                list_add_tail(&f->list, &adapter->mac_filter_list);
                f->add = true;
                adapter->aq_required |= I40EVF_FLAG_AQ_ADD_MAC_FILTER;
+       } else {
+               f->remove = false;
        }
 
        clear_bit(__I40EVF_IN_CRITICAL_TASK, &adapter->crit_section);
@@ -1218,7 +1223,7 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter)
                tx_ring->netdev = adapter->netdev;
                tx_ring->dev = &adapter->pdev->dev;
                tx_ring->count = adapter->tx_desc_count;
-               tx_ring->tx_itr_setting = (I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF);
+               tx_ring->tx_itr_setting = I40E_ITR_TX_DEF;
                if (adapter->flags & I40EVF_FLAG_WB_ON_ITR_CAPABLE)
                        tx_ring->flags |= I40E_TXR_FLAGS_WB_ON_ITR;
 
@@ -1227,7 +1232,7 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter)
                rx_ring->netdev = adapter->netdev;
                rx_ring->dev = &adapter->pdev->dev;
                rx_ring->count = adapter->rx_desc_count;
-               rx_ring->rx_itr_setting = (I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF);
+               rx_ring->rx_itr_setting = I40E_ITR_RX_DEF;
        }
 
        adapter->num_active_queues = num_active_queues;
@@ -2420,10 +2425,6 @@ out_err:
        return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
 }
 
-#define I40EVF_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_TX |\
-                             NETIF_F_HW_VLAN_CTAG_RX |\
-                             NETIF_F_HW_VLAN_CTAG_FILTER)
-
 /**
  * i40evf_fix_features - fix up the netdev feature bits
  * @netdev: our net device
@@ -2436,9 +2437,11 @@ static netdev_features_t i40evf_fix_features(struct net_device *netdev,
 {
        struct i40evf_adapter *adapter = netdev_priv(netdev);
 
-       features &= ~I40EVF_VLAN_FEATURES;
-       if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
-               features |= I40EVF_VLAN_FEATURES;
+       if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN))
+               features &= ~(NETIF_F_HW_VLAN_CTAG_TX |
+                             NETIF_F_HW_VLAN_CTAG_RX |
+                             NETIF_F_HW_VLAN_CTAG_FILTER);
+
        return features;
 }
 
@@ -2569,9 +2572,17 @@ int i40evf_process_config(struct i40evf_adapter *adapter)
         */
        hw_features = hw_enc_features;
 
+       /* Enable VLAN features if supported */
+       if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
+               hw_features |= (NETIF_F_HW_VLAN_CTAG_TX |
+                               NETIF_F_HW_VLAN_CTAG_RX);
+
        netdev->hw_features |= hw_features;
 
-       netdev->features |= hw_features | I40EVF_VLAN_FEATURES;
+       netdev->features |= hw_features;
+
+       if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)
+               netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
 
        adapter->vsi.id = adapter->vsi_res->vsi_id;
 
index fd4a46b..837d9b4 100644
@@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter)
        /* Setup and initialize a copy of the hw vlan table array */
        adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32),
                                       GFP_ATOMIC);
+       if (!adapter->shadow_vfta)
+               return -ENOMEM;
 
        /* This call may decrease the number of queues */
        if (igb_init_interrupt_scheme(adapter, true)) {
index dd55787..468c355 100644
@@ -275,6 +275,7 @@ struct ixgbe_rx_queue_stats {
        u64 rsc_count;
        u64 rsc_flush;
        u64 non_eop_descs;
+       u64 alloc_rx_page;
        u64 alloc_rx_page_failed;
        u64 alloc_rx_buff_failed;
        u64 csum_err;
@@ -434,8 +435,15 @@ static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring)
 }
 #define ixgbe_rx_pg_size(_ring) (PAGE_SIZE << ixgbe_rx_pg_order(_ring))
 
+#define IXGBE_ITR_ADAPTIVE_MIN_INC     2
+#define IXGBE_ITR_ADAPTIVE_MIN_USECS   10
+#define IXGBE_ITR_ADAPTIVE_MAX_USECS   126
+#define IXGBE_ITR_ADAPTIVE_LATENCY     0x80
+#define IXGBE_ITR_ADAPTIVE_BULK                0x00
+
 struct ixgbe_ring_container {
        struct ixgbe_ring *ring;        /* pointer to linked list of rings */
+       unsigned long next_update;      /* jiffies value of last update */
        unsigned int total_bytes;       /* total bytes processed this int */
        unsigned int total_packets;     /* total packets processed this int */
        u16 work_limit;                 /* total work allowed per interrupt */
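next_update added above holds a jiffies deadline (set to jiffies + 1 when a ring is attached, in the ixgbe_add_ring() hunk below); the reworked adaptive ITR code at the end of this series only trusts the accumulated byte/packet counters when that deadline has not yet passed, using the wrap-safe jiffies comparison. A sketch (hypothetical helper):

static bool ixgbe_itr_sample_stale(const struct ixgbe_ring_container *rc)
{
        /* true when more than a jiffy or two passed since the last update */
        return time_after(jiffies, rc->next_update);
}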
@@ -655,6 +663,7 @@ struct ixgbe_adapter {
        u64 rsc_total_count;
        u64 rsc_total_flush;
        u64 non_eop_descs;
+       u32 alloc_rx_page;
        u32 alloc_rx_page_failed;
        u32 alloc_rx_buff_failed;
 
index 523f9d0..8a32eb7 100644
@@ -175,31 +175,9 @@ static s32 ixgbe_init_phy_ops_82598(struct ixgbe_hw *hw)
  **/
 static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw)
 {
-#ifndef CONFIG_SPARC
-       u32 regval;
-       u32 i;
-#endif
        s32 ret_val;
 
        ret_val = ixgbe_start_hw_generic(hw);
-
-#ifndef CONFIG_SPARC
-       /* Disable relaxed ordering */
-       for (i = 0; ((i < hw->mac.max_tx_queues) &&
-            (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
-               regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
-       }
-
-       for (i = 0; ((i < hw->mac.max_rx_queues) &&
-            (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-               regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-                           IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
-       }
-#endif
        if (ret_val)
                return ret_val;
 
index 2c19070..9bef255 100644
@@ -366,25 +366,6 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
        }
        IXGBE_WRITE_FLUSH(hw);
 
-#ifndef CONFIG_ARCH_WANT_RELAX_ORDER
-       /* Disable relaxed ordering */
-       for (i = 0; i < hw->mac.max_tx_queues; i++) {
-               u32 regval;
-
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
-               regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
-       }
-
-       for (i = 0; i < hw->mac.max_rx_queues; i++) {
-               u32 regval;
-
-               regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-               regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-                           IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-               IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
-       }
-#endif
        return 0;
 }
 
@@ -3800,10 +3781,10 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 maj, u8 min,
        fw_cmd.ver_build = build;
        fw_cmd.ver_sub = sub;
        fw_cmd.hdr.checksum = 0;
-       fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd,
-                               (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len));
        fw_cmd.pad = 0;
        fw_cmd.pad2 = 0;
+       fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd,
+                               (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len));
 
        for (i = 0; i <= FW_CEM_MAX_RETRIES; i++) {
                ret_val = ixgbe_host_interface_command(hw, &fw_cmd,
@@ -4100,8 +4081,8 @@ bool ixgbe_mng_present(struct ixgbe_hw *hw)
                return false;
 
        fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw));
-       fwsm &= IXGBE_FWSM_MODE_MASK;
-       return fwsm == IXGBE_FWSM_FW_MODE_PT;
+
+       return !!(fwsm & IXGBE_FWSM_FW_MODE_PT);
 }
 
 /**
index 72c5657..0aad1c2 100644
@@ -104,6 +104,7 @@ static const struct ixgbe_stats ixgbe_gstrings_stats[] = {
        {"tx_flow_control_xoff", IXGBE_STAT(stats.lxofftxc)},
        {"rx_flow_control_xoff", IXGBE_STAT(stats.lxoffrxc)},
        {"rx_csum_offload_errors", IXGBE_STAT(hw_csum_rx_error)},
+       {"alloc_rx_page", IXGBE_STAT(alloc_rx_page)},
        {"alloc_rx_page_failed", IXGBE_STAT(alloc_rx_page_failed)},
        {"alloc_rx_buff_failed", IXGBE_STAT(alloc_rx_buff_failed)},
        {"rx_no_dma_resources", IXGBE_STAT(hw_rx_no_dma_resources)},
@@ -1048,7 +1049,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
 {
        struct ixgbe_adapter *adapter = netdev_priv(netdev);
        struct ixgbe_ring *temp_ring;
-       int i, err = 0;
+       int i, j, err = 0;
        u32 new_rx_count, new_tx_count;
 
        if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending))
@@ -1085,8 +1086,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
        }
 
        /* allocate temporary buffer to store rings in */
-       i = max_t(int, adapter->num_tx_queues, adapter->num_rx_queues);
-       i = max_t(int, i, adapter->num_xdp_queues);
+       i = max_t(int, adapter->num_tx_queues + adapter->num_xdp_queues,
+                 adapter->num_rx_queues);
        temp_ring = vmalloc(i * sizeof(struct ixgbe_ring));
 
        if (!temp_ring) {
@@ -1118,8 +1119,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
                        }
                }
 
-               for (i = 0; i < adapter->num_xdp_queues; i++) {
-                       memcpy(&temp_ring[i], adapter->xdp_ring[i],
+               for (j = 0; j < adapter->num_xdp_queues; j++, i++) {
+                       memcpy(&temp_ring[i], adapter->xdp_ring[j],
                               sizeof(struct ixgbe_ring));
 
                        temp_ring[i].count = new_tx_count;
@@ -1139,10 +1140,10 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
                        memcpy(adapter->tx_ring[i], &temp_ring[i],
                               sizeof(struct ixgbe_ring));
                }
-               for (i = 0; i < adapter->num_xdp_queues; i++) {
-                       ixgbe_free_tx_resources(adapter->xdp_ring[i]);
+               for (j = 0; j < adapter->num_xdp_queues; j++, i++) {
+                       ixgbe_free_tx_resources(adapter->xdp_ring[j]);
 
-                       memcpy(adapter->xdp_ring[i], &temp_ring[i],
+                       memcpy(adapter->xdp_ring[j], &temp_ring[i],
                               sizeof(struct ixgbe_ring));
                }
 
@@ -1916,8 +1917,6 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
                                  unsigned int size)
 {
        union ixgbe_adv_rx_desc *rx_desc;
-       struct ixgbe_rx_buffer *rx_buffer;
-       struct ixgbe_tx_buffer *tx_buffer;
        u16 rx_ntc, tx_ntc, count = 0;
 
        /* initialize next to clean and descriptor values */
@@ -1925,7 +1924,38 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
        tx_ntc = tx_ring->next_to_clean;
        rx_desc = IXGBE_RX_DESC(rx_ring, rx_ntc);
 
+       while (tx_ntc != tx_ring->next_to_use) {
+               union ixgbe_adv_tx_desc *tx_desc;
+               struct ixgbe_tx_buffer *tx_buffer;
+
+               tx_desc = IXGBE_TX_DESC(tx_ring, tx_ntc);
+
+               /* if DD is not set transmit has not completed */
+               if (!(tx_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD)))
+                       return count;
+
+               /* unmap buffer on Tx side */
+               tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
+
+               /* Free all the Tx ring sk_buffs */
+               dev_kfree_skb_any(tx_buffer->skb);
+
+               /* unmap skb header data */
+               dma_unmap_single(tx_ring->dev,
+                                dma_unmap_addr(tx_buffer, dma),
+                                dma_unmap_len(tx_buffer, len),
+                                DMA_TO_DEVICE);
+               dma_unmap_len_set(tx_buffer, len, 0);
+
+               /* increment Tx next to clean counter */
+               tx_ntc++;
+               if (tx_ntc == tx_ring->count)
+                       tx_ntc = 0;
+       }
+
        while (rx_desc->wb.upper.length) {
+               struct ixgbe_rx_buffer *rx_buffer;
+
                /* check Rx buffer */
                rx_buffer = &rx_ring->rx_buffer_info[rx_ntc];
 
@@ -1938,6 +1968,8 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
                /* verify contents of skb */
                if (ixgbe_check_lbtest_frame(rx_buffer, size))
                        count++;
+               else
+                       break;
 
                /* sync Rx buffer for device write */
                dma_sync_single_for_device(rx_ring->dev,
@@ -1945,26 +1977,10 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring,
                                           ixgbe_rx_bufsz(rx_ring),
                                           DMA_FROM_DEVICE);
 
-               /* unmap buffer on Tx side */
-               tx_buffer = &tx_ring->tx_buffer_info[tx_ntc];
-
-               /* Free all the Tx ring sk_buffs */
-               dev_kfree_skb_any(tx_buffer->skb);
-
-               /* unmap skb header data */
-               dma_unmap_single(tx_ring->dev,
-                                dma_unmap_addr(tx_buffer, dma),
-                                dma_unmap_len(tx_buffer, len),
-                                DMA_TO_DEVICE);
-               dma_unmap_len_set(tx_buffer, len, 0);
-
-               /* increment Rx/Tx next to clean counters */
+               /* increment Rx next to clean counter */
                rx_ntc++;
                if (rx_ntc == rx_ring->count)
                        rx_ntc = 0;
-               tx_ntc++;
-               if (tx_ntc == tx_ring->count)
-                       tx_ntc = 0;
 
                /* fetch next descriptor */
                rx_desc = IXGBE_RX_DESC(rx_ring, rx_ntc);
index f1bfae0..8e2a957 100644
@@ -806,6 +806,7 @@ static void ixgbe_add_ring(struct ixgbe_ring *ring,
        ring->next = head->ring;
        head->ring = ring;
        head->count++;
+       head->next_update = jiffies + 1;
 }
 
 /**
@@ -879,8 +880,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
        /* initialize work limits */
        q_vector->tx.work_limit = adapter->tx_work_limit;
 
-       /* initialize pointer to rings */
-       ring = q_vector->ring;
+       /* Initialize setting for adaptive ITR */
+       q_vector->tx.itr = IXGBE_ITR_ADAPTIVE_MAX_USECS |
+                          IXGBE_ITR_ADAPTIVE_LATENCY;
+       q_vector->rx.itr = IXGBE_ITR_ADAPTIVE_MAX_USECS |
+                          IXGBE_ITR_ADAPTIVE_LATENCY;
 
        /* initialize ITR */
        if (txr_count && !rxr_count) {
@@ -897,6 +901,9 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
                        q_vector->itr = adapter->rx_itr_setting;
        }
 
+       /* initialize pointer to rings */
+       ring = q_vector->ring;
+
        while (txr_count) {
                /* assign generic ring traits */
                ring->dev = &adapter->pdev->dev;
index 3942c62..7683c14 100644 (file)
@@ -1620,6 +1620,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring,
        bi->page = page;
        bi->page_offset = ixgbe_rx_offset(rx_ring);
        bi->pagecnt_bias = 1;
+       rx_ring->rx_stats.alloc_rx_page++;
 
        return true;
 }
@@ -2539,50 +2540,174 @@ enum latency_range {
 static void ixgbe_update_itr(struct ixgbe_q_vector *q_vector,
                             struct ixgbe_ring_container *ring_container)
 {
-       int bytes = ring_container->total_bytes;
-       int packets = ring_container->total_packets;
-       u32 timepassed_us;
-       u64 bytes_perint;
-       u8 itr_setting = ring_container->itr;
+       unsigned int itr = IXGBE_ITR_ADAPTIVE_MIN_USECS |
+                          IXGBE_ITR_ADAPTIVE_LATENCY;
+       unsigned int avg_wire_size, packets, bytes;
+       unsigned long next_update = jiffies;
 
-       if (packets == 0)
+       /* If we don't have any rings just leave ourselves set for maximum
+        * possible latency so we take ourselves out of the equation.
+        */
+       if (!ring_container->ring)
                return;
 
-       /* simple throttlerate management
-        *   0-10MB/s   lowest (100000 ints/s)
-        *  10-20MB/s   low    (20000 ints/s)
-        *  20-1249MB/s bulk   (12000 ints/s)
+       /* If we didn't update within up to 1 - 2 jiffies we can assume
+        * that either packets are coming in so slowly that there hasn't been
+        * any work, or that there is so much work that NAPI is dealing
+        * with interrupt moderation and we don't need to do anything.
         */
-       /* what was last interrupt timeslice? */
-       timepassed_us = q_vector->itr >> 2;
-       if (timepassed_us == 0)
-               return;
+       if (time_after(next_update, ring_container->next_update))
+               goto clear_counts;
 
-       bytes_perint = bytes / timepassed_us; /* bytes/usec */
+       packets = ring_container->total_packets;
 
-       switch (itr_setting) {
-       case lowest_latency:
-               if (bytes_perint > 10)
-                       itr_setting = low_latency;
-               break;
-       case low_latency:
-               if (bytes_perint > 20)
-                       itr_setting = bulk_latency;
-               else if (bytes_perint <= 10)
-                       itr_setting = lowest_latency;
+       /* We have no packets to actually measure against. This means
+        * either one of the other queues on this vector is active or
+        * we are a Tx queue doing TSO with too high of an interrupt rate.
+        *
+        * When this occurs just tick up our delay by the minimum value
+        * and hope that this extra delay will prevent us from being called
+        * without any work on our queue.
+        */
+       if (!packets) {
+               itr = (q_vector->itr >> 2) + IXGBE_ITR_ADAPTIVE_MIN_INC;
+               if (itr > IXGBE_ITR_ADAPTIVE_MAX_USECS)
+                       itr = IXGBE_ITR_ADAPTIVE_MAX_USECS;
+               itr += ring_container->itr & IXGBE_ITR_ADAPTIVE_LATENCY;
+               goto clear_counts;
+       }
+
+       bytes = ring_container->total_bytes;
+
+       /* If packets are less than 4 and bytes are less than 9000, assume
+        * insufficient data to use bulk rate limiting approach. We are
+        * likely latency driven.
+        */
+       if (packets < 4 && bytes < 9000) {
+               itr = IXGBE_ITR_ADAPTIVE_LATENCY;
+               goto adjust_by_size;
+       }
+
+       /* Between 4 and 48 we can assume that our current interrupt delay
+        * is only slightly too low. As such we should increase it by a small
+        * fixed amount.
+        */
+       if (packets < 48) {
+               itr = (q_vector->itr >> 2) + IXGBE_ITR_ADAPTIVE_MIN_INC;
+               if (itr > IXGBE_ITR_ADAPTIVE_MAX_USECS)
+                       itr = IXGBE_ITR_ADAPTIVE_MAX_USECS;
+               goto clear_counts;
+       }
+
+       /* Between 48 and 96 is our "goldilocks" zone where we are working
+        * out "just right". Just report that our current ITR is good for us.
+        */
+       if (packets < 96) {
+               itr = q_vector->itr >> 2;
+               goto clear_counts;
+       }
+
+       /* If packet count is 96 or greater we are likely looking at a slight
+        * overrun of the delay we want. Try halving our delay to see if that
+        * will cut the number of packets in half per interrupt.
+        */
+       if (packets < 256) {
+               itr = q_vector->itr >> 3;
+               if (itr < IXGBE_ITR_ADAPTIVE_MIN_USECS)
+                       itr = IXGBE_ITR_ADAPTIVE_MIN_USECS;
+               goto clear_counts;
+       }
+
+       /* The paths below assume we are dealing with a bulk ITR since number
+        * of packets is 256 or greater. We are just going to have to compute
+        * a value and try to bring the count under control, though for smaller
+        * packet sizes there isn't much we can do as NAPI polling will likely
+        * be kicking in sooner rather than later.
+        */
+       itr = IXGBE_ITR_ADAPTIVE_BULK;
+
+adjust_by_size:
+       /* If packet counts are 256 or greater we can assume we have a gross
+        * overestimation of what the rate should be. Instead of trying to fine
+        * tune it, just use the formula below to try and dial in an exact value
+        * given the current packet size of the frame.
+        */
+       avg_wire_size = bytes / packets;
+
+       /* The following is a crude approximation of:
+        *  wmem_default / (size + overhead) = desired_pkts_per_int
+        *  rate / bits_per_byte / (size + ethernet overhead) = pkt_rate
+        *  (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value
+        *
+        * Assuming wmem_default is 212992 and overhead is 640 bytes per
+        * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the
+        * formula down to
+        *
+        *  (170 * (size + 24)) / (size + 640) = ITR
+        *
+        * We first do some math on the packet size and then finally bitshift
+        * by 8 after rounding up. We also have to account for PCIe link speed
+        * difference as ITR scales based on this.
+        */
+       if (avg_wire_size <= 60) {
+               /* Start at 50k ints/sec */
+               avg_wire_size = 5120;
+       } else if (avg_wire_size <= 316) {
+               /* 50K ints/sec to 16K ints/sec */
+               avg_wire_size *= 40;
+               avg_wire_size += 2720;
+       } else if (avg_wire_size <= 1084) {
+               /* 16K ints/sec to 9.2K ints/sec */
+               avg_wire_size *= 15;
+               avg_wire_size += 11452;
+       } else if (avg_wire_size <= 1980) {
+               /* 9.2K ints/sec to 8K ints/sec */
+               avg_wire_size *= 5;
+               avg_wire_size += 22420;
+       } else {
+               /* plateau at a limit of 8K ints/sec */
+               avg_wire_size = 32256;
+       }
+
+       /* If we are in low latency mode, halve our delay, which doubles the rate
+        * to somewhere between 100K and 16K ints/sec
+        */
+       if (itr & IXGBE_ITR_ADAPTIVE_LATENCY)
+               avg_wire_size >>= 1;
+
+       /* Resultant value is 256 times larger than it needs to be. This
+        * gives us room to adjust the value as needed to either increase
+        * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc.
+        *
+        * Use addition as we have already recorded the new latency flag
+        * for the ITR value.
+        */
+       switch (q_vector->adapter->link_speed) {
+       case IXGBE_LINK_SPEED_10GB_FULL:
+       case IXGBE_LINK_SPEED_100_FULL:
+       default:
+               itr += DIV_ROUND_UP(avg_wire_size,
+                                   IXGBE_ITR_ADAPTIVE_MIN_INC * 256) *
+                      IXGBE_ITR_ADAPTIVE_MIN_INC;
                break;
-       case bulk_latency:
-               if (bytes_perint <= 20)
-                       itr_setting = low_latency;
+       case IXGBE_LINK_SPEED_2_5GB_FULL:
+       case IXGBE_LINK_SPEED_1GB_FULL:
+       case IXGBE_LINK_SPEED_10_FULL:
+               itr += DIV_ROUND_UP(avg_wire_size,
+                                   IXGBE_ITR_ADAPTIVE_MIN_INC * 64) *
+                      IXGBE_ITR_ADAPTIVE_MIN_INC;
                break;
        }
 
-       /* clear work counters since we have the values we need */
+clear_counts:
+       /* write back value */
+       ring_container->itr = itr;
+
+       /* next update should occur within next jiffy */
+       ring_container->next_update = next_update + 1;
+
        ring_container->total_bytes = 0;
        ring_container->total_packets = 0;
-
-       /* write updated itr to ring container */
-       ring_container->itr = itr_setting;
 }
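
Putting the piecewise table above into numbers: the branches approximate 256 times the (170 * (size + 24)) / (size + 640) formula from the comment. Below is a hedged, stand-alone sketch of that approximation, scaled the way the 10G/100M branch above does (divide by 256, rounded up to a multiple of the minimum increment, assumed here to be 2, with no latency halving). The helper name and constants are illustrative, not the driver's API.

#include <stdio.h>

/* Hypothetical helper mirroring the piecewise approximation above;
 * the constants are copied from the table, the final scaling assumes
 * the 10G branch (result is 256x too large, minimum increment of 2).
 */
static unsigned int approx_itr_usecs(unsigned int avg_wire_size)
{
	unsigned int v;

	if (avg_wire_size <= 60)
		v = 5120;				/* ~50K ints/sec */
	else if (avg_wire_size <= 316)
		v = avg_wire_size * 40 + 2720;		/* 50K..16K ints/sec */
	else if (avg_wire_size <= 1084)
		v = avg_wire_size * 15 + 11452;		/* 16K..9.2K ints/sec */
	else if (avg_wire_size <= 1980)
		v = avg_wire_size * 5 + 22420;		/* 9.2K..8K ints/sec */
	else
		v = 32256;				/* plateau at 8K ints/sec */

	/* DIV_ROUND_UP(v, 2 * 256) * 2, written out for userspace */
	return ((v + 2 * 256 - 1) / (2 * 256)) * 2;
}

int main(void)
{
	/* 1500-byte frames land at ~118 usecs, i.e. roughly 8.5K ints/sec */
	printf("%u usecs\n", approx_itr_usecs(1500));
	return 0;
}
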
 
 /**
@@ -2624,34 +2749,19 @@ void ixgbe_write_eitr(struct ixgbe_q_vector *q_vector)
 
 static void ixgbe_set_itr(struct ixgbe_q_vector *q_vector)
 {
-       u32 new_itr = q_vector->itr;
-       u8 current_itr;
+       u32 new_itr;
 
        ixgbe_update_itr(q_vector, &q_vector->tx);
        ixgbe_update_itr(q_vector, &q_vector->rx);
 
-       current_itr = max(q_vector->rx.itr, q_vector->tx.itr);
+       /* use the smallest value of new ITR delay calculations */
+       new_itr = min(q_vector->rx.itr, q_vector->tx.itr);
 
-       switch (current_itr) {
-       /* counts and packets in update_itr are dependent on these numbers */
-       case lowest_latency:
-               new_itr = IXGBE_100K_ITR;
-               break;
-       case low_latency:
-               new_itr = IXGBE_20K_ITR;
-               break;
-       case bulk_latency:
-               new_itr = IXGBE_12K_ITR;
-               break;
-       default:
-               break;
-       }
+       /* Clear latency flag if set, shift into correct position */
+       new_itr &= ~IXGBE_ITR_ADAPTIVE_LATENCY;
+       new_itr <<= 2;
 
        if (new_itr != q_vector->itr) {
-               /* do an exponential smoothing */
-               new_itr = (10 * new_itr * q_vector->itr) /
-                         ((9 * new_itr) + q_vector->itr);
-
                /* save the algorithm value here */
                q_vector->itr = new_itr;
 
@@ -4904,7 +5014,7 @@ static void ixgbe_clear_udp_tunnel_port(struct ixgbe_adapter *adapter, u32 mask)
                                IXGBE_FLAG_GENEVE_OFFLOAD_CAPABLE)))
                return;
 
-       vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) && ~mask;
+       vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) & ~mask;
        IXGBE_WRITE_REG(hw, IXGBE_VXLANCTRL, vxlanctrl);
 
        if (mask & IXGBE_VXLANCTRL_VXLAN_UDPPORT_MASK)
@@ -6794,6 +6904,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
        u32 i, missed_rx = 0, mpc, bprc, lxon, lxoff, xon_off_tot;
        u64 non_eop_descs = 0, restart_queue = 0, tx_busy = 0;
        u64 alloc_rx_page_failed = 0, alloc_rx_buff_failed = 0;
+       u64 alloc_rx_page = 0;
        u64 bytes = 0, packets = 0, hw_csum_rx_error = 0;
 
        if (test_bit(__IXGBE_DOWN, &adapter->state) ||
@@ -6814,6 +6925,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
        for (i = 0; i < adapter->num_rx_queues; i++) {
                struct ixgbe_ring *rx_ring = adapter->rx_ring[i];
                non_eop_descs += rx_ring->rx_stats.non_eop_descs;
+               alloc_rx_page += rx_ring->rx_stats.alloc_rx_page;
                alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed;
                alloc_rx_buff_failed += rx_ring->rx_stats.alloc_rx_buff_failed;
                hw_csum_rx_error += rx_ring->rx_stats.csum_err;
@@ -6821,6 +6933,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
                packets += rx_ring->stats.packets;
        }
        adapter->non_eop_descs = non_eop_descs;
+       adapter->alloc_rx_page = alloc_rx_page;
        adapter->alloc_rx_page_failed = alloc_rx_page_failed;
        adapter->alloc_rx_buff_failed = alloc_rx_buff_failed;
        adapter->hw_csum_rx_error = hw_csum_rx_error;
@@ -8552,6 +8665,10 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd)
                return ixgbe_ptp_set_ts_config(adapter, req);
        case SIOCGHWTSTAMP:
                return ixgbe_ptp_get_ts_config(adapter, req);
+       case SIOCGMIIPHY:
+               if (!adapter->hw.phy.ops.read_reg)
+                       return -EOPNOTSUPP;
+               /* fall through */
        default:
                return mdio_mii_ioctl(&adapter->hw.phy.mdio, if_mii(req), cmd);
        }
@@ -9758,6 +9875,17 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
        limit = find_last_bit(&adapter->fwd_bitmask, 32);
        adapter->ring_feature[RING_F_VMDQ].limit = limit + 1;
        ixgbe_fwd_ring_down(fwd_adapter->netdev, fwd_adapter);
+
+       /* go back to full RSS if we're done with our VMQs */
+       if (adapter->ring_feature[RING_F_VMDQ].limit == 1) {
+               int rss = min_t(int, ixgbe_max_rss_indices(adapter),
+                               num_online_cpus());
+
+               adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED;
+               adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED;
+               adapter->ring_feature[RING_F_RSS].limit = rss;
+       }
+
        ixgbe_setup_tc(pdev, netdev_get_num_tc(pdev));
        netdev_dbg(pdev, "pool %i:%i queues %i:%i VSI bitmask %lx\n",
                   fwd_adapter->pool, adapter->num_rx_pools,
@@ -10737,6 +10865,9 @@ skip_bad_vf_detection:
        if (!test_bit(__IXGBE_SERVICE_INITED, &adapter->state))
                return PCI_ERS_RESULT_DISCONNECT;
 
+       if (!netif_device_present(netdev))
+               return PCI_ERS_RESULT_DISCONNECT;
+
        rtnl_lock();
        netif_device_detach(netdev);
 
index 6ea0d6a..b8c5fd2 100644 (file)
@@ -619,12 +619,6 @@ s32 ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask)
                usleep_range(5000, 10000);
        }
 
-       /* Failed to get SW only semaphore */
-       if (swmask == IXGBE_GSSR_SW_MNG_SM) {
-               hw_dbg(hw, "Failed to get SW only semaphore\n");
-               return IXGBE_ERR_SWFW_SYNC;
-       }
-
        /* If the resource is not released by the FW/HW the SW can assume that
         * the FW/HW malfunctions. In that case the SW should set the SW bit(s)
         * of the requested resource(s) while ignoring the corresponding FW/HW
@@ -647,7 +641,8 @@ s32 ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask)
         */
        if (swfw_sync & swmask) {
                u32 rmask = IXGBE_GSSR_EEP_SM | IXGBE_GSSR_PHY0_SM |
-                           IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM;
+                           IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM |
+                           IXGBE_GSSR_SW_MNG_SM;
 
                if (swi2c_mask)
                        rmask |= IXGBE_GSSR_I2C_MASK;
@@ -763,6 +758,8 @@ static void ixgbe_release_swfw_sync_semaphore(struct ixgbe_hw *hw)
  **/
 void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw)
 {
+       u32 rmask;
+
        /* First try to grab the semaphore but we don't need to bother
         * looking to see whether we got the lock or not since we do
         * the same thing regardless of whether we got the lock or not.
@@ -771,6 +768,14 @@ void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw)
         */
        ixgbe_get_swfw_sync_semaphore(hw);
        ixgbe_release_swfw_sync_semaphore(hw);
+
+       /* Acquire and release all software resources. */
+       rmask = IXGBE_GSSR_EEP_SM | IXGBE_GSSR_PHY0_SM |
+               IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM |
+               IXGBE_GSSR_SW_MNG_SM | IXGBE_GSSR_I2C_MASK;
+
+       ixgbe_acquire_swfw_sync_X540(hw, rmask);
+       ixgbe_release_swfw_sync_X540(hw, rmask);
 }
 
 /**
index 19fbb2f..cb7da5f 100644 (file)
@@ -900,6 +900,8 @@ static s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw,
                /* convert offset from words to bytes */
                buffer.address = cpu_to_be32((offset + current_word) * 2);
                buffer.length = cpu_to_be16(words_to_read * 2);
+               buffer.pad2 = 0;
+               buffer.pad3 = 0;
 
                status = ixgbe_hic_unlocked(hw, (u32 *)&buffer, sizeof(buffer),
                                            IXGBE_HI_COMMAND_TIMEOUT);
@@ -3192,6 +3194,9 @@ static s32 ixgbe_init_phy_ops_X550em(struct ixgbe_hw *hw)
 
        /* Identify the PHY or SFP module */
        ret_val = phy->ops.identify(hw);
+       if (ret_val == IXGBE_ERR_SFP_NOT_SUPPORTED ||
+           ret_val == IXGBE_ERR_PHY_ADDR_INVALID)
+               return ret_val;
 
        /* Setup function pointers based on detected hardware */
        ixgbe_init_mac_link_ops_X550em(hw);
@@ -3394,9 +3399,10 @@ static s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw)
        ixgbe_clear_tx_pending(hw);
 
        /* PHY ops must be identified and initialized prior to reset */
-
-       /* Identify PHY and related function pointers */
        status = hw->phy.ops.init(hw);
+       if (status == IXGBE_ERR_SFP_NOT_SUPPORTED ||
+           status == IXGBE_ERR_PHY_ADDR_INVALID)
+               return status;
 
        /* start the external PHY */
        if (hw->phy.type == ixgbe_phy_x550em_ext_t) {
@@ -3884,7 +3890,7 @@ static const struct ixgbe_mac_operations mac_ops_X550EM_x_fw = {
        .write_iosf_sb_reg      = ixgbe_write_iosf_sb_reg_x550,
 };
 
-static struct ixgbe_mac_operations mac_ops_x550em_a = {
+static const struct ixgbe_mac_operations mac_ops_x550em_a = {
        X550_COMMON_MAC
        .led_on                 = ixgbe_led_on_t_x550em,
        .led_off                = ixgbe_led_off_t_x550em,
@@ -3905,7 +3911,7 @@ static struct ixgbe_mac_operations mac_ops_x550em_a = {
        .write_iosf_sb_reg      = ixgbe_write_iosf_sb_reg_x550a,
 };
 
-static struct ixgbe_mac_operations mac_ops_x550em_a_fw = {
+static const struct ixgbe_mac_operations mac_ops_x550em_a_fw = {
        X550_COMMON_MAC
        .led_on                 = ixgbe_led_on_generic,
        .led_off                = ixgbe_led_off_generic,
index 3d4e4a5..bf1f041 100644 (file)
@@ -1742,13 +1742,18 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
        return err;
 }
 
+static int mlx4_en_get_max_num_rx_rings(struct net_device *dev)
+{
+       return min_t(int, num_online_cpus(), MAX_RX_RINGS);
+}
+
 static void mlx4_en_get_channels(struct net_device *dev,
                                 struct ethtool_channels *channel)
 {
        struct mlx4_en_priv *priv = netdev_priv(dev);
 
-       channel->max_rx = MAX_RX_RINGS;
-       channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP;
+       channel->max_rx = mlx4_en_get_max_num_rx_rings(dev);
+       channel->max_tx = priv->mdev->profile.max_num_tx_rings_p_up;
 
        channel->rx_count = priv->rx_ring_num;
        channel->tx_count = priv->tx_ring_num[TX] /
@@ -1777,7 +1782,7 @@ static int mlx4_en_set_channels(struct net_device *dev,
        mutex_lock(&mdev->state_lock);
        xdp_count = priv->tx_ring_num[TX_XDP] ? channel->rx_count : 0;
        if (channel->tx_count * priv->prof->num_up + xdp_count >
-           MAX_TX_RINGS) {
+           priv->mdev->profile.max_num_tx_rings_p_up * priv->prof->num_up) {
                err = -EINVAL;
                en_err(priv,
                       "Total number of TX and XDP rings (%d) exceeds the maximum supported (%d)\n",
index 686e18d..2c29654 100644 (file)
@@ -153,7 +153,7 @@ static void mlx4_en_get_profile(struct mlx4_en_dev *mdev)
        int i;
 
        params->udp_rss = udp_rss;
-       params->num_tx_rings_p_up = mlx4_low_memory_profile() ?
+       params->max_num_tx_rings_p_up = mlx4_low_memory_profile() ?
                MLX4_EN_MIN_TX_RING_P_UP :
                min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP);
 
@@ -170,8 +170,8 @@ static void mlx4_en_get_profile(struct mlx4_en_dev *mdev)
                params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE;
                params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE;
                params->prof[i].num_up = MLX4_EN_NUM_UP_LOW;
-               params->prof[i].num_tx_rings_p_up = params->num_tx_rings_p_up;
-               params->prof[i].tx_ring_num[TX] = params->num_tx_rings_p_up *
+               params->prof[i].num_tx_rings_p_up = params->max_num_tx_rings_p_up;
+               params->prof[i].tx_ring_num[TX] = params->max_num_tx_rings_p_up *
                        params->prof[i].num_up;
                params->prof[i].rss_rings = 0;
                params->prof[i].inline_thold = inline_thold;
index 9c218f1..e4c7a80 100644 (file)
@@ -3305,7 +3305,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
        priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME;
        priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
                        MLX4_WQE_CTRL_SOLICITED);
-       priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up;
+       priv->num_tx_rings_p_up = mdev->profile.max_num_tx_rings_p_up;
        priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK;
        netdev_rss_key_fill(priv->rss_key, sizeof(priv->rss_key));
 
index 5a47f96..6883ac7 100644 (file)
@@ -53,7 +53,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride,
        if (is_tx) {
                context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4);
                if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)
-                       context->params2 |= MLX4_QP_BIT_FPP;
+                       context->params2 |= cpu_to_be32(MLX4_QP_BIT_FPP);
 
        } else {
                context->sq_size_stride = ilog2(TXBB_SIZE) - 4;
index 8f9cb8a..a786695 100644 (file)
@@ -254,8 +254,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
                                         DEF_RX_RINGS));
 
                num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS :
-                       min_t(int, num_of_eqs,
-                             netif_get_num_default_rss_queues());
+                       min_t(int, num_of_eqs, num_online_cpus());
                mdev->profile.prof[i].rx_ring_num =
                        rounddown_pow_of_two(num_rx_rings);
        }
index 8a32a8f..2cc82dc 100644 (file)
@@ -718,7 +718,7 @@ void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring)
 #else
        iowrite32be(
 #endif
-                 ring->doorbell_qpn,
+                 (__force u32)ring->doorbell_qpn,
                  ring->bf.uar->map + MLX4_SEND_DOORBELL);
 }
 
index 16c0994..634f603 100644 (file)
@@ -57,12 +57,12 @@ MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)");
 #define MLX4_GET(dest, source, offset)                               \
        do {                                                          \
                void *__p = (char *) (source) + (offset);             \
-               u64 val;                                              \
-               switch (sizeof(dest)) {                       \
+               __be64 val;                                           \
+               switch (sizeof(dest)) {                               \
                case 1: (dest) = *(u8 *) __p;       break;            \
                case 2: (dest) = be16_to_cpup(__p); break;            \
                case 4: (dest) = be32_to_cpup(__p); break;            \
-               case 8: val = get_unaligned((u64 *)__p);              \
+               case 8: val = get_unaligned((__be64 *)__p);           \
                        (dest) = be64_to_cpu(val);  break;            \
                default: __buggy_use_of_MLX4_GET();                   \
                }                                                     \
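
The MLX4_GET change above is essentially a sparse-annotation fix: the raw 8-byte field read with get_unaligned() is big-endian, so the temporary should be typed __be64 before be64_to_cpu() converts it. A hedged userspace analogy follows, where be64toh() stands in for be64_to_cpu() and the buffer contents are made up; it only illustrates reading an unaligned big-endian 64-bit field and converting it to host order.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* A big-endian 64-bit value stored at an odd offset in a buffer. */
	unsigned char buf[9] = { 0,
		0x00, 0x00, 0x00, 0x00, 0x12, 0x34, 0x56, 0x78 };
	uint64_t raw;

	memcpy(&raw, buf + 1, sizeof(raw));	/* unaligned, still big-endian */
	printf("0x%llx\n", (unsigned long long)be64toh(raw));	/* 0x12345678 */
	return 0;
}
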
index fdb3ad0..245e9ea 100644 (file)
@@ -399,7 +399,7 @@ struct mlx4_en_profile {
        u32 active_ports;
        u32 small_pkt_int;
        u8 no_reset;
-       u8 num_tx_rings_p_up;
+       u8 max_num_tx_rings_p_up;
        struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1];
 };
 
index 728a2fb..2033209 100644 (file)
@@ -925,7 +925,7 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
                context->flags &= cpu_to_be32(~(0xf << 28));
                context->flags |= cpu_to_be32(states[i + 1] << 28);
                if (states[i + 1] != MLX4_QP_STATE_RTR)
-                       context->params2 &= ~MLX4_QP_BIT_FPP;
+                       context->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP);
                err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1],
                                     context, 0, 0, qp);
                if (err) {
index fabb533..04304dd 100644 (file)
@@ -3185,7 +3185,7 @@ static int verify_qp_parameters(struct mlx4_dev *dev,
        optpar  = be32_to_cpu(*(__be32 *) inbox->buf);
 
        if (slave != mlx4_master_func_num(dev)) {
-               qp_ctx->params2 &= ~MLX4_QP_BIT_FPP;
+               qp_ctx->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP);
                /* setting QP rate-limit is disallowed for VFs */
                if (qp_ctx->rate_limit_params)
                        return -EPERM;
index 5a7bea6..7a136ae 100644 (file)
@@ -145,10 +145,10 @@ static struct init_tree_node {
        }
 };
 
-enum fs_i_mutex_lock_class {
-       FS_MUTEX_GRANDPARENT,
-       FS_MUTEX_PARENT,
-       FS_MUTEX_CHILD
+enum fs_i_lock_class {
+       FS_LOCK_GRANDPARENT,
+       FS_LOCK_PARENT,
+       FS_LOCK_CHILD
 };
 
 static const struct rhashtable_params rhash_fte = {
@@ -168,10 +168,16 @@ static const struct rhashtable_params rhash_fg = {
 
 };
 
-static void del_rule(struct fs_node *node);
-static void del_flow_table(struct fs_node *node);
-static void del_flow_group(struct fs_node *node);
-static void del_fte(struct fs_node *node);
+static void del_hw_flow_table(struct fs_node *node);
+static void del_hw_flow_group(struct fs_node *node);
+static void del_hw_fte(struct fs_node *node);
+static void del_sw_flow_table(struct fs_node *node);
+static void del_sw_flow_group(struct fs_node *node);
+static void del_sw_fte(struct fs_node *node);
+/* Deleting a rule (destination) is a special case that
+ * requires locking the FTE for the entire deletion process.
+ */
+static void del_sw_hw_rule(struct fs_node *node);
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
                                struct mlx5_flow_destination *d2);
 static struct mlx5_flow_rule *
@@ -179,14 +185,16 @@ find_flow_rule(struct fs_fte *fte,
               struct mlx5_flow_destination *dest);
 
 static void tree_init_node(struct fs_node *node,
-                          unsigned int refcount,
-                          void (*remove_func)(struct fs_node *))
+                          void (*del_hw_func)(struct fs_node *),
+                          void (*del_sw_func)(struct fs_node *))
 {
-       atomic_set(&node->refcount, refcount);
+       atomic_set(&node->refcount, 1);
        INIT_LIST_HEAD(&node->list);
        INIT_LIST_HEAD(&node->children);
-       mutex_init(&node->lock);
-       node->remove_func = remove_func;
+       init_rwsem(&node->lock);
+       node->del_hw_func = del_hw_func;
+       node->del_sw_func = del_sw_func;
+       node->active = false;
 }
 
 static void tree_add_node(struct fs_node *node, struct fs_node *parent)
@@ -202,50 +210,70 @@ static void tree_add_node(struct fs_node *node, struct fs_node *parent)
                node->root = parent->root;
 }
 
-static void tree_get_node(struct fs_node *node)
+static int tree_get_node(struct fs_node *node)
 {
-       atomic_inc(&node->refcount);
+       return atomic_add_unless(&node->refcount, 1, 0);
 }
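
tree_get_node() now only succeeds while the node still holds a reference, in the spirit of kref_get_unless_zero(): once the count has reached zero the node is being torn down and a lookup must not resurrect it. A userspace C11 sketch of that compare-and-swap loop follows; it is an analogy of atomic_add_unless(&refcount, 1, 0), not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the count is not already zero,
 * i.e. only if the object has not started dying.
 */
static bool get_unless_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0) {
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;
		/* old was reloaded by the failed CAS; loop and retry */
	}
	return false;
}

int main(void)
{
	atomic_int ref = 1;

	printf("%d\n", get_unless_zero(&ref));	/* 1: object still live */
	atomic_store(&ref, 0);
	printf("%d\n", get_unless_zero(&ref));	/* 0: already dying */
	return 0;
}
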
 
-static void nested_lock_ref_node(struct fs_node *node,
-                                enum fs_i_mutex_lock_class class)
+static void nested_down_read_ref_node(struct fs_node *node,
+                                     enum fs_i_lock_class class)
 {
        if (node) {
-               mutex_lock_nested(&node->lock, class);
+               down_read_nested(&node->lock, class);
                atomic_inc(&node->refcount);
        }
 }
 
-static void lock_ref_node(struct fs_node *node)
+static void nested_down_write_ref_node(struct fs_node *node,
+                                      enum fs_i_lock_class class)
 {
        if (node) {
-               mutex_lock(&node->lock);
+               down_write_nested(&node->lock, class);
                atomic_inc(&node->refcount);
        }
 }
 
-static void unlock_ref_node(struct fs_node *node)
+static void down_write_ref_node(struct fs_node *node)
 {
        if (node) {
-               atomic_dec(&node->refcount);
-               mutex_unlock(&node->lock);
+               down_write(&node->lock);
+               atomic_inc(&node->refcount);
        }
 }
 
+static void up_read_ref_node(struct fs_node *node)
+{
+       atomic_dec(&node->refcount);
+       up_read(&node->lock);
+}
+
+static void up_write_ref_node(struct fs_node *node)
+{
+       atomic_dec(&node->refcount);
+       up_write(&node->lock);
+}
+
 static void tree_put_node(struct fs_node *node)
 {
        struct fs_node *parent_node = node->parent;
 
-       lock_ref_node(parent_node);
        if (atomic_dec_and_test(&node->refcount)) {
-               if (parent_node)
+               if (node->del_hw_func)
+                       node->del_hw_func(node);
+               if (parent_node) {
+                       /* Only the root namespace has no parent; for it we
+                        * just need to free the node (see the else branch).
+                        */
+                       down_write_ref_node(parent_node);
                        list_del_init(&node->list);
-               if (node->remove_func)
-                       node->remove_func(node);
-               kfree(node);
+                       if (node->del_sw_func)
+                               node->del_sw_func(node);
+                       up_write_ref_node(parent_node);
+               } else {
+                       kfree(node);
+               }
                node = NULL;
        }
-       unlock_ref_node(parent_node);
        if (!node && parent_node)
                tree_put_node(parent_node);
 }
@@ -362,6 +390,15 @@ static struct mlx5_flow_root_namespace *find_root(struct fs_node *node)
        return container_of(ns, struct mlx5_flow_root_namespace, ns);
 }
 
+static inline struct mlx5_flow_steering *get_steering(struct fs_node *node)
+{
+       struct mlx5_flow_root_namespace *root = find_root(node);
+
+       if (root)
+               return root->dev->priv.steering;
+       return NULL;
+}
+
 static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
 {
        struct mlx5_flow_root_namespace *root = find_root(node);
@@ -371,26 +408,36 @@ static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
        return NULL;
 }
 
-static void del_flow_table(struct fs_node *node)
+static void del_hw_flow_table(struct fs_node *node)
 {
        struct mlx5_flow_table *ft;
        struct mlx5_core_dev *dev;
-       struct fs_prio *prio;
        int err;
 
        fs_get_obj(ft, node);
        dev = get_dev(&ft->node);
 
-       err = mlx5_cmd_destroy_flow_table(dev, ft);
-       if (err)
-               mlx5_core_warn(dev, "flow steering can't destroy ft\n");
-       ida_destroy(&ft->fte_allocator);
+       if (node->active) {
+               err = mlx5_cmd_destroy_flow_table(dev, ft);
+               if (err)
+                       mlx5_core_warn(dev, "flow steering can't destroy ft\n");
+       }
+}
+
+static void del_sw_flow_table(struct fs_node *node)
+{
+       struct mlx5_flow_table *ft;
+       struct fs_prio *prio;
+
+       fs_get_obj(ft, node);
+
        rhltable_destroy(&ft->fgs_hash);
        fs_get_obj(prio, ft->node.parent);
        prio->num_ft--;
+       kfree(ft);
 }
 
-static void del_rule(struct fs_node *node)
+static void del_sw_hw_rule(struct fs_node *node)
 {
        struct mlx5_flow_rule *rule;
        struct mlx5_flow_table *ft;
@@ -406,7 +453,6 @@ static void del_rule(struct fs_node *node)
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
        trace_mlx5_fs_del_rule(rule);
-       list_del(&rule->node.list);
        if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
                mutex_lock(&rule->dest_attr.ft->lock);
                list_del(&rule->next_ft);
@@ -434,117 +480,203 @@ out:
                                       "%s can't del rule fg id=%d fte_index=%d\n",
                                       __func__, fg->id, fte->index);
        }
+       kfree(rule);
 }
 
-static void destroy_fte(struct fs_fte *fte, struct mlx5_flow_group *fg)
+static void del_hw_fte(struct fs_node *node)
 {
        struct mlx5_flow_table *ft;
-       int ret;
+       struct mlx5_flow_group *fg;
+       struct mlx5_core_dev *dev;
+       struct fs_fte *fte;
+       int err;
 
-       ret = rhashtable_remove_fast(&fg->ftes_hash, &fte->hash, rhash_fte);
-       WARN_ON(ret);
-       fte->status = 0;
+       fs_get_obj(fte, node);
+       fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
-       ida_simple_remove(&ft->fte_allocator, fte->index);
+
+       trace_mlx5_fs_del_fte(fte);
+       dev = get_dev(&ft->node);
+       if (node->active) {
+               err = mlx5_cmd_delete_fte(dev, ft,
+                                         fte->index);
+               if (err)
+                       mlx5_core_warn(dev,
+                                      "flow steering can't delete fte in index %d of flow group id %d\n",
+                                      fte->index, fg->id);
+       }
 }
 
-static void del_fte(struct fs_node *node)
+static void del_sw_fte(struct fs_node *node)
 {
-       struct mlx5_flow_table *ft;
+       struct mlx5_flow_steering *steering = get_steering(node);
        struct mlx5_flow_group *fg;
-       struct mlx5_core_dev *dev;
        struct fs_fte *fte;
        int err;
 
        fs_get_obj(fte, node);
        fs_get_obj(fg, fte->node.parent);
-       fs_get_obj(ft, fg->node.parent);
-       trace_mlx5_fs_del_fte(fte);
-
-       dev = get_dev(&ft->node);
-       err = mlx5_cmd_delete_fte(dev, ft,
-                                 fte->index);
-       if (err)
-               mlx5_core_warn(dev,
-                              "flow steering can't delete fte in index %d of flow group id %d\n",
-                              fte->index, fg->id);
 
-       destroy_fte(fte, fg);
+       err = rhashtable_remove_fast(&fg->ftes_hash,
+                                    &fte->hash,
+                                    rhash_fte);
+       WARN_ON(err);
+       ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
+       kmem_cache_free(steering->ftes_cache, fte);
 }
 
-static void del_flow_group(struct fs_node *node)
+static void del_hw_flow_group(struct fs_node *node)
 {
        struct mlx5_flow_group *fg;
        struct mlx5_flow_table *ft;
        struct mlx5_core_dev *dev;
-       int err;
 
        fs_get_obj(fg, node);
        fs_get_obj(ft, fg->node.parent);
        dev = get_dev(&ft->node);
        trace_mlx5_fs_del_fg(fg);
 
-       if (ft->autogroup.active)
-               ft->autogroup.num_groups--;
+       if (fg->node.active && mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
+               mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
+                              fg->id, ft->id);
+}
+
+static void del_sw_flow_group(struct fs_node *node)
+{
+       struct mlx5_flow_steering *steering = get_steering(node);
+       struct mlx5_flow_group *fg;
+       struct mlx5_flow_table *ft;
+       int err;
+
+       fs_get_obj(fg, node);
+       fs_get_obj(ft, fg->node.parent);
 
        rhashtable_destroy(&fg->ftes_hash);
+       ida_destroy(&fg->fte_allocator);
+       if (ft->autogroup.active)
+               ft->autogroup.num_groups--;
        err = rhltable_remove(&ft->fgs_hash,
                              &fg->hash,
                              rhash_fg);
        WARN_ON(err);
-       if (mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
-               mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
-                              fg->id, ft->id);
+       kmem_cache_free(steering->fgs_cache, fg);
+}
+
+static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte)
+{
+       int index;
+       int ret;
+
+       index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL);
+       if (index < 0)
+               return index;
+
+       fte->index = index + fg->start_index;
+       ret = rhashtable_insert_fast(&fg->ftes_hash,
+                                    &fte->hash,
+                                    rhash_fte);
+       if (ret)
+               goto err_ida_remove;
+
+       tree_add_node(&fte->node, &fg->node);
+       list_add_tail(&fte->node.list, &fg->node.children);
+       return 0;
+
+err_ida_remove:
+       ida_simple_remove(&fg->fte_allocator, index);
+       return ret;
 }
 
-static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
+static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
                                u32 *match_value,
-                               unsigned int index)
+                               struct mlx5_flow_act *flow_act)
 {
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
        struct fs_fte *fte;
 
-       fte = kzalloc(sizeof(*fte), GFP_KERNEL);
+       fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL);
        if (!fte)
                return ERR_PTR(-ENOMEM);
 
        memcpy(fte->val, match_value, sizeof(fte->val));
        fte->node.type =  FS_TYPE_FLOW_ENTRY;
        fte->flow_tag = flow_act->flow_tag;
-       fte->index = index;
        fte->action = flow_act->action;
        fte->encap_id = flow_act->encap_id;
        fte->modify_id = flow_act->modify_id;
 
+       tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
+
        return fte;
 }
 
-static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
+static void dealloc_flow_group(struct mlx5_flow_steering *steering,
+                              struct mlx5_flow_group *fg)
+{
+       rhashtable_destroy(&fg->ftes_hash);
+       kmem_cache_free(steering->fgs_cache, fg);
+}
+
+static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering,
+                                               u8 match_criteria_enable,
+                                               void *match_criteria,
+                                               int start_index,
+                                               int end_index)
 {
        struct mlx5_flow_group *fg;
-       void *match_criteria = MLX5_ADDR_OF(create_flow_group_in,
-                                           create_fg_in, match_criteria);
-       u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
-                                           create_fg_in,
-                                           match_criteria_enable);
        int ret;
 
-       fg = kzalloc(sizeof(*fg), GFP_KERNEL);
+       fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL);
        if (!fg)
                return ERR_PTR(-ENOMEM);
 
        ret = rhashtable_init(&fg->ftes_hash, &rhash_fte);
        if (ret) {
-               kfree(fg);
+               kmem_cache_free(steering->fgs_cache, fg);
                return ERR_PTR(ret);
-       }
+}
+       ida_init(&fg->fte_allocator);
        fg->mask.match_criteria_enable = match_criteria_enable;
        memcpy(&fg->mask.match_criteria, match_criteria,
               sizeof(fg->mask.match_criteria));
        fg->node.type =  FS_TYPE_FLOW_GROUP;
-       fg->start_index = MLX5_GET(create_flow_group_in, create_fg_in,
-                                  start_flow_index);
-       fg->max_ftes = MLX5_GET(create_flow_group_in, create_fg_in,
-                               end_flow_index) - fg->start_index + 1;
+       fg->start_index = start_index;
+       fg->max_ftes = end_index - start_index + 1;
+
+       return fg;
+}
+
+static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft,
+                                                      u8 match_criteria_enable,
+                                                      void *match_criteria,
+                                                      int start_index,
+                                                      int end_index,
+                                                      struct list_head *prev)
+{
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
+       struct mlx5_flow_group *fg;
+       int ret;
+
+       fg = alloc_flow_group(steering, match_criteria_enable, match_criteria,
+                             start_index, end_index);
+       if (IS_ERR(fg))
+               return fg;
+
+       /* initialize refcnt, add to parent list */
+       ret = rhltable_insert(&ft->fgs_hash,
+                             &fg->hash,
+                             rhash_fg);
+       if (ret) {
+               dealloc_flow_group(steering, fg);
+               return ERR_PTR(ret);
+       }
+
+       tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group);
+       tree_add_node(&fg->node, &ft->node);
+       /* Add node to group list */
+       list_add(&fg->node.list, prev);
+       atomic_inc(&ft->node.version);
+
        return fg;
 }
 
@@ -575,7 +707,6 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft
        ft->flags = flags;
        INIT_LIST_HEAD(&ft->fwd_rules);
        mutex_init(&ft->lock);
-       ida_init(&ft->fte_allocator);
 
        return ft;
 }
@@ -724,7 +855,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
        fs_get_obj(fte, rule->node.parent);
        if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
                return -EINVAL;
-       lock_ref_node(&fte->node);
+       down_write_ref_node(&fte->node);
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
 
@@ -733,7 +864,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
                                  ft, fg->id,
                                  modify_mask,
                                  fte);
-       unlock_ref_node(&fte->node);
+       up_write_ref_node(&fte->node);
 
        return err;
 }
@@ -870,7 +1001,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
                goto unlock_root;
        }
 
-       tree_init_node(&ft->node, 1, del_flow_table);
+       tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table);
        log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
        next_ft = find_next_chained_ft(fs_prio);
        err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type,
@@ -882,17 +1013,17 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
        err = connect_flow_table(root->dev, ft, fs_prio);
        if (err)
                goto destroy_ft;
-       lock_ref_node(&fs_prio->node);
+       ft->node.active = true;
+       down_write_ref_node(&fs_prio->node);
        tree_add_node(&ft->node, &fs_prio->node);
        list_add_flow_table(ft, fs_prio);
        fs_prio->num_ft++;
-       unlock_ref_node(&fs_prio->node);
+       up_write_ref_node(&fs_prio->node);
        mutex_unlock(&root->chain_lock);
        return ft;
 destroy_ft:
        mlx5_cmd_destroy_flow_table(root->dev, ft);
 free_ft:
-       ida_destroy(&ft->fte_allocator);
        kfree(ft);
 unlock_root:
        mutex_unlock(&root->chain_lock);
@@ -960,54 +1091,6 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 }
 EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table);
 
-/* Flow table should be locked */
-static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *ft,
-                                                       u32 *fg_in,
-                                                       struct list_head
-                                                       *prev_fg,
-                                                       bool is_auto_fg)
-{
-       struct mlx5_flow_group *fg;
-       struct mlx5_core_dev *dev = get_dev(&ft->node);
-       int err;
-
-       if (!dev)
-               return ERR_PTR(-ENODEV);
-
-       fg = alloc_flow_group(fg_in);
-       if (IS_ERR(fg))
-               return fg;
-
-       err = rhltable_insert(&ft->fgs_hash, &fg->hash, rhash_fg);
-       if (err)
-               goto err_free_fg;
-
-       err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
-       if (err)
-               goto err_remove_fg;
-
-       if (ft->autogroup.active)
-               ft->autogroup.num_groups++;
-       /* Add node to tree */
-       tree_init_node(&fg->node, !is_auto_fg, del_flow_group);
-       tree_add_node(&fg->node, &ft->node);
-       /* Add node to group list */
-       list_add(&fg->node.list, prev_fg);
-
-       trace_mlx5_fs_add_fg(fg);
-       return fg;
-
-err_remove_fg:
-       WARN_ON(rhltable_remove(&ft->fgs_hash,
-                               &fg->hash,
-                               rhash_fg));
-err_free_fg:
-       rhashtable_destroy(&fg->ftes_hash);
-       kfree(fg);
-
-       return ERR_PTR(err);
-}
-
 struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
                                               u32 *fg_in)
 {
@@ -1016,7 +1099,13 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
        u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
                                            fg_in,
                                            match_criteria_enable);
+       int start_index = MLX5_GET(create_flow_group_in, fg_in,
+                                  start_flow_index);
+       int end_index = MLX5_GET(create_flow_group_in, fg_in,
+                                end_flow_index);
+       struct mlx5_core_dev *dev = get_dev(&ft->node);
        struct mlx5_flow_group *fg;
+       int err;
 
        if (!check_valid_mask(match_criteria_enable, match_criteria))
                return ERR_PTR(-EINVAL);
@@ -1024,9 +1113,21 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
        if (ft->autogroup.active)
                return ERR_PTR(-EPERM);
 
-       lock_ref_node(&ft->node);
-       fg = create_flow_group_common(ft, fg_in, ft->node.children.prev, false);
-       unlock_ref_node(&ft->node);
+       down_write_ref_node(&ft->node);
+       fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
+                                    start_index, end_index,
+                                    ft->node.children.prev);
+       up_write_ref_node(&ft->node);
+       if (IS_ERR(fg))
+               return fg;
+
+       err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
+       if (err) {
+               tree_put_node(&fg->node);
+               return ERR_PTR(err);
+       }
+       trace_mlx5_fs_add_fg(fg);
+       fg->node.active = true;
 
        return fg;
 }
@@ -1111,7 +1212,7 @@ create_flow_handle(struct fs_fte *fte,
                /* Add dest to dests list- we need flow tables to be in the
                 * end of the list for forward to next prio rules.
                 */
-               tree_init_node(&rule->node, 1, del_rule);
+               tree_init_node(&rule->node, NULL, del_sw_hw_rule);
                if (dest &&
                    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
                        list_add(&rule->node.list, &fte->node.children);
@@ -1167,7 +1268,9 @@ add_rule_fte(struct fs_fte *fte,
        if (err)
                goto free_handle;
 
+       fte->node.active = true;
        fte->status |= FS_FTE_STATUS_EXISTING;
+       atomic_inc(&fte->node.version);
 
 out:
        return handle;
@@ -1177,59 +1280,17 @@ free_handle:
        return ERR_PTR(err);
 }
 
-static struct fs_fte *create_fte(struct mlx5_flow_group *fg,
-                                u32 *match_value,
-                                struct mlx5_flow_act *flow_act)
-{
-       struct mlx5_flow_table *ft;
-       struct fs_fte *fte;
-       int index;
-       int ret;
-
-       fs_get_obj(ft, fg->node.parent);
-       index = ida_simple_get(&ft->fte_allocator, fg->start_index,
-                              fg->start_index + fg->max_ftes,
-                              GFP_KERNEL);
-       if (index < 0)
-               return ERR_PTR(index);
-
-       fte = alloc_fte(flow_act, match_value, index);
-       if (IS_ERR(fte)) {
-               ret = PTR_ERR(fte);
-               goto err_alloc;
-       }
-       ret = rhashtable_insert_fast(&fg->ftes_hash, &fte->hash, rhash_fte);
-       if (ret)
-               goto err_hash;
-
-       return fte;
-
-err_hash:
-       kfree(fte);
-err_alloc:
-       ida_simple_remove(&ft->fte_allocator, index);
-       return ERR_PTR(ret);
-}
-
-static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
-                                               u8 match_criteria_enable,
-                                               u32 *match_criteria)
+static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table  *ft,
+                                                    struct mlx5_flow_spec *spec)
 {
-       int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
        struct list_head *prev = &ft->node.children;
-       unsigned int candidate_index = 0;
        struct mlx5_flow_group *fg;
-       void *match_criteria_addr;
+       unsigned int candidate_index = 0;
        unsigned int group_size = 0;
-       u32 *in;
 
        if (!ft->autogroup.active)
                return ERR_PTR(-ENOENT);
 
-       in = kvzalloc(inlen, GFP_KERNEL);
-       if (!in)
-               return ERR_PTR(-ENOMEM);
-
        if (ft->autogroup.num_groups < ft->autogroup.required_groups)
                /* We save place for flow groups in addition to max types */
                group_size = ft->max_fte / (ft->autogroup.required_groups + 1);
@@ -1247,25 +1308,55 @@ static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
                prev = &fg->node.list;
        }
 
-       if (candidate_index + group_size > ft->max_fte) {
-               fg = ERR_PTR(-ENOSPC);
+       if (candidate_index + group_size > ft->max_fte)
+               return ERR_PTR(-ENOSPC);
+
+       fg = alloc_insert_flow_group(ft,
+                                    spec->match_criteria_enable,
+                                    spec->match_criteria,
+                                    candidate_index,
+                                    candidate_index + group_size - 1,
+                                    prev);
+       if (IS_ERR(fg))
                goto out;
-       }
+
+       ft->autogroup.num_groups++;
+
+out:
+       return fg;
+}
+
+static int create_auto_flow_group(struct mlx5_flow_table *ft,
+                                 struct mlx5_flow_group *fg)
+{
+       struct mlx5_core_dev *dev = get_dev(&ft->node);
+       int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+       void *match_criteria_addr;
+       int err;
+       u32 *in;
+
+       in = kvzalloc(inlen, GFP_KERNEL);
+       if (!in)
+               return -ENOMEM;
 
        MLX5_SET(create_flow_group_in, in, match_criteria_enable,
-                match_criteria_enable);
-       MLX5_SET(create_flow_group_in, in, start_flow_index, candidate_index);
-       MLX5_SET(create_flow_group_in, in, end_flow_index,   candidate_index +
-                group_size - 1);
+                fg->mask.match_criteria_enable);
+       MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index);
+       MLX5_SET(create_flow_group_in, in, end_flow_index,   fg->start_index +
+                fg->max_ftes - 1);
        match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in,
                                           in, match_criteria);
-       memcpy(match_criteria_addr, match_criteria,
-              MLX5_ST_SZ_BYTES(fte_match_param));
+       memcpy(match_criteria_addr, fg->mask.match_criteria,
+              sizeof(fg->mask.match_criteria));
+
+       err = mlx5_cmd_create_flow_group(dev, ft, in, &fg->id);
+       if (!err) {
+               fg->node.active = true;
+               trace_mlx5_fs_add_fg(fg);
+       }
 
-       fg = create_flow_group_common(ft, in, prev, true);
-out:
        kvfree(in);
-       return fg;
+       return err;
 }
 
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
@@ -1340,60 +1431,30 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
                                            struct fs_fte *fte)
 {
        struct mlx5_flow_handle *handle;
-       struct mlx5_flow_table *ft;
+       int old_action;
        int i;
+       int ret;
 
-       if (fte) {
-               int old_action;
-               int ret;
-
-               nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-               ret = check_conflicting_ftes(fte, flow_act);
-               if (ret) {
-                       handle = ERR_PTR(ret);
-                       goto unlock_fte;
-               }
-
-               old_action = fte->action;
-               fte->action |= flow_act->action;
-               handle = add_rule_fte(fte, fg, dest, dest_num,
-                                     old_action != flow_act->action);
-               if (IS_ERR(handle)) {
-                       fte->action = old_action;
-                       goto unlock_fte;
-               } else {
-                       trace_mlx5_fs_set_fte(fte, false);
-                       goto add_rules;
-               }
-       }
-       fs_get_obj(ft, fg->node.parent);
+       ret = check_conflicting_ftes(fte, flow_act);
+       if (ret)
+               return ERR_PTR(ret);
 
-       fte = create_fte(fg, match_value, flow_act);
-       if (IS_ERR(fte))
-               return (void *)fte;
-       tree_init_node(&fte->node, 0, del_fte);
-       nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-       handle = add_rule_fte(fte, fg, dest, dest_num, false);
+       old_action = fte->action;
+       fte->action |= flow_act->action;
+       handle = add_rule_fte(fte, fg, dest, dest_num,
+                             old_action != flow_act->action);
        if (IS_ERR(handle)) {
-               unlock_ref_node(&fte->node);
-               destroy_fte(fte, fg);
-               kfree(fte);
+               fte->action = old_action;
                return handle;
        }
+       trace_mlx5_fs_set_fte(fte, false);
 
-       tree_add_node(&fte->node, &fg->node);
-       /* fte list isn't sorted */
-       list_add_tail(&fte->node.list, &fg->node.children);
-       trace_mlx5_fs_set_fte(fte, true);
-add_rules:
        for (i = 0; i < handle->num_rules; i++) {
                if (atomic_read(&handle->rule[i]->node.refcount) == 1) {
                        tree_add_node(&handle->rule[i]->node, &fte->node);
                        trace_mlx5_fs_add_rule(handle->rule[i]);
                }
        }
-unlock_fte:
-       unlock_ref_node(&fte->node);
        return handle;
 }
 
@@ -1441,93 +1502,197 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
        return true;
 }
 
-static struct mlx5_flow_handle *
-try_add_to_existing_fg(struct mlx5_flow_table *ft,
-                      struct mlx5_flow_spec *spec,
-                      struct mlx5_flow_act *flow_act,
-                      struct mlx5_flow_destination *dest,
-                      int dest_num)
-{
+struct match_list {
+       struct list_head        list;
        struct mlx5_flow_group *g;
-       struct mlx5_flow_handle *rule = ERR_PTR(-ENOENT);
+};
+
+struct match_list_head {
+       struct list_head  list;
+       struct match_list first;
+};
+
+static void free_match_list(struct match_list_head *head)
+{
+       if (!list_empty(&head->list)) {
+               struct match_list *iter, *match_tmp;
+
+               list_del(&head->first.list);
+               tree_put_node(&head->first.g->node);
+               list_for_each_entry_safe(iter, match_tmp, &head->list,
+                                        list) {
+                       tree_put_node(&iter->g->node);
+                       list_del(&iter->list);
+                       kfree(iter);
+               }
+       }
+}
+
+static int build_match_list(struct match_list_head *match_head,
+                           struct mlx5_flow_table *ft,
+                           struct mlx5_flow_spec *spec)
+{
        struct rhlist_head *tmp, *list;
-       struct match_list {
-               struct list_head        list;
-               struct mlx5_flow_group *g;
-       } match_list, *iter;
-       LIST_HEAD(match_head);
+       struct mlx5_flow_group *g;
+       int err = 0;
 
        rcu_read_lock();
+       INIT_LIST_HEAD(&match_head->list);
        /* Collect all fgs that have a matching match_criteria */
        list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg);
+       /* The RCU read side is atomic, so we can't execute FW commands here */
        rhl_for_each_entry_rcu(g, tmp, list, hash) {
                struct match_list *curr_match;
 
-               if (likely(list_empty(&match_head))) {
-                       match_list.g = g;
-                       list_add_tail(&match_list.list, &match_head);
+               if (likely(list_empty(&match_head->list))) {
+                       if (!tree_get_node(&g->node))
+                               continue;
+                       match_head->first.g = g;
+                       list_add_tail(&match_head->first.list,
+                                     &match_head->list);
                        continue;
                }
-               curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
 
+               curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
                if (!curr_match) {
-                       rcu_read_unlock();
-                       rule = ERR_PTR(-ENOMEM);
-                       goto free_list;
+                       free_match_list(match_head);
+                       err = -ENOMEM;
+                       goto out;
+               }
+               if (!tree_get_node(&g->node)) {
+                       kfree(curr_match);
+                       continue;
                }
                curr_match->g = g;
-               list_add_tail(&curr_match->list, &match_head);
+               list_add_tail(&curr_match->list, &match_head->list);
        }
+out:
        rcu_read_unlock();
+       return err;
+}
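+
+The two helpers above are meant to be used as a pair: build_match_list() runs the rhltable lookup under RCU and takes a tree reference on every matching group, and free_match_list() drops those references once the caller is done. Roughly (a minimal sketch; the real caller with its full error handling appears further below):
+
+    struct match_list_head head;
+    int err;
+
+    err = build_match_list(&head, ft, spec);
+    if (err)
+            return ERR_PTR(err);
+    /* ... try to add the rule to one of the groups on head.list ... */
+    free_match_list(&head);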
+
+static u64 matched_fgs_get_version(struct list_head *match_head)
+{
+       struct match_list *iter;
+       u64 version = 0;
+
+       list_for_each_entry(iter, match_head, list)
+               version += (u64)atomic_read(&iter->g->node.version);
+       return version;
+}
+
+static struct mlx5_flow_handle *
+try_add_to_existing_fg(struct mlx5_flow_table *ft,
+                      struct list_head *match_head,
+                      struct mlx5_flow_spec *spec,
+                      struct mlx5_flow_act *flow_act,
+                      struct mlx5_flow_destination *dest,
+                      int dest_num,
+                      int ft_version)
+{
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
+       struct mlx5_flow_group *g;
+       struct mlx5_flow_handle *rule;
+       struct match_list *iter;
+       bool take_write = false;
+       struct fs_fte *fte;
+       u64  version;
+       int err;
+
+       fte = alloc_fte(ft, spec->match_value, flow_act);
+       if (IS_ERR(fte))
+               return ERR_PTR(-ENOMEM);
 
+       list_for_each_entry(iter, match_head, list) {
+               nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT);
+               ida_pre_get(&iter->g->fte_allocator, GFP_KERNEL);
+       }
+
+search_again_locked:
+       version = matched_fgs_get_version(match_head);
        /* Try to find a fg that already contains a matching fte */
-       list_for_each_entry(iter, &match_head, list) {
-               struct fs_fte *fte;
+       list_for_each_entry(iter, match_head, list) {
+               struct fs_fte *fte_tmp;
 
                g = iter->g;
-               nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
-               fte = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
-                                            rhash_fte);
-               if (fte) {
-                       rule = add_rule_fg(g, spec->match_value,
-                                          flow_act, dest, dest_num, fte);
-                       unlock_ref_node(&g->node);
-                       goto free_list;
+               fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
+                                                rhash_fte);
+               if (!fte_tmp || !tree_get_node(&fte_tmp->node))
+                       continue;
+
+               nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
+               if (!take_write) {
+                       list_for_each_entry(iter, match_head, list)
+                               up_read_ref_node(&iter->g->node);
+               } else {
+                       list_for_each_entry(iter, match_head, list)
+                               up_write_ref_node(&iter->g->node);
                }
-               unlock_ref_node(&g->node);
+
+               rule = add_rule_fg(g, spec->match_value,
+                                  flow_act, dest, dest_num, fte_tmp);
+               up_write_ref_node(&fte_tmp->node);
+               tree_put_node(&fte_tmp->node);
+               kmem_cache_free(steering->ftes_cache, fte);
+               return rule;
        }
 
        /* No group with matching fte found. Try to add a new fte to any
         * matching fg.
         */
-       list_for_each_entry(iter, &match_head, list) {
-               g = iter->g;
 
-               nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
-               rule = add_rule_fg(g, spec->match_value,
-                                  flow_act, dest, dest_num, NULL);
-               if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC) {
-                       unlock_ref_node(&g->node);
-                       goto free_list;
-               }
-               unlock_ref_node(&g->node);
+       if (!take_write) {
+               list_for_each_entry(iter, match_head, list)
+                       up_read_ref_node(&iter->g->node);
+               list_for_each_entry(iter, match_head, list)
+                       nested_down_write_ref_node(&iter->g->node,
+                                                  FS_LOCK_PARENT);
+               take_write = true;
        }
 
-free_list:
-       if (!list_empty(&match_head)) {
-               struct match_list *match_tmp;
+       /* Check the ft version, in case a new flow group
+        * was added while the fgs weren't locked
+        */
+       if (atomic_read(&ft->node.version) != ft_version) {
+               rule = ERR_PTR(-EAGAIN);
+               goto out;
+       }
 
-               /* The most common case is having one FG. Since we want to
-                * optimize this case, we save the first on the stack.
-                * Therefore, no need to free it.
-                */
-               list_del(&list_first_entry(&match_head, typeof(*iter), list)->list);
-               list_for_each_entry_safe(iter, match_tmp, &match_head, list) {
-                       list_del(&iter->list);
-                       kfree(iter);
+       /* Check the fgs version, in case an FTE with the
+        * same match value was added while the fgs weren't locked
+        */
+       if (version != matched_fgs_get_version(match_head))
+               goto search_again_locked;
+
+       list_for_each_entry(iter, match_head, list) {
+               g = iter->g;
+
+               if (!g->node.active)
+                       continue;
+               err = insert_fte(g, fte);
+               if (err) {
+                       if (err == -ENOSPC)
+                               continue;
+                       list_for_each_entry(iter, match_head, list)
+                               up_write_ref_node(&iter->g->node);
+                       kmem_cache_free(steering->ftes_cache, fte);
+                       return ERR_PTR(err);
                }
-       }
 
+               nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+               list_for_each_entry(iter, match_head, list)
+                       up_write_ref_node(&iter->g->node);
+               rule = add_rule_fg(g, spec->match_value,
+                                  flow_act, dest, dest_num, fte);
+               up_write_ref_node(&fte->node);
+               tree_put_node(&fte->node);
+               return rule;
+       }
+       rule = ERR_PTR(-ENOENT);
+out:
+       list_for_each_entry(iter, match_head, list)
+               up_write_ref_node(&iter->g->node);
+       kmem_cache_free(steering->ftes_cache, fte);
        return rule;
 }
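
The function above is the heart of the new locking scheme: the matching groups are scanned under shared (read) locks, the per-group versions are summed, and only if no existing FTE is found are the locks upgraded to write mode, with the table and group versions re-checked to detect a concurrent insertion. The same optimistic read-then-upgrade idea, stripped of the mlx5 specifics, looks like this in plain userspace C (illustrative sketch only; names and types are hypothetical):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct table {
            pthread_rwlock_t lock;
            atomic_long version;            /* bumped on every insertion */
    };

    /* Returns true if the entry already existed, false if we inserted it. */
    static bool lookup_or_insert(struct table *t,
                                 bool (*find)(void), void (*insert)(void))
    {
            for (;;) {
                    pthread_rwlock_rdlock(&t->lock);
                    long seen = atomic_load(&t->version);
                    bool hit = find();
                    pthread_rwlock_unlock(&t->lock);
                    if (hit)
                            return true;

                    pthread_rwlock_wrlock(&t->lock);
                    if (atomic_load(&t->version) != seen) {
                            /* Someone inserted while we were unlocked; rescan. */
                            pthread_rwlock_unlock(&t->lock);
                            continue;
                    }
                    insert();
                    atomic_fetch_add(&t->version, 1);
                    pthread_rwlock_unlock(&t->lock);
                    return false;
            }
    }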
 
@@ -1539,8 +1704,14 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
                     int dest_num)
 
 {
+       struct mlx5_flow_steering *steering = get_steering(&ft->node);
        struct mlx5_flow_group *g;
        struct mlx5_flow_handle *rule;
+       struct match_list_head match_head;
+       bool take_write = false;
+       struct fs_fte *fte;
+       int version;
+       int err;
        int i;
 
        if (!check_valid_spec(spec))
@@ -1550,33 +1721,73 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft,
                if (!dest_is_valid(&dest[i], flow_act->action, ft))
                        return ERR_PTR(-EINVAL);
        }
+       nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+search_again_locked:
+       version = atomic_read(&ft->node.version);
+
+       /* Collect all fgs that have a matching match_criteria */
+       err = build_match_list(&match_head, ft, spec);
+       if (err)
+               return ERR_PTR(err);
+
+       if (!take_write)
+               up_read_ref_node(&ft->node);
+
+       rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest,
+                                     dest_num, version);
+       free_match_list(&match_head);
+       if (!IS_ERR(rule) ||
+           (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN))
+               return rule;
+
+       if (!take_write) {
+               nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+               take_write = true;
+       }
 
-       nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
-       rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num);
-       if (!IS_ERR(rule))
-               goto unlock;
+       if (PTR_ERR(rule) == -EAGAIN ||
+           version != atomic_read(&ft->node.version))
+               goto search_again_locked;
 
-       g = create_autogroup(ft, spec->match_criteria_enable,
-                            spec->match_criteria);
+       g = alloc_auto_flow_group(ft, spec);
        if (IS_ERR(g)) {
                rule = (void *)g;
-               goto unlock;
+               up_write_ref_node(&ft->node);
+               return rule;
        }
 
-       rule = add_rule_fg(g, spec->match_value, flow_act, dest,
-                          dest_num, NULL);
-       if (IS_ERR(rule)) {
-               /* Remove assumes refcount > 0 and autogroup creates a group
-                * with a refcount = 0.
-                */
-               unlock_ref_node(&ft->node);
-               tree_get_node(&g->node);
-               tree_remove_node(&g->node);
-               return rule;
+       nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
+       up_write_ref_node(&ft->node);
+
+       err = create_auto_flow_group(ft, g);
+       if (err)
+               goto err_release_fg;
+
+       fte = alloc_fte(ft, spec->match_value, flow_act);
+       if (IS_ERR(fte)) {
+               err = PTR_ERR(fte);
+               goto err_release_fg;
        }
-unlock:
-       unlock_ref_node(&ft->node);
+
+       err = insert_fte(g, fte);
+       if (err) {
+               kmem_cache_free(steering->ftes_cache, fte);
+               goto err_release_fg;
+       }
+
+       nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+       up_write_ref_node(&g->node);
+       rule = add_rule_fg(g, spec->match_value, flow_act, dest,
+                          dest_num, fte);
+       up_write_ref_node(&fte->node);
+       tree_put_node(&fte->node);
+       tree_put_node(&g->node);
        return rule;
+
+err_release_fg:
+       up_write_ref_node(&g->node);
+       tree_put_node(&g->node);
+       return ERR_PTR(err);
 }
 
 static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
@@ -1817,7 +2028,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
                return ERR_PTR(-ENOMEM);
 
        fs_prio->node.type = FS_TYPE_PRIO;
-       tree_init_node(&fs_prio->node, 1, NULL);
+       tree_init_node(&fs_prio->node, NULL, NULL);
        tree_add_node(&fs_prio->node, &ns->node);
        fs_prio->num_levels = num_levels;
        fs_prio->prio = prio;
@@ -1843,7 +2054,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
                return ERR_PTR(-ENOMEM);
 
        fs_init_namespace(ns);
-       tree_init_node(&ns->node, 1, NULL);
+       tree_init_node(&ns->node, NULL, NULL);
        tree_add_node(&ns->node, &prio->node);
        list_add_tail(&ns->node.list, &prio->node.children);
 
@@ -1968,7 +2179,7 @@ static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_flow_steering
        ns = &root_ns->ns;
        fs_init_namespace(ns);
        mutex_init(&root_ns->chain_lock);
-       tree_init_node(&ns->node, 1, NULL);
+       tree_init_node(&ns->node, NULL, NULL);
        tree_add_node(&ns->node, NULL);
 
        return root_ns;
@@ -2066,8 +2277,10 @@ static void clean_tree(struct fs_node *node)
                struct fs_node *iter;
                struct fs_node *temp;
 
+               tree_get_node(node);
                list_for_each_entry_safe(iter, temp, &node->children, list)
                        clean_tree(iter);
+               tree_put_node(node);
                tree_remove_node(node);
        }
 }
@@ -2091,6 +2304,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
        cleanup_root_ns(steering->sniffer_rx_root_ns);
        cleanup_root_ns(steering->sniffer_tx_root_ns);
        mlx5_cleanup_fc_stats(dev);
+       kmem_cache_destroy(steering->ftes_cache);
+       kmem_cache_destroy(steering->fgs_cache);
        kfree(steering);
 }
 
@@ -2196,6 +2411,16 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
        steering->dev = dev;
        dev->priv.steering = steering;
 
+       steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
+                                               sizeof(struct mlx5_flow_group), 0,
+                                               0, NULL);
+       steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0,
+                                                0, NULL);
+       if (!steering->ftes_cache || !steering->fgs_cache) {
+               err = -ENOMEM;
+               goto err;
+       }
+
        if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
              (MLX5_CAP_GEN(dev, nic_flow_table))) ||
             ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
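
One detail worth noting about the cache setup above: kmem_cache_destroy() ignores a NULL cache, so a single combined check after both kmem_cache_create() calls is safe and the cleanup path can destroy both caches unconditionally. A hypothetical helper showing the same pairing (not part of the patch):

    static int example_caches_init(struct mlx5_flow_steering *steering)
    {
            steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
                                                    sizeof(struct mlx5_flow_group),
                                                    0, 0, NULL);
            steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes",
                                                     sizeof(struct fs_fte),
                                                     0, 0, NULL);
            if (!steering->fgs_cache || !steering->ftes_cache) {
                    /* kmem_cache_destroy(NULL) is a no-op */
                    kmem_cache_destroy(steering->ftes_cache);
                    kmem_cache_destroy(steering->fgs_cache);
                    return -ENOMEM;
            }
            return 0;
    }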
index 48dd789..7a01277 100644
@@ -66,6 +66,8 @@ enum fs_fte_status {
 
 struct mlx5_flow_steering {
        struct mlx5_core_dev *dev;
+       struct kmem_cache               *fgs_cache;
+       struct kmem_cache               *ftes_cache;
        struct mlx5_flow_root_namespace *root_ns;
        struct mlx5_flow_root_namespace *fdb_root_ns;
        struct mlx5_flow_root_namespace *esw_egress_root_ns;
@@ -81,9 +83,12 @@ struct fs_node {
        struct fs_node          *parent;
        struct fs_node          *root;
        /* lock the node for writing and traversing */
-       struct mutex            lock;
+       struct rw_semaphore     lock;
        atomic_t                refcount;
-       void                    (*remove_func)(struct fs_node *);
+       bool                    active;
+       void                    (*del_hw_func)(struct fs_node *);
+       void                    (*del_sw_func)(struct fs_node *);
+       atomic_t                version;
 };
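
Three things change in struct fs_node: the per-node mutex becomes an rw_semaphore so lookups can proceed concurrently, remove_func is split into del_hw_func (issuing the FW/HW removal) and del_sw_func (freeing the software state), and a version counter is added to back the optimistic retry logic in fs_core.c. The read/write reference-taking helpers used throughout the patch presumably wrap the semaphore together with the refcount, roughly like this (hypothetical reconstruction, not copied from the source):

    static inline void nested_down_write_ref_node(struct fs_node *node,
                                                  unsigned int lock_class)
    {
            if (node) {
                    down_write_nested(&node->lock, lock_class);
                    atomic_inc(&node->refcount);
            }
    }

    static inline void up_write_ref_node(struct fs_node *node)
    {
            up_write(&node->lock);
            tree_put_node(node);
    }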
 
 struct mlx5_flow_rule {
@@ -120,7 +125,6 @@ struct mlx5_flow_table {
        /* FWD rules that point on this flow table */
        struct list_head                fwd_rules;
        u32                             flags;
-       struct ida                      fte_allocator;
        struct rhltable                 fgs_hash;
 };
 
@@ -200,6 +204,7 @@ struct mlx5_flow_group {
        struct mlx5_flow_group_mask     mask;
        u32                             start_index;
        u32                             max_ftes;
+       struct ida                      fte_allocator;
        u32                             id;
        struct rhashtable               ftes_hash;
        struct rhlist_head              hash;
index 5cd4df0..321988a 100644
@@ -53,6 +53,7 @@
 #include <linux/notifier.h>
 #include <linux/dcbnl.h>
 #include <linux/inetdevice.h>
+#include <linux/netlink.h>
 #include <net/switchdev.h>
 #include <net/pkt_cls.h>
 #include <net/tc_act/tc_mirred.h>
@@ -4298,7 +4299,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev,
                        if (info->linking)
                                err = mlxsw_sp_port_bridge_join(mlxsw_sp_port,
                                                                lower_dev,
-                                                               upper_dev);
+                                                               upper_dev,
+                                                               extack);
                        else
                                mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
                                                           lower_dev,
@@ -4389,18 +4391,25 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
 {
        struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
        struct netdev_notifier_changeupper_info *info = ptr;
+       struct netlink_ext_ack *extack;
        struct net_device *upper_dev;
        int err = 0;
 
+       extack = netdev_notifier_info_to_extack(&info->info);
+
        switch (event) {
        case NETDEV_PRECHANGEUPPER:
                upper_dev = info->upper_dev;
-               if (!netif_is_bridge_master(upper_dev))
+               if (!netif_is_bridge_master(upper_dev)) {
+                       NL_SET_ERR_MSG(extack, "spectrum: VLAN devices only support bridge and VRF uppers");
                        return -EINVAL;
+               }
                if (!info->linking)
                        break;
-               if (netdev_has_any_upper_dev(upper_dev))
+               if (netdev_has_any_upper_dev(upper_dev)) {
+                       NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported");
                        return -EINVAL;
+               }
                break;
        case NETDEV_CHANGEUPPER:
                upper_dev = info->upper_dev;
@@ -4408,7 +4417,8 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev,
                        if (info->linking)
                                err = mlxsw_sp_port_bridge_join(mlxsw_sp_port,
                                                                vlan_dev,
-                                                               upper_dev);
+                                                               upper_dev,
+                                                               extack);
                        else
                                mlxsw_sp_port_bridge_leave(mlxsw_sp_port,
                                                           vlan_dev,
index ae67e60..8e45183 100644
@@ -326,7 +326,8 @@ void
 mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan);
 int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
                              struct net_device *brport_dev,
-                             struct net_device *br_dev);
+                             struct net_device *br_dev,
+                             struct netlink_ext_ack *extack);
 void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port,
                                struct net_device *brport_dev,
                                struct net_device *br_dev);
index e0f8ea4..6a356f4 100644
@@ -3640,20 +3640,6 @@ static int mlxsw_sp_fib_lpm_tree_link(struct mlxsw_sp *mlxsw_sp,
 static void mlxsw_sp_fib_lpm_tree_unlink(struct mlxsw_sp *mlxsw_sp,
                                         struct mlxsw_sp_fib *fib)
 {
-       struct mlxsw_sp_prefix_usage req_prefix_usage = {{ 0 } };
-       struct mlxsw_sp_lpm_tree *lpm_tree;
-
-       /* Aggregate prefix lengths across all virtual routers to make
-        * sure we only have used prefix lengths in the LPM tree.
-        */
-       mlxsw_sp_vrs_prefixes(mlxsw_sp, fib->proto, &req_prefix_usage);
-       lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
-                                        fib->proto);
-       if (IS_ERR(lpm_tree))
-               goto err_tree_get;
-       mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree);
-
-err_tree_get:
        if (!mlxsw_sp_prefix_usage_none(&fib->prefix_usage))
                return;
        mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, fib);
@@ -5957,7 +5943,7 @@ static int mlxsw_sp_rif_vlan_fid_op(struct mlxsw_sp_rif *rif,
        return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl);
 }
 
-static u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp)
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp)
 {
        return mlxsw_core_max_ports(mlxsw_sp->core) + 1;
 }
index 3d44918..3f2d840 100644
@@ -70,6 +70,7 @@ u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif);
 u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif);
 u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif);
 int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif);
+u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp);
 const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif);
 int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp,
                                   struct mlxsw_sp_rif *rif,
index 0f9eac5..7b8548e 100644
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
 #include <linux/rtnetlink.h>
+#include <linux/netlink.h>
 #include <net/switchdev.h>
 
+#include "spectrum_router.h"
 #include "spectrum.h"
 #include "core.h"
 #include "reg.h"
@@ -78,7 +80,8 @@ struct mlxsw_sp_bridge_device {
        struct list_head ports_list;
        struct list_head mids_list;
        u8 vlan_enabled:1,
-          multicast_enabled:1;
+          multicast_enabled:1,
+          mrouter:1;
        const struct mlxsw_sp_bridge_ops *ops;
 };
 
@@ -107,7 +110,8 @@ struct mlxsw_sp_bridge_vlan {
 struct mlxsw_sp_bridge_ops {
        int (*port_join)(struct mlxsw_sp_bridge_device *bridge_device,
                         struct mlxsw_sp_bridge_port *bridge_port,
-                        struct mlxsw_sp_port *mlxsw_sp_port);
+                        struct mlxsw_sp_port *mlxsw_sp_port,
+                        struct netlink_ext_ack *extack);
        void (*port_leave)(struct mlxsw_sp_bridge_device *bridge_device,
                           struct mlxsw_sp_bridge_port *bridge_port,
                           struct mlxsw_sp_port *mlxsw_sp_port);
@@ -168,6 +172,7 @@ mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge,
        bridge_device->dev = br_dev;
        bridge_device->vlan_enabled = vlan_enabled;
        bridge_device->multicast_enabled = br_multicast_enabled(br_dev);
+       bridge_device->mrouter = br_multicast_router(br_dev);
        INIT_LIST_HEAD(&bridge_device->ports_list);
        if (vlan_enabled) {
                bridge->vlan_enabled_exists = true;
@@ -810,6 +815,60 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port,
        return 0;
 }
 
+static int mlxsw_sp_smid_router_port_set(struct mlxsw_sp *mlxsw_sp,
+                                        u16 mid_idx, bool add)
+{
+       char *smid_pl;
+       int err;
+
+       smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL);
+       if (!smid_pl)
+               return -ENOMEM;
+
+       mlxsw_reg_smid_pack(smid_pl, mid_idx,
+                           mlxsw_sp_router_port(mlxsw_sp), add);
+       err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
+       kfree(smid_pl);
+       return err;
+}
+
+static void
+mlxsw_sp_bridge_mrouter_update_mdb(struct mlxsw_sp *mlxsw_sp,
+                                  struct mlxsw_sp_bridge_device *bridge_device,
+                                  bool add)
+{
+       struct mlxsw_sp_mid *mid;
+
+       list_for_each_entry(mid, &bridge_device->mids_list, list)
+               mlxsw_sp_smid_router_port_set(mlxsw_sp, mid->mid, add);
+}
+
+static int
+mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port,
+                                 struct switchdev_trans *trans,
+                                 struct net_device *orig_dev,
+                                 bool is_mrouter)
+{
+       struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
+       struct mlxsw_sp_bridge_device *bridge_device;
+
+       if (switchdev_trans_ph_prepare(trans))
+               return 0;
+
+       /* It's possible we failed to enslave the port, yet this
+        * operation is still executed because it was deferred.
+        */
+       bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev);
+       if (!bridge_device)
+               return 0;
+
+       if (bridge_device->mrouter != is_mrouter)
+               mlxsw_sp_bridge_mrouter_update_mdb(mlxsw_sp, bridge_device,
+                                                  is_mrouter);
+       bridge_device->mrouter = is_mrouter;
+       return 0;
+}
+
 static int mlxsw_sp_port_attr_set(struct net_device *dev,
                                  const struct switchdev_attr *attr,
                                  struct switchdev_trans *trans)
@@ -847,6 +906,11 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev,
                                                    attr->orig_dev,
                                                    attr->u.mc_disabled);
                break;
+       case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER:
+               err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, trans,
+                                                       attr->orig_dev,
+                                                       attr->u.mrouter);
+               break;
        default:
                err = -EOPNOTSUPP;
                break;
@@ -1241,7 +1305,8 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr,
 }
 
 static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
-                                        long *ports_bitmap)
+                                        long *ports_bitmap,
+                                        bool set_router_port)
 {
        char *smid_pl;
        int err, i;
@@ -1256,9 +1321,15 @@ static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx,
                        mlxsw_reg_smid_port_mask_set(smid_pl, i, 1);
        }
 
+       mlxsw_reg_smid_port_mask_set(smid_pl,
+                                    mlxsw_sp_router_port(mlxsw_sp), 1);
+
        for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core))
                mlxsw_reg_smid_port_set(smid_pl, i, 1);
 
+       mlxsw_reg_smid_port_set(smid_pl, mlxsw_sp_router_port(mlxsw_sp),
+                               set_router_port);
+
        err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl);
        kfree(smid_pl);
        return err;
@@ -1362,7 +1433,8 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp,
        mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp);
 
        mid->mid = mid_idx;
-       err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap);
+       err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap,
+                                           bridge_device->mrouter);
        kfree(flood_bitmap);
        if (err)
                return false;
@@ -1735,12 +1807,15 @@ static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = {
 static int
 mlxsw_sp_bridge_8021q_port_join(struct mlxsw_sp_bridge_device *bridge_device,
                                struct mlxsw_sp_bridge_port *bridge_port,
-                               struct mlxsw_sp_port *mlxsw_sp_port)
+                               struct mlxsw_sp_port *mlxsw_sp_port,
+                               struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
 
-       if (is_vlan_dev(bridge_port->dev))
+       if (is_vlan_dev(bridge_port->dev)) {
+               NL_SET_ERR_MSG(extack, "spectrum: Can not enslave a VLAN device to a VLAN-aware bridge");
                return -EINVAL;
+       }
 
        mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1);
        if (WARN_ON(!mlxsw_sp_port_vlan))
@@ -1797,13 +1872,16 @@ mlxsw_sp_port_is_br_member(const struct mlxsw_sp_port *mlxsw_sp_port,
 static int
 mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device,
                                struct mlxsw_sp_bridge_port *bridge_port,
-                               struct mlxsw_sp_port *mlxsw_sp_port)
+                               struct mlxsw_sp_port *mlxsw_sp_port,
+                               struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
        u16 vid;
 
-       if (!is_vlan_dev(bridge_port->dev))
+       if (!is_vlan_dev(bridge_port->dev)) {
+               NL_SET_ERR_MSG(extack, "spectrum: Only VLAN devices can be enslaved to a VLAN-unaware bridge");
                return -EINVAL;
+       }
        vid = vlan_dev_vlan_id(bridge_port->dev);
 
        mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid);
@@ -1811,7 +1889,7 @@ mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device,
                return -EINVAL;
 
        if (mlxsw_sp_port_is_br_member(mlxsw_sp_port, bridge_device->dev)) {
-               netdev_err(mlxsw_sp_port->dev, "Can't bridge VLAN uppers of the same port\n");
+               NL_SET_ERR_MSG(extack, "spectrum: Can not bridge VLAN uppers of the same port");
                return -EINVAL;
        }
 
@@ -1854,7 +1932,8 @@ static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021d_ops = {
 
 int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
                              struct net_device *brport_dev,
-                             struct net_device *br_dev)
+                             struct net_device *br_dev,
+                             struct netlink_ext_ack *extack)
 {
        struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
        struct mlxsw_sp_bridge_device *bridge_device;
@@ -1867,7 +1946,7 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port,
        bridge_device = bridge_port->bridge_device;
 
        err = bridge_device->ops->port_join(bridge_device, bridge_port,
-                                           mlxsw_sp_port);
+                                           mlxsw_sp_port, extack);
        if (err)
                goto err_port_join;
 
index becaacf..bd3b2bd 100644
@@ -14,6 +14,7 @@ nfp-objs := \
            nfpcore/nfp_resource.o \
            nfpcore/nfp_rtsym.o \
            nfpcore/nfp_target.o \
+           nfp_asm.o \
            nfp_app.o \
            nfp_app_nic.o \
            nfp_devlink.o \
index 239dfbe..13148f3 100644
@@ -110,150 +110,7 @@ nfp_prog_offset_to_index(struct nfp_prog *nfp_prog, unsigned int offset)
        return offset - nfp_prog->start_off;
 }
 
-/* --- SW reg --- */
-struct nfp_insn_ur_regs {
-       enum alu_dst_ab dst_ab;
-       u16 dst;
-       u16 areg, breg;
-       bool swap;
-       bool wr_both;
-};
-
-struct nfp_insn_re_regs {
-       enum alu_dst_ab dst_ab;
-       u8 dst;
-       u8 areg, breg;
-       bool swap;
-       bool wr_both;
-       bool i8;
-};
-
-static u16 nfp_swreg_to_unreg(u32 swreg, bool is_dst)
-{
-       u16 val = FIELD_GET(NN_REG_VAL, swreg);
-
-       switch (FIELD_GET(NN_REG_TYPE, swreg)) {
-       case NN_REG_GPR_A:
-       case NN_REG_GPR_B:
-       case NN_REG_GPR_BOTH:
-               return val;
-       case NN_REG_NNR:
-               return UR_REG_NN | val;
-       case NN_REG_XFER:
-               return UR_REG_XFR | val;
-       case NN_REG_IMM:
-               if (val & ~0xff) {
-                       pr_err("immediate too large\n");
-                       return 0;
-               }
-               return UR_REG_IMM_encode(val);
-       case NN_REG_NONE:
-               return is_dst ? UR_REG_NO_DST : REG_NONE;
-       default:
-               pr_err("unrecognized reg encoding %08x\n", swreg);
-               return 0;
-       }
-}
-
-static int
-swreg_to_unrestricted(u32 dst, u32 lreg, u32 rreg, struct nfp_insn_ur_regs *reg)
-{
-       memset(reg, 0, sizeof(*reg));
-
-       /* Decode destination */
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM)
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_B)
-               reg->dst_ab = ALU_DST_B;
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_BOTH)
-               reg->wr_both = true;
-       reg->dst = nfp_swreg_to_unreg(dst, true);
-
-       /* Decode source operands */
-       if (FIELD_GET(NN_REG_TYPE, lreg) == FIELD_GET(NN_REG_TYPE, rreg))
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, lreg) == NN_REG_GPR_B ||
-           FIELD_GET(NN_REG_TYPE, rreg) == NN_REG_GPR_A) {
-               reg->areg = nfp_swreg_to_unreg(rreg, false);
-               reg->breg = nfp_swreg_to_unreg(lreg, false);
-               reg->swap = true;
-       } else {
-               reg->areg = nfp_swreg_to_unreg(lreg, false);
-               reg->breg = nfp_swreg_to_unreg(rreg, false);
-       }
-
-       return 0;
-}
-
-static u16 nfp_swreg_to_rereg(u32 swreg, bool is_dst, bool has_imm8, bool *i8)
-{
-       u16 val = FIELD_GET(NN_REG_VAL, swreg);
-
-       switch (FIELD_GET(NN_REG_TYPE, swreg)) {
-       case NN_REG_GPR_A:
-       case NN_REG_GPR_B:
-       case NN_REG_GPR_BOTH:
-               return val;
-       case NN_REG_XFER:
-               return RE_REG_XFR | val;
-       case NN_REG_IMM:
-               if (val & ~(0x7f | has_imm8 << 7)) {
-                       pr_err("immediate too large\n");
-                       return 0;
-               }
-               *i8 = val & 0x80;
-               return RE_REG_IMM_encode(val & 0x7f);
-       case NN_REG_NONE:
-               return is_dst ? RE_REG_NO_DST : REG_NONE;
-       default:
-               pr_err("unrecognized reg encoding\n");
-               return 0;
-       }
-}
-
-static int
-swreg_to_restricted(u32 dst, u32 lreg, u32 rreg, struct nfp_insn_re_regs *reg,
-                   bool has_imm8)
-{
-       memset(reg, 0, sizeof(*reg));
-
-       /* Decode destination */
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM)
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_B)
-               reg->dst_ab = ALU_DST_B;
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_BOTH)
-               reg->wr_both = true;
-       reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);
-
-       /* Decode source operands */
-       if (FIELD_GET(NN_REG_TYPE, lreg) == FIELD_GET(NN_REG_TYPE, rreg))
-               return -EFAULT;
-
-       if (FIELD_GET(NN_REG_TYPE, lreg) == NN_REG_GPR_B ||
-           FIELD_GET(NN_REG_TYPE, rreg) == NN_REG_GPR_A) {
-               reg->areg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
-               reg->breg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
-               reg->swap = true;
-       } else {
-               reg->areg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
-               reg->breg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
-       }
-
-       return 0;
-}
-
 /* --- Emitters --- */
-static const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
-       [CMD_TGT_WRITE8] =              { 0x00, 0x42 },
-       [CMD_TGT_READ8] =               { 0x01, 0x43 },
-       [CMD_TGT_READ_LE] =             { 0x01, 0x40 },
-       [CMD_TGT_READ_SWAP_LE] =        { 0x03, 0x40 },
-};
-
 static void
 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
           u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync)
@@ -281,7 +138,7 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
 
 static void
 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-        u8 mode, u8 xfer, u32 lreg, u32 rreg, u8 size, bool sync)
+        u8 mode, u8 xfer, swreg lreg, swreg rreg, u8 size, bool sync)
 {
        struct nfp_insn_re_regs reg;
        int err;
@@ -296,6 +153,11 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
                nfp_prog->error = -EFAULT;
                return;
        }
+       if (reg.dst_lmextn || reg.src_lmextn) {
+               pr_err("cmd can't use LMextn\n");
+               nfp_prog->error = -EFAULT;
+               return;
+       }
 
        __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync);
 }
@@ -341,7 +203,7 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
 
 static void
 __emit_br_byte(struct nfp_prog *nfp_prog, u8 areg, u8 breg, bool imm8,
-              u8 byte, bool equal, u16 addr, u8 defer)
+              u8 byte, bool equal, u16 addr, u8 defer, bool src_lmextn)
 {
        u16 addr_lo, addr_hi;
        u64 insn;
@@ -357,32 +219,34 @@ __emit_br_byte(struct nfp_prog *nfp_prog, u8 areg, u8 breg, bool imm8,
                FIELD_PREP(OP_BB_EQ, equal) |
                FIELD_PREP(OP_BB_DEFBR, defer) |
                FIELD_PREP(OP_BB_ADDR_LO, addr_lo) |
-               FIELD_PREP(OP_BB_ADDR_HI, addr_hi);
+               FIELD_PREP(OP_BB_ADDR_HI, addr_hi) |
+               FIELD_PREP(OP_BB_SRC_LMEXTN, src_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
 emit_br_byte_neq(struct nfp_prog *nfp_prog,
-                u32 dst, u8 imm, u8 byte, u16 addr, u8 defer)
+                swreg src, u8 imm, u8 byte, u16 addr, u8 defer)
 {
        struct nfp_insn_re_regs reg;
        int err;
 
-       err = swreg_to_restricted(reg_none(), dst, reg_imm(imm), &reg, true);
+       err = swreg_to_restricted(reg_none(), src, reg_imm(imm), &reg, true);
        if (err) {
                nfp_prog->error = err;
                return;
        }
 
        __emit_br_byte(nfp_prog, reg.areg, reg.breg, reg.i8, byte, false, addr,
-                      defer);
+                      defer, reg.src_lmextn);
 }
 
 static void
 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
             enum immed_width width, bool invert,
-            enum immed_shift shift, bool wr_both)
+            enum immed_shift shift, bool wr_both,
+            bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -393,19 +257,21 @@ __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
                FIELD_PREP(OP_IMMED_WIDTH, width) |
                FIELD_PREP(OP_IMMED_INV, invert) |
                FIELD_PREP(OP_IMMED_SHIFT, shift) |
-               FIELD_PREP(OP_IMMED_WR_AB, wr_both);
+               FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
+               FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_immed(struct nfp_prog *nfp_prog, u32 dst, u16 imm,
+emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
           enum immed_width width, bool invert, enum immed_shift shift)
 {
        struct nfp_insn_ur_regs reg;
        int err;
 
-       if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM) {
+       if (swreg_type(dst) == NN_REG_IMM) {
                nfp_prog->error = -EFAULT;
                return;
        }
@@ -417,13 +283,15 @@ emit_immed(struct nfp_prog *nfp_prog, u32 dst, u16 imm,
        }
 
        __emit_immed(nfp_prog, reg.areg, reg.breg, imm >> 8, width,
-                    invert, shift, reg.wr_both);
+                    invert, shift, reg.wr_both,
+                    reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
           enum shf_sc sc, u8 shift,
-          u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both)
+          u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
+          bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -445,14 +313,16 @@ __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
                FIELD_PREP(OP_SHF_SHIFT, shift) |
                FIELD_PREP(OP_SHF_OP, op) |
                FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
-               FIELD_PREP(OP_SHF_WR_AB, wr_both);
+               FIELD_PREP(OP_SHF_WR_AB, wr_both) |
+               FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_shf(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum shf_op op, u32 rreg,
-        enum shf_sc sc, u8 shift)
+emit_shf(struct nfp_prog *nfp_prog, swreg dst,
+        swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
 {
        struct nfp_insn_re_regs reg;
        int err;
@@ -464,12 +334,14 @@ emit_shf(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum shf_op op, u32 rreg,
        }
 
        __emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
-                  reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both);
+                  reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
+                  reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
-          u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both)
+          u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
+          bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -480,13 +352,16 @@ __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
                FIELD_PREP(OP_ALU_SW, swap) |
                FIELD_PREP(OP_ALU_OP, op) |
                FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
-               FIELD_PREP(OP_ALU_WR_AB, wr_both);
+               FIELD_PREP(OP_ALU_WR_AB, wr_both) |
+               FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
-emit_alu(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum alu_op op, u32 rreg)
+emit_alu(struct nfp_prog *nfp_prog, swreg dst,
+        swreg lreg, enum alu_op op, swreg rreg)
 {
        struct nfp_insn_ur_regs reg;
        int err;
@@ -498,13 +373,15 @@ emit_alu(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum alu_op op, u32 rreg)
        }
 
        __emit_alu(nfp_prog, reg.dst, reg.dst_ab,
-                  reg.areg, op, reg.breg, reg.swap, reg.wr_both);
+                  reg.areg, op, reg.breg, reg.swap, reg.wr_both,
+                  reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
                u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
-               bool zero, bool swap, bool wr_both)
+               bool zero, bool swap, bool wr_both,
+               bool dst_lmextn, bool src_lmextn)
 {
        u64 insn;
 
@@ -517,35 +394,44 @@ __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
                FIELD_PREP(OP_LDF_ZF, zero) |
                FIELD_PREP(OP_LDF_BMASK, bmask) |
                FIELD_PREP(OP_LDF_SHF, shift) |
-               FIELD_PREP(OP_LDF_WR_AB, wr_both);
+               FIELD_PREP(OP_LDF_WR_AB, wr_both) |
+               FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
+               FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
 
        nfp_prog_push(nfp_prog, insn);
 }
 
 static void
 emit_ld_field_any(struct nfp_prog *nfp_prog, enum shf_sc sc, u8 shift,
-                 u32 dst, u8 bmask, u32 src, bool zero)
+                 swreg dst, u8 bmask, swreg src, bool zero)
 {
        struct nfp_insn_re_regs reg;
        int err;
 
-       err = swreg_to_restricted(reg_none(), dst, src, &reg, true);
+       /* Note: ld_field is special as it uses one of the src regs as dst */
+       err = swreg_to_restricted(dst, dst, src, &reg, true);
        if (err) {
                nfp_prog->error = err;
                return;
        }
 
        __emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
-                       reg.i8, zero, reg.swap, reg.wr_both);
+                       reg.i8, zero, reg.swap, reg.wr_both,
+                       reg.dst_lmextn, reg.src_lmextn);
 }
 
 static void
-emit_ld_field(struct nfp_prog *nfp_prog, u32 dst, u8 bmask, u32 src,
+emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
              enum shf_sc sc, u8 shift)
 {
        emit_ld_field_any(nfp_prog, sc, shift, dst, bmask, src, false);
 }
 
+static void emit_nop(struct nfp_prog *nfp_prog)
+{
+       __emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
+}
+
 /* --- Wrappers --- */
 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
 {
@@ -565,7 +451,7 @@ static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
        return true;
 }
 
-static void wrp_immed(struct nfp_prog *nfp_prog, u32 dst, u32 imm)
+static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
 {
        enum immed_shift shift;
        u16 val;
@@ -586,7 +472,7 @@ static void wrp_immed(struct nfp_prog *nfp_prog, u32 dst, u32 imm)
  * If the @imm is small enough encode it directly in operand and return
  * otherwise load @imm to a spare register and return its encoding.
  */
-static u32 ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg)
+static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
 {
        if (FIELD_FIT(UR_REG_IMM_MAX, imm))
                return reg_imm(imm);
@@ -599,7 +485,7 @@ static u32 ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg)
  * If the @imm is small enough encode it directly in operand and return
  * otherwise load @imm to a spare register and return its encoding.
  */
-static u32 re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg)
+static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
 {
        if (FIELD_FIT(RE_REG_IMM_MAX, imm))
                return reg_imm(imm);
@@ -629,7 +515,7 @@ construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset,
 {
        unsigned int i;
        u16 shift, sz;
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        /* We load the value from the address indicated in @offset and then
         * shift out the data we don't need.  Note: this is big endian!
@@ -646,22 +532,22 @@ construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset,
                emit_alu(nfp_prog, imm_a(nfp_prog),
                         imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
                emit_alu(nfp_prog, reg_none(),
-                        NFP_BPF_ABI_LEN, ALU_OP_SUB, imm_a(nfp_prog));
+                        plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
                wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT);
                /* Load data */
                emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
-                        pkt_reg(nfp_prog), imm_b(nfp_prog), sz - 1, true);
+                        pptr_reg(nfp_prog), imm_b(nfp_prog), sz - 1, true);
        } else {
                /* Check packet length */
                tmp_reg = ur_load_imm_any(nfp_prog, offset + size,
                                          imm_a(nfp_prog));
                emit_alu(nfp_prog, reg_none(),
-                        NFP_BPF_ABI_LEN, ALU_OP_SUB, tmp_reg);
+                        plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
                wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT);
                /* Load data */
                tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
                emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
-                        pkt_reg(nfp_prog), tmp_reg, sz - 1, true);
+                        pptr_reg(nfp_prog), tmp_reg, sz - 1, true);
        }
 
        i = 0;
@@ -684,20 +570,10 @@ static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size)
        return construct_data_ind_ld(nfp_prog, offset, 0, false, size);
 }
 
-static int wrp_set_mark(struct nfp_prog *nfp_prog, u8 src)
-{
-       emit_alu(nfp_prog, NFP_BPF_ABI_MARK,
-                reg_none(), ALU_OP_NONE, reg_b(src));
-       emit_alu(nfp_prog, NFP_BPF_ABI_FLAGS,
-                NFP_BPF_ABI_FLAGS, ALU_OP_OR, reg_imm(NFP_BPF_ABI_FLAG_MARK));
-
-       return 0;
-}
-
 static void
 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
 {
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (alu_op == ALU_OP_AND) {
                if (!imm)
@@ -815,7 +691,7 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
        u8 reg = insn->dst_reg * 2;
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -967,12 +843,24 @@ static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
-
-       if (insn->imm != 32)
-               return 1; /* TODO */
-
-       wrp_reg_mov(nfp_prog, insn->dst_reg * 2 + 1, insn->dst_reg * 2);
-       wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), 0);
+       u8 dst = insn->dst_reg * 2;
+
+       if (insn->imm < 32) {
+               emit_shf(nfp_prog, reg_both(dst + 1),
+                        reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_R_DSHF, 32 - insn->imm);
+               emit_shf(nfp_prog, reg_both(dst),
+                        reg_none(), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_L_SHF, insn->imm);
+       } else if (insn->imm == 32) {
+               wrp_reg_mov(nfp_prog, dst + 1, dst);
+               wrp_immed(nfp_prog, reg_both(dst), 0);
+       } else if (insn->imm > 32) {
+               emit_shf(nfp_prog, reg_both(dst + 1),
+                        reg_none(), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_L_SHF, insn->imm - 32);
+               wrp_immed(nfp_prog, reg_both(dst), 0);
+       }
 
        return 0;
 }
@@ -980,12 +868,24 @@ static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
-
-       if (insn->imm != 32)
-               return 1; /* TODO */
-
-       wrp_reg_mov(nfp_prog, insn->dst_reg * 2, insn->dst_reg * 2 + 1);
-       wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
+       u8 dst = insn->dst_reg * 2;
+
+       if (insn->imm < 32) {
+               emit_shf(nfp_prog, reg_both(dst),
+                        reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
+                        SHF_SC_R_DSHF, insn->imm);
+               emit_shf(nfp_prog, reg_both(dst + 1),
+                        reg_none(), SHF_OP_NONE, reg_b(dst + 1),
+                        SHF_SC_R_SHF, insn->imm);
+       } else if (insn->imm == 32) {
+               wrp_reg_mov(nfp_prog, dst, dst + 1);
+               wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+       } else if (insn->imm > 32) {
+               emit_shf(nfp_prog, reg_both(dst),
+                        reg_none(), SHF_OP_NONE, reg_b(dst + 1),
+                        SHF_SC_R_SHF, insn->imm - 32);
+               wrp_immed(nfp_prog, reg_both(dst + 1), 0);
+       }
 
        return 0;
 }
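
Both shift handlers above split the 64-bit BPF register into a low/high pair of 32-bit NFP registers and use the double-shift (R_DSHF) source class to move bits across the word boundary. For 0 < imm < 32 the emitted sequences compute the equivalent of the following C (illustrative only):

    #include <stdint.h>

    static inline void shl64(uint32_t *lo, uint32_t *hi, unsigned int imm)
    {
            /* double shift: the high word takes the bits shifted out of the low word */
            *hi = (*hi << imm) | (*lo >> (32 - imm));
            *lo <<= imm;
    }

    static inline void shr64(uint32_t *lo, uint32_t *hi, unsigned int imm)
    {
            /* double shift: the low word takes the bits shifted out of the high word */
            *lo = (*lo >> imm) | (*hi << (32 - imm));
            *hi >>= imm;
    }

For imm == 32 the words are simply moved, and for imm > 32 a single shift by imm - 32 lands in the destination word while the other word is zeroed.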
@@ -1130,7 +1030,7 @@ static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        if (meta->insn.off == offsetof(struct sk_buff, len))
                emit_alu(nfp_prog, reg_both(meta->insn.dst_reg * 2),
-                        reg_none(), ALU_OP_NONE, NFP_BPF_ABI_LEN);
+                        reg_none(), ALU_OP_NONE, plen_reg(nfp_prog));
        else
                return -EOPNOTSUPP;
 
@@ -1139,18 +1039,18 @@ static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 static int mem_ldx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-       u32 dst = reg_both(meta->insn.dst_reg * 2);
+       swreg dst = reg_both(meta->insn.dst_reg * 2);
 
        if (meta->insn.off != offsetof(struct xdp_md, data) &&
            meta->insn.off != offsetof(struct xdp_md, data_end))
                return -EOPNOTSUPP;
 
-       emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
+       emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, pptr_reg(nfp_prog));
 
        if (meta->insn.off == offsetof(struct xdp_md, data))
                return 0;
 
-       emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, NFP_BPF_ABI_LEN);
+       emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, plen_reg(nfp_prog));
 
        return 0;
 }
@@ -1171,9 +1071,6 @@ static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 static int mem_stx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
-       if (meta->insn.off == offsetof(struct sk_buff, mark))
-               return wrp_set_mark(nfp_prog, meta->insn.src_reg * 2);
-
        return -EOPNOTSUPP;
 }
 
@@ -1202,8 +1099,10 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
-       u32 or1 = reg_a(insn->dst_reg * 2), or2 = reg_b(insn->dst_reg * 2 + 1);
-       u32 tmp_reg;
+       swreg or1, or2, tmp_reg;
+
+       or1 = reg_a(insn->dst_reg * 2);
+       or2 = reg_b(insn->dst_reg * 2 + 1);
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -1252,7 +1151,7 @@ static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -1283,7 +1182,7 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
        const struct bpf_insn *insn = &meta->insn;
        u64 imm = insn->imm; /* sign extend */
-       u32 tmp_reg;
+       swreg tmp_reg;
 
        if (insn->off < 0) /* TODO */
                return -EOPNOTSUPP;
@@ -1510,8 +1409,9 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
 
 static void nfp_intro(struct nfp_prog *nfp_prog)
 {
-       emit_alu(nfp_prog, pkt_reg(nfp_prog),
-                reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
+       wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
+       emit_alu(nfp_prog, plen_reg(nfp_prog),
+                plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
 }
 
 static void nfp_outro_tc_legacy(struct nfp_prog *nfp_prog)
@@ -1656,7 +1556,7 @@ static void nfp_outro(struct nfp_prog *nfp_prog)
 static int nfp_translate(struct nfp_prog *nfp_prog)
 {
        struct nfp_insn_meta *meta;
-       int err;
+       int i, err;
 
        nfp_intro(nfp_prog);
        if (nfp_prog->error)
@@ -1688,6 +1588,11 @@ static int nfp_translate(struct nfp_prog *nfp_prog)
        if (nfp_prog->error)
                return nfp_prog->error;
 
+       for (i = 0; i < NFP_USTORE_PREFETCH_WINDOW; i++)
+               emit_nop(nfp_prog);
+       if (nfp_prog->error)
+               return nfp_prog->error;
+
        return nfp_fixup_branches(nfp_prog);
 }
 
@@ -1737,38 +1642,6 @@ static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
        }
 }
 
-/* Try to rename registers so that program uses only low ones */
-static int nfp_bpf_opt_reg_rename(struct nfp_prog *nfp_prog)
-{
-       bool reg_used[MAX_BPF_REG] = {};
-       u8 tgt_reg[MAX_BPF_REG] = {};
-       struct nfp_insn_meta *meta;
-       unsigned int i, j;
-
-       list_for_each_entry(meta, &nfp_prog->insns, l) {
-               if (meta->skip)
-                       continue;
-
-               reg_used[meta->insn.src_reg] = true;
-               reg_used[meta->insn.dst_reg] = true;
-       }
-
-       for (i = 0, j = 0; i < ARRAY_SIZE(tgt_reg); i++) {
-               if (!reg_used[i])
-                       continue;
-
-               tgt_reg[i] = j++;
-       }
-       nfp_prog->num_regs = j;
-
-       list_for_each_entry(meta, &nfp_prog->insns, l) {
-               meta->insn.src_reg = tgt_reg[meta->insn.src_reg];
-               meta->insn.dst_reg = tgt_reg[meta->insn.dst_reg];
-       }
-
-       return 0;
-}
-
 /* Remove masking after load since our load guarantees this is not needed */
 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
 {
@@ -1845,20 +1718,33 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
 
 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
-       int ret;
-
        nfp_bpf_opt_reg_init(nfp_prog);
 
-       ret = nfp_bpf_opt_reg_rename(nfp_prog);
-       if (ret)
-               return ret;
-
        nfp_bpf_opt_ld_mask(nfp_prog);
        nfp_bpf_opt_ld_shift(nfp_prog);
 
        return 0;
 }
 
+static int nfp_bpf_ustore_calc(struct nfp_prog *nfp_prog, __le64 *ustore)
+{
+       int i;
+
+       for (i = 0; i < nfp_prog->prog_len; i++) {
+               int err;
+
+               err = nfp_ustore_check_valid_no_ecc(nfp_prog->prog[i]);
+               if (err)
+                       return err;
+
+               nfp_prog->prog[i] = nfp_ustore_calc_ecc_insn(nfp_prog->prog[i]);
+
+               ustore[i] = cpu_to_le64(nfp_prog->prog[i]);
+       }
+
+       return 0;
+}
+
 /**
  * nfp_bpf_jit() - translate BPF code into NFP assembly
  * @filter:    kernel BPF filter struct
@@ -1899,10 +1785,8 @@ nfp_bpf_jit(struct bpf_prog *filter, void *prog_mem,
        if (ret)
                goto out;
 
-       if (nfp_prog->num_regs <= 7)
-               nfp_prog->regs_per_thread = 16;
-       else
-               nfp_prog->regs_per_thread = 32;
+       nfp_prog->num_regs = MAX_BPF_REG;
+       nfp_prog->regs_per_thread = 32;
 
        nfp_prog->prog = prog_mem;
        nfp_prog->__prog_alloc_len = prog_sz;
@@ -1912,10 +1796,13 @@ nfp_bpf_jit(struct bpf_prog *filter, void *prog_mem,
                pr_err("Translation failed with error %d (translated: %u)\n",
                       ret, nfp_prog->n_translated);
                ret = -EINVAL;
+               goto out;
        }
 
+       ret = nfp_bpf_ustore_calc(nfp_prog, (__force __le64 *)prog_mem);
+
        res->n_instr = nfp_prog->prog_len;
-       res->dense_mode = nfp_prog->num_regs <= 7;
+       res->dense_mode = false;
 out:
        nfp_prog_free(nfp_prog);
 
index be2cf10..0747269 100644 (file)
@@ -89,14 +89,6 @@ nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
        struct nfp_net_bpf_priv *priv;
        int ret;
 
-       /* Limit to single port, otherwise it's just a NIC */
-       if (id > 0) {
-               nfp_warn(app->cpp,
-                        "BPF NIC doesn't support more than one port right now\n");
-               nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev);
-               return PTR_ERR_OR_ZERO(nn->port);
-       }
-
        priv = kmalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;
index 4051e94..b7a112a 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/list.h>
 #include <linux/types.h>
 
+#include "../nfp_asm.h"
 #include "../nfp_net.h"
 
 /* For branch fixup logic use up-most byte of branch instruction as scratch
@@ -53,9 +54,13 @@ enum br_special {
 };
 
 enum static_regs {
-       STATIC_REG_PKT          = 1,
-#define REG_PKT_BANK   ALU_DST_A
-       STATIC_REG_IMM          = 2, /* Bank AB */
+       STATIC_REG_IMM          = 21, /* Bank AB */
+       STATIC_REG_PKT_LEN      = 22, /* Bank B */
+};
+
+enum pkt_vec {
+       PKT_VEC_PKT_LEN         = 0,
+       PKT_VEC_PKT_PTR         = 2,
 };
 
 enum nfp_bpf_action_type {
@@ -65,39 +70,17 @@ enum nfp_bpf_action_type {
        NN_ACT_XDP,
 };
 
-/* Software register representation, hardware encoding in asm.h */
-#define NN_REG_TYPE    GENMASK(31, 24)
-#define NN_REG_VAL     GENMASK(7, 0)
-
-enum nfp_bpf_reg_type {
-       NN_REG_GPR_A =  BIT(0),
-       NN_REG_GPR_B =  BIT(1),
-       NN_REG_NNR =    BIT(2),
-       NN_REG_XFER =   BIT(3),
-       NN_REG_IMM =    BIT(4),
-       NN_REG_NONE =   BIT(5),
-};
-
-#define NN_REG_GPR_BOTH        (NN_REG_GPR_A | NN_REG_GPR_B)
-
-#define reg_both(x)    ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_BOTH))
-#define reg_a(x)       ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_A))
-#define reg_b(x)       ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_B))
-#define reg_nnr(x)     ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_NNR))
-#define reg_xfer(x)    ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_XFER))
-#define reg_imm(x)     ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_IMM))
-#define reg_none()     (FIELD_PREP(NN_REG_TYPE, NN_REG_NONE))
+#define pv_len(np)     reg_lm(1, PKT_VEC_PKT_LEN)
+#define pv_ctm_ptr(np) reg_lm(1, PKT_VEC_PKT_PTR)
 
-#define pkt_reg(np)    reg_a((np)->regs_per_thread - STATIC_REG_PKT)
-#define imm_a(np)      reg_a((np)->regs_per_thread - STATIC_REG_IMM)
-#define imm_b(np)      reg_b((np)->regs_per_thread - STATIC_REG_IMM)
-#define imm_both(np)   reg_both((np)->regs_per_thread - STATIC_REG_IMM)
+#define plen_reg(np)   reg_b(STATIC_REG_PKT_LEN)
+#define pptr_reg(np)   pv_ctm_ptr(np)
+#define imm_a(np)      reg_a(STATIC_REG_IMM)
+#define imm_b(np)      reg_b(STATIC_REG_IMM)
+#define imm_both(np)   reg_both(STATIC_REG_IMM)
 
-#define NFP_BPF_ABI_FLAGS      reg_nnr(0)
+#define NFP_BPF_ABI_FLAGS      reg_imm(0)
 #define   NFP_BPF_ABI_FLAG_MARK        1
-#define NFP_BPF_ABI_MARK       reg_nnr(1)
-#define NFP_BPF_ABI_PKT                reg_nnr(2)
-#define NFP_BPF_ABI_LEN                reg_nnr(3)
 
 struct nfp_prog;
 struct nfp_insn_meta;
index 38f3835..1194c47 100644 (file)
@@ -36,6 +36,7 @@
 #include <net/switchdev.h>
 #include <net/tc_act/tc_gact.h>
 #include <net/tc_act/tc_mirred.h>
+#include <net/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_vlan.h>
 #include <net/tc_act/tc_tunnel_key.h>
 
@@ -223,6 +224,247 @@ nfp_fl_set_vxlan(struct nfp_fl_set_vxlan *set_vxlan,
        return 0;
 }
 
+static void nfp_fl_set_helper32(u32 value, u32 mask, u8 *p_exact, u8 *p_mask)
+{
+       u32 oldvalue = get_unaligned((u32 *)p_exact);
+       u32 oldmask = get_unaligned((u32 *)p_mask);
+
+       value &= mask;
+       value |= oldvalue & ~mask;
+
+       put_unaligned(oldmask | mask, (u32 *)p_mask);
+       put_unaligned(value, (u32 *)p_exact);
+}
+
+static int
+nfp_fl_set_eth(const struct tc_action *action, int idx, u32 off,
+              struct nfp_fl_set_eth *set_eth)
+{
+       u16 tmp_set_eth_op;
+       u32 exact, mask;
+
+       if (off + 4 > ETH_ALEN * 2)
+               return -EOPNOTSUPP;
+
+       mask = ~tcf_pedit_mask(action, idx);
+       exact = tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       nfp_fl_set_helper32(exact, mask, &set_eth->eth_addr_val[off],
+                           &set_eth->eth_addr_mask[off]);
+
+       set_eth->reserved = cpu_to_be16(0);
+       tmp_set_eth_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+                                   sizeof(*set_eth) >> NFP_FL_LW_SIZ) |
+                        FIELD_PREP(NFP_FL_ACT_JMP_ID,
+                                   NFP_FL_ACTION_OPCODE_SET_ETHERNET);
+       set_eth->a_op = cpu_to_be16(tmp_set_eth_op);
+
+       return 0;
+}
+
+static int
+nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off,
+              struct nfp_fl_set_ip4_addrs *set_ip_addr)
+{
+       u16 tmp_set_ipv4_op;
+       __be32 exact, mask;
+
+       /* We are expecting tcf_pedit to return a big endian value */
+       mask = (__force __be32)~tcf_pedit_mask(action, idx);
+       exact = (__force __be32)tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       switch (off) {
+       case offsetof(struct iphdr, daddr):
+               set_ip_addr->ipv4_dst_mask = mask;
+               set_ip_addr->ipv4_dst = exact;
+               break;
+       case offsetof(struct iphdr, saddr):
+               set_ip_addr->ipv4_src_mask = mask;
+               set_ip_addr->ipv4_src = exact;
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
+
+       set_ip_addr->reserved = cpu_to_be16(0);
+       tmp_set_ipv4_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+                                    sizeof(*set_ip_addr) >> NFP_FL_LW_SIZ) |
+                         FIELD_PREP(NFP_FL_ACT_JMP_ID,
+                                    NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS);
+       set_ip_addr->a_op = cpu_to_be16(tmp_set_ipv4_op);
+
+       return 0;
+}
+
+static void
+nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask,
+                     struct nfp_fl_set_ipv6_addr *ip6)
+{
+       u16 tmp_set_op;
+
+       ip6->ipv6[idx % 4].mask = mask;
+       ip6->ipv6[idx % 4].exact = exact;
+
+       ip6->reserved = cpu_to_be16(0);
+       tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, sizeof(*ip6) >>
+                               NFP_FL_LW_SIZ) |
+                    FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode_tag);
+       ip6->a_op = cpu_to_be16(tmp_set_op);
+}
+
+static int
+nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off,
+              struct nfp_fl_set_ipv6_addr *ip_dst,
+              struct nfp_fl_set_ipv6_addr *ip_src)
+{
+       __be32 exact, mask;
+
+       /* We are expecting tcf_pedit to return a big endian value */
+       mask = (__force __be32)~tcf_pedit_mask(action, idx);
+       exact = (__force __be32)tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       if (off < offsetof(struct ipv6hdr, saddr))
+               return -EOPNOTSUPP;
+       else if (off < offsetof(struct ipv6hdr, daddr))
+               nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, idx,
+                                     exact, mask, ip_src);
+       else if (off < offsetof(struct ipv6hdr, daddr) +
+                      sizeof(struct in6_addr))
+               nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, idx,
+                                     exact, mask, ip_dst);
+       else
+               return -EOPNOTSUPP;
+
+       return 0;
+}
+
+static int
+nfp_fl_set_tport(const struct tc_action *action, int idx, u32 off,
+                struct nfp_fl_set_tport *set_tport, int opcode)
+{
+       u32 exact, mask;
+       u16 tmp_set_op;
+
+       if (off)
+               return -EOPNOTSUPP;
+
+       mask = ~tcf_pedit_mask(action, idx);
+       exact = tcf_pedit_val(action, idx);
+
+       if (exact & ~mask)
+               return -EOPNOTSUPP;
+
+       nfp_fl_set_helper32(exact, mask, set_tport->tp_port_val,
+                           set_tport->tp_port_mask);
+
+       set_tport->reserved = cpu_to_be16(0);
+       tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW,
+                               sizeof(*set_tport) >> NFP_FL_LW_SIZ);
+       tmp_set_op |= FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode);
+       set_tport->a_op = cpu_to_be16(tmp_set_op);
+
+       return 0;
+}
+
+static int
+nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
+{
+       struct nfp_fl_set_ipv6_addr set_ip6_dst, set_ip6_src;
+       struct nfp_fl_set_ip4_addrs set_ip_addr;
+       struct nfp_fl_set_tport set_tport;
+       struct nfp_fl_set_eth set_eth;
+       enum pedit_header_type htype;
+       int idx, nkeys, err;
+       size_t act_size;
+       u32 offset, cmd;
+
+       memset(&set_ip6_dst, 0, sizeof(set_ip6_dst));
+       memset(&set_ip6_src, 0, sizeof(set_ip6_src));
+       memset(&set_ip_addr, 0, sizeof(set_ip_addr));
+       memset(&set_tport, 0, sizeof(set_tport));
+       memset(&set_eth, 0, sizeof(set_eth));
+       nkeys = tcf_pedit_nkeys(action);
+
+       for (idx = 0; idx < nkeys; idx++) {
+               cmd = tcf_pedit_cmd(action, idx);
+               htype = tcf_pedit_htype(action, idx);
+               offset = tcf_pedit_offset(action, idx);
+
+               if (cmd != TCA_PEDIT_KEY_EX_CMD_SET)
+                       return -EOPNOTSUPP;
+
+               switch (htype) {
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH:
+                       err = nfp_fl_set_eth(action, idx, offset, &set_eth);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4:
+                       err = nfp_fl_set_ip4(action, idx, offset, &set_ip_addr);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6:
+                       err = nfp_fl_set_ip6(action, idx, offset, &set_ip6_dst,
+                                            &set_ip6_src);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP:
+                       err = nfp_fl_set_tport(action, idx, offset, &set_tport,
+                                              NFP_FL_ACTION_OPCODE_SET_TCP);
+                       break;
+               case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP:
+                       err = nfp_fl_set_tport(action, idx, offset, &set_tport,
+                                              NFP_FL_ACTION_OPCODE_SET_UDP);
+                       break;
+               default:
+                       return -EOPNOTSUPP;
+               }
+               if (err)
+                       return err;
+       }
+
+       if (set_eth.a_op) {
+               act_size = sizeof(set_eth);
+               memcpy(nfp_action, &set_eth, act_size);
+               *a_len += act_size;
+       } else if (set_ip_addr.a_op) {
+               act_size = sizeof(set_ip_addr);
+               memcpy(nfp_action, &set_ip_addr, act_size);
+               *a_len += act_size;
+       } else if (set_ip6_dst.a_op && set_ip6_src.a_op) {
+               /* TC compiles set src and dst IPv6 address as a single action,
+                * the hardware requires this to be 2 separate actions.
+                */
+               act_size = sizeof(set_ip6_src);
+               memcpy(nfp_action, &set_ip6_src, act_size);
+               *a_len += act_size;
+
+               act_size = sizeof(set_ip6_dst);
+               memcpy(&nfp_action[sizeof(set_ip6_src)], &set_ip6_dst,
+                      act_size);
+               *a_len += act_size;
+       } else if (set_ip6_dst.a_op) {
+               act_size = sizeof(set_ip6_dst);
+               memcpy(nfp_action, &set_ip6_dst, act_size);
+               *a_len += act_size;
+       } else if (set_ip6_src.a_op) {
+               act_size = sizeof(set_ip6_src);
+               memcpy(nfp_action, &set_ip6_src, act_size);
+               *a_len += act_size;
+       } else if (set_tport.a_op) {
+               act_size = sizeof(set_tport);
+               memcpy(nfp_action, &set_tport, act_size);
+               *a_len += act_size;
+       }
+
+       return 0;
+}
+
 static int
 nfp_flower_loop_action(const struct tc_action *a,
                       struct nfp_fl_payload *nfp_fl, int *a_len,
@@ -301,6 +543,9 @@ nfp_flower_loop_action(const struct tc_action *a,
        } else if (is_tcf_tunnel_release(a)) {
                /* Tunnel decap is handled by default so accept action. */
                return 0;
+       } else if (is_tcf_pedit(a)) {
+               if (nfp_fl_pedit(a, &nfp_fl->action_data[*a_len], a_len))
+                       return -EOPNOTSUPP;
        } else {
                /* Currently we do not handle any other actions. */
                return -EOPNOTSUPP;
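
For illustration, nfp_fl_set_helper32() above merges successive 32-bit pedit keys into accumulated exact/mask byte arrays. A minimal userspace sketch of that merge (hypothetical key values; memcpy stands in for the kernel's get_unaligned()/put_unaligned()):

/* Standalone sketch: new bits overwrite only where the key's mask is set,
 * previously set bits are preserved, and the stored mask grows.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void set_helper32(uint32_t value, uint32_t mask,
                         uint8_t *p_exact, uint8_t *p_mask)
{
        uint32_t oldvalue, oldmask;

        memcpy(&oldvalue, p_exact, sizeof(oldvalue));
        memcpy(&oldmask, p_mask, sizeof(oldmask));

        value &= mask;                  /* keep only bits this key sets */
        value |= oldvalue & ~mask;      /* preserve earlier keys' bits */

        oldmask |= mask;
        memcpy(p_mask, &oldmask, sizeof(oldmask));
        memcpy(p_exact, &value, sizeof(value));
}

int main(void)
{
        uint8_t exact[4] = { 0 }, mask[4] = { 0 };
        uint32_t out;

        set_helper32(0x11220000, 0xffff0000, exact, mask); /* first key */
        set_helper32(0x00003344, 0x0000ffff, exact, mask); /* second key */

        memcpy(&out, exact, sizeof(out));
        printf("merged exact value: 0x%08x\n", out);       /* 0x11223344 */
        return 0;
}
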
index 504ddaa..f7b7242 100644 (file)
 #define NFP_FLOWER_MASK_VLAN_CFI       BIT(12)
 #define NFP_FLOWER_MASK_VLAN_VID       GENMASK(11, 0)
 
+#define NFP_FLOWER_MASK_MPLS_LB                GENMASK(31, 12)
+#define NFP_FLOWER_MASK_MPLS_TC                GENMASK(11, 9)
+#define NFP_FLOWER_MASK_MPLS_BOS       BIT(8)
+#define NFP_FLOWER_MASK_MPLS_Q         BIT(0)
+
 #define NFP_FL_SC_ACT_DROP             0x80000000
 #define NFP_FL_SC_ACT_USER             0x7D000000
 #define NFP_FL_SC_ACT_POPV             0x6A000000
 #define NFP_FL_ACTION_OPCODE_PUSH_VLAN         1
 #define NFP_FL_ACTION_OPCODE_POP_VLAN          2
 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL   6
+#define NFP_FL_ACTION_OPCODE_SET_ETHERNET      7
+#define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS    9
+#define NFP_FL_ACTION_OPCODE_SET_IPV6_SRC      11
+#define NFP_FL_ACTION_OPCODE_SET_IPV6_DST      12
+#define NFP_FL_ACTION_OPCODE_SET_UDP           14
+#define NFP_FL_ACTION_OPCODE_SET_TCP           15
 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL                17
 #define NFP_FL_ACTION_OPCODE_NUM               32
 
@@ -102,6 +113,38 @@ enum nfp_flower_tun_type {
        NFP_FL_TUNNEL_VXLAN =   2,
 };
 
+struct nfp_fl_set_eth {
+       __be16 a_op;
+       __be16 reserved;
+       u8 eth_addr_mask[ETH_ALEN * 2];
+       u8 eth_addr_val[ETH_ALEN * 2];
+};
+
+struct nfp_fl_set_ip4_addrs {
+       __be16 a_op;
+       __be16 reserved;
+       __be32 ipv4_src_mask;
+       __be32 ipv4_src;
+       __be32 ipv4_dst_mask;
+       __be32 ipv4_dst;
+};
+
+struct nfp_fl_set_ipv6_addr {
+       __be16 a_op;
+       __be16 reserved;
+       struct {
+               __be32 mask;
+               __be32 exact;
+       } ipv6[4];
+};
+
+struct nfp_fl_set_tport {
+       __be16 a_op;
+       __be16 reserved;
+       u8 tp_port_mask[4];
+       u8 tp_port_val[4];
+};
+
 struct nfp_fl_output {
        __be16 a_op;
        __be16 flags;
index 865a815..60614d4 100644 (file)
@@ -111,8 +111,21 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *frame,
                ether_addr_copy(frame->mac_src, &addr->src[0]);
        }
 
-       if (mask_version)
-               frame->mpls_lse = cpu_to_be32(~0);
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_MPLS)) {
+               struct flow_dissector_key_mpls *mpls;
+               u32 t_mpls;
+
+               mpls = skb_flow_dissector_target(flow->dissector,
+                                                FLOW_DISSECTOR_KEY_MPLS,
+                                                target);
+
+               t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, mpls->mpls_label) |
+                        FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, mpls->mpls_tc) |
+                        FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, mpls->mpls_bos) |
+                        NFP_FLOWER_MASK_MPLS_Q;
+
+               frame->mpls_lse = cpu_to_be32(t_mpls);
+       }
 }
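
The MPLS match above packs the dissected label, TC and BoS fields into one 32-bit LSE word using the NFP_FLOWER_MASK_MPLS_* masks plus the Q bit. A standalone arithmetic check (hypothetical field values, plain C):

/* Label in bits 31:12, TC in 11:9, BoS in bit 8, plus the MPLS_Q bit. */
#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t label = 0x12345, tc = 5, bos = 1;
        uint32_t lse;

        lse = (label << 12) | (tc << 9) | (bos << 8) | 0x1 /* MPLS_Q */;

        assert(lse == 0x12345b01);
        return 0;
}
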
 
 static void
@@ -143,7 +156,6 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
        struct flow_dissector_key_ipv4_addrs *addr;
        struct flow_dissector_key_basic *basic;
 
-       /* Wildcard TOS/TTL for now. */
        memset(frame, 0, sizeof(struct nfp_flower_ipv4));
 
        if (dissector_uses_key(flow->dissector,
@@ -161,6 +173,16 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame,
                                                  target);
                frame->proto = basic->ip_proto;
        }
+
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) {
+               struct flow_dissector_key_ip *flow_ip;
+
+               flow_ip = skb_flow_dissector_target(flow->dissector,
+                                                   FLOW_DISSECTOR_KEY_IP,
+                                                   target);
+               frame->tos = flow_ip->tos;
+               frame->ttl = flow_ip->ttl;
+       }
 }
 
 static void
@@ -172,7 +194,6 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
        struct flow_dissector_key_ipv6_addrs *addr;
        struct flow_dissector_key_basic *basic;
 
-       /* Wildcard LABEL/TOS/TTL for now. */
        memset(frame, 0, sizeof(struct nfp_flower_ipv6));
 
        if (dissector_uses_key(flow->dissector,
@@ -190,6 +211,16 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame,
                                                  target);
                frame->proto = basic->ip_proto;
        }
+
+       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) {
+               struct flow_dissector_key_ip *flow_ip;
+
+               flow_ip = skb_flow_dissector_target(flow->dissector,
+                                                   FLOW_DISSECTOR_KEY_IP,
+                                                   target);
+               frame->tos = flow_ip->tos;
+               frame->ttl = flow_ip->ttl;
+       }
 }
 
 static void
index 3d9537e..6f239c2 100644 (file)
@@ -57,6 +57,7 @@
         BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \
         BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \
         BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \
+        BIT(FLOW_DISSECTOR_KEY_MPLS) | \
         BIT(FLOW_DISSECTOR_KEY_IP))
 
 #define NFP_FLOWER_WHITELIST_TUN_DISSECTOR \
@@ -134,7 +135,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
 {
        struct flow_dissector_key_basic *mask_basic = NULL;
        struct flow_dissector_key_basic *key_basic = NULL;
-       struct flow_dissector_key_ip *mask_ip = NULL;
        u32 key_layer_two;
        u8 key_layer;
        int key_size;
@@ -206,28 +206,15 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
                                                      flow->key);
        }
 
-       if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP))
-               mask_ip = skb_flow_dissector_target(flow->dissector,
-                                                   FLOW_DISSECTOR_KEY_IP,
-                                                   flow->mask);
-
        if (mask_basic && mask_basic->n_proto) {
                /* Ethernet type is present in the key. */
                switch (key_basic->n_proto) {
                case cpu_to_be16(ETH_P_IP):
-                       if (mask_ip && mask_ip->tos)
-                               return -EOPNOTSUPP;
-                       if (mask_ip && mask_ip->ttl)
-                               return -EOPNOTSUPP;
                        key_layer |= NFP_FLOWER_LAYER_IPV4;
                        key_size += sizeof(struct nfp_flower_ipv4);
                        break;
 
                case cpu_to_be16(ETH_P_IPV6):
-                       if (mask_ip && mask_ip->tos)
-                               return -EOPNOTSUPP;
-                       if (mask_ip && mask_ip->ttl)
-                               return -EOPNOTSUPP;
                        key_layer |= NFP_FLOWER_LAYER_IPV6;
                        key_size += sizeof(struct nfp_flower_ipv6);
                        break;
@@ -238,11 +225,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls,
                case cpu_to_be16(ETH_P_ARP):
                        return -EOPNOTSUPP;
 
-               /* Currently we do not offload MPLS. */
-               case cpu_to_be16(ETH_P_MPLS_UC):
-               case cpu_to_be16(ETH_P_MPLS_MC):
-                       return -EOPNOTSUPP;
-
                /* Will be included in layer 2. */
                case cpu_to_be16(ETH_P_8021Q):
                        break;
index af640b5..857bb33 100644 (file)
@@ -36,6 +36,8 @@
 
 #include <net/devlink.h>
 
+#include <trace/events/devlink.h>
+
 #include "nfp_net_repr.h"
 
 struct bpf_prog;
@@ -271,11 +273,17 @@ static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn,
 
 static inline bool nfp_app_ctrl_tx(struct nfp_app *app, struct sk_buff *skb)
 {
+       trace_devlink_hwmsg(priv_to_devlink(app->pf), false, 0,
+                           skb->data, skb->len);
+
        return nfp_ctrl_tx(app->ctrl, skb);
 }
 
 static inline void nfp_app_ctrl_rx(struct nfp_app *app, struct sk_buff *skb)
 {
+       trace_devlink_hwmsg(priv_to_devlink(app->pf), true, 0,
+                           skb->data, skb->len);
+
        app->type->ctrl_msg_rx(app, skb);
 }
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
new file mode 100644 (file)
index 0000000..de76e74
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright (C) 2016-2017 Netronome Systems, Inc.
+ *
+ * This software is dual licensed under the GNU General Public License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree or the BSD 2-Clause License provided below.  You have the
+ * option to license this software under the complete terms of either license.
+ *
+ * The BSD 2-Clause License:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      1. Redistributions of source code must retain the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer.
+ *
+ *      2. Redistributions in binary form must reproduce the above
+ *         copyright notice, this list of conditions and the following
+ *         disclaimer in the documentation and/or other materials
+ *         provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "nfp_asm.h"
+
+const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
+       [CMD_TGT_WRITE8] =              { 0x00, 0x42 },
+       [CMD_TGT_READ8] =               { 0x01, 0x43 },
+       [CMD_TGT_READ_LE] =             { 0x01, 0x40 },
+       [CMD_TGT_READ_SWAP_LE] =        { 0x03, 0x40 },
+};
+
+static u16 nfp_swreg_to_unreg(swreg reg, bool is_dst)
+{
+       bool lm_id, lm_dec = false;
+       u16 val = swreg_value(reg);
+
+       switch (swreg_type(reg)) {
+       case NN_REG_GPR_A:
+       case NN_REG_GPR_B:
+       case NN_REG_GPR_BOTH:
+               return val;
+       case NN_REG_NNR:
+               return UR_REG_NN | val;
+       case NN_REG_XFER:
+               return UR_REG_XFR | val;
+       case NN_REG_LMEM:
+               lm_id = swreg_lm_idx(reg);
+
+               switch (swreg_lm_mode(reg)) {
+               case NN_LM_MOD_NONE:
+                       if (val & ~UR_REG_LM_IDX_MAX) {
+                               pr_err("LM offset too large\n");
+                               return 0;
+                       }
+                       return UR_REG_LM | FIELD_PREP(UR_REG_LM_IDX, lm_id) |
+                               val;
+               case NN_LM_MOD_DEC:
+                       lm_dec = true;
+                       /* fall through */
+               case NN_LM_MOD_INC:
+                       if (val) {
+                               pr_err("LM offset in inc/dec mode\n");
+                               return 0;
+                       }
+                       return UR_REG_LM | UR_REG_LM_POST_MOD |
+                               FIELD_PREP(UR_REG_LM_IDX, lm_id) |
+                               FIELD_PREP(UR_REG_LM_POST_MOD_DEC, lm_dec);
+               default:
+                       pr_err("bad LM mode for unrestricted operands %d\n",
+                              swreg_lm_mode(reg));
+                       return 0;
+               }
+       case NN_REG_IMM:
+               if (val & ~0xff) {
+                       pr_err("immediate too large\n");
+                       return 0;
+               }
+               return UR_REG_IMM_encode(val);
+       case NN_REG_NONE:
+               return is_dst ? UR_REG_NO_DST : REG_NONE;
+       }
+
+       pr_err("unrecognized reg encoding %08x\n", reg);
+       return 0;
+}
+
+int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
+                         struct nfp_insn_ur_regs *reg)
+{
+       memset(reg, 0, sizeof(*reg));
+
+       /* Decode destination */
+       if (swreg_type(dst) == NN_REG_IMM)
+               return -EFAULT;
+
+       if (swreg_type(dst) == NN_REG_GPR_B)
+               reg->dst_ab = ALU_DST_B;
+       if (swreg_type(dst) == NN_REG_GPR_BOTH)
+               reg->wr_both = true;
+       reg->dst = nfp_swreg_to_unreg(dst, true);
+
+       /* Decode source operands */
+       if (swreg_type(lreg) == swreg_type(rreg))
+               return -EFAULT;
+
+       if (swreg_type(lreg) == NN_REG_GPR_B ||
+           swreg_type(rreg) == NN_REG_GPR_A) {
+               reg->areg = nfp_swreg_to_unreg(rreg, false);
+               reg->breg = nfp_swreg_to_unreg(lreg, false);
+               reg->swap = true;
+       } else {
+               reg->areg = nfp_swreg_to_unreg(lreg, false);
+               reg->breg = nfp_swreg_to_unreg(rreg, false);
+       }
+
+       reg->dst_lmextn = swreg_lmextn(dst);
+       reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg);
+
+       return 0;
+}
+
+static u16 nfp_swreg_to_rereg(swreg reg, bool is_dst, bool has_imm8, bool *i8)
+{
+       u16 val = swreg_value(reg);
+       bool lm_id;
+
+       switch (swreg_type(reg)) {
+       case NN_REG_GPR_A:
+       case NN_REG_GPR_B:
+       case NN_REG_GPR_BOTH:
+               return val;
+       case NN_REG_XFER:
+               return RE_REG_XFR | val;
+       case NN_REG_LMEM:
+               lm_id = swreg_lm_idx(reg);
+
+               if (swreg_lm_mode(reg) != NN_LM_MOD_NONE) {
+                       pr_err("bad LM mode for restricted operands %d\n",
+                              swreg_lm_mode(reg));
+                       return 0;
+               }
+
+               if (val & ~RE_REG_LM_IDX_MAX) {
+                       pr_err("LM offset too large\n");
+                       return 0;
+               }
+
+               return RE_REG_LM | FIELD_PREP(RE_REG_LM_IDX, lm_id) | val;
+       case NN_REG_IMM:
+               if (val & ~(0x7f | has_imm8 << 7)) {
+                       pr_err("immediate too large\n");
+                       return 0;
+               }
+               *i8 = val & 0x80;
+               return RE_REG_IMM_encode(val & 0x7f);
+       case NN_REG_NONE:
+               return is_dst ? RE_REG_NO_DST : REG_NONE;
+       case NN_REG_NNR:
+               pr_err("NNRs used with restricted encoding\n");
+               return 0;
+       }
+
+       pr_err("unrecognized reg encoding\n");
+       return 0;
+}
+
+int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
+                       struct nfp_insn_re_regs *reg, bool has_imm8)
+{
+       memset(reg, 0, sizeof(*reg));
+
+       /* Decode destination */
+       if (swreg_type(dst) == NN_REG_IMM)
+               return -EFAULT;
+
+       if (swreg_type(dst) == NN_REG_GPR_B)
+               reg->dst_ab = ALU_DST_B;
+       if (swreg_type(dst) == NN_REG_GPR_BOTH)
+               reg->wr_both = true;
+       reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL);
+
+       /* Decode source operands */
+       if (swreg_type(lreg) == swreg_type(rreg))
+               return -EFAULT;
+
+       if (swreg_type(lreg) == NN_REG_GPR_B ||
+           swreg_type(rreg) == NN_REG_GPR_A) {
+               reg->areg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
+               reg->breg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
+               reg->swap = true;
+       } else {
+               reg->areg = nfp_swreg_to_rereg(lreg, false, has_imm8, &reg->i8);
+               reg->breg = nfp_swreg_to_rereg(rreg, false, has_imm8, &reg->i8);
+       }
+
+       reg->dst_lmextn = swreg_lmextn(dst);
+       reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg);
+
+       return 0;
+}
+
+#define NFP_USTORE_ECC_POLY_WORDS              7
+#define NFP_USTORE_OP_BITS                     45
+
+static const u64 nfp_ustore_ecc_polynomials[NFP_USTORE_ECC_POLY_WORDS] = {
+       0x0ff800007fffULL,
+       0x11f801ff801fULL,
+       0x1e387e0781e1ULL,
+       0x17cb8e388e22ULL,
+       0x1af5b2c93244ULL,
+       0x1f56d5525488ULL,
+       0x0daf69a46910ULL,
+};
+
+static bool parity(u64 value)
+{
+       return hweight64(value) & 1;
+}
+
+int nfp_ustore_check_valid_no_ecc(u64 insn)
+{
+       if (insn & ~GENMASK_ULL(NFP_USTORE_OP_BITS, 0))
+               return -EINVAL;
+
+       return 0;
+}
+
+u64 nfp_ustore_calc_ecc_insn(u64 insn)
+{
+       u8 ecc = 0;
+       int i;
+
+       for (i = 0; i < NFP_USTORE_ECC_POLY_WORDS; i++)
+               ecc |= parity(nfp_ustore_ecc_polynomials[i] & insn) << i;
+
+       return insn | (u64)ecc << NFP_USTORE_OP_BITS;
+}
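
A userspace sketch of the ustore ECC scheme implemented above, with the polynomial table copied from the code and __builtin_popcountll() standing in for hweight64(); one parity bit per polynomial is ORed into bits 45..51 above the 45-bit opcode:

#include <stdint.h>
#include <stdio.h>

#define OP_BITS 45

static const uint64_t poly[7] = {
        0x0ff800007fffULL, 0x11f801ff801fULL, 0x1e387e0781e1ULL,
        0x17cb8e388e22ULL, 0x1af5b2c93244ULL, 0x1f56d5525488ULL,
        0x0daf69a46910ULL,
};

static uint64_t add_ecc(uint64_t insn)
{
        uint64_t ecc = 0;
        int i;

        for (i = 0; i < 7; i++)
                ecc |= (uint64_t)(__builtin_popcountll(poly[i] & insn) & 1) << i;

        return insn | ecc << OP_BITS;
}

int main(void)
{
        /* hypothetical opcode, just to show where the ECC bits land */
        printf("%016llx\n", (unsigned long long)add_ecc(0x0f000000000ULL));
        return 0;
}
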
index d2b5357..c4c18dd 100644 (file)
@@ -34,6 +34,7 @@
 #ifndef __NFP_ASM_H__
 #define __NFP_ASM_H__ 1
 
+#include <linux/bitfield.h>
 #include <linux/types.h>
 
 #define REG_NONE       0
 #define RE_REG_IMM_encode(x)                                   \
        (RE_REG_IMM | ((x) & 0x1f) | (((x) & 0x60) << 1))
 #define RE_REG_IMM_MAX  0x07fULL
+#define RE_REG_LM      0x050
+#define RE_REG_LM_IDX  0x008
+#define RE_REG_LM_IDX_MAX      0x7
 #define RE_REG_XFR     0x080
 
 #define UR_REG_XFR     0x180
+#define UR_REG_LM      0x200
+#define UR_REG_LM_IDX  0x020
+#define UR_REG_LM_POST_MOD     0x010
+#define UR_REG_LM_POST_MOD_DEC 0x001
+#define UR_REG_LM_IDX_MAX      0xf
 #define UR_REG_NN      0x280
 #define UR_REG_NO_DST  0x300
 #define UR_REG_IMM     UR_REG_NO_DST
 #define UR_REG_IMM_encode(x) (UR_REG_IMM | (x))
 #define UR_REG_IMM_MAX  0x0ffULL
 
-#define OP_BR_BASE     0x0d800000020ULL
-#define OP_BR_BASE_MASK        0x0f8000c3ce0ULL
-#define OP_BR_MASK     0x0000000001fULL
-#define OP_BR_EV_PIP   0x00000000300ULL
-#define OP_BR_CSS      0x0000003c000ULL
-#define OP_BR_DEFBR    0x00000300000ULL
-#define OP_BR_ADDR_LO  0x007ffc00000ULL
-#define OP_BR_ADDR_HI  0x10000000000ULL
+#define OP_BR_BASE             0x0d800000020ULL
+#define OP_BR_BASE_MASK                0x0f8000c3ce0ULL
+#define OP_BR_MASK             0x0000000001fULL
+#define OP_BR_EV_PIP           0x00000000300ULL
+#define OP_BR_CSS              0x0000003c000ULL
+#define OP_BR_DEFBR            0x00000300000ULL
+#define OP_BR_ADDR_LO          0x007ffc00000ULL
+#define OP_BR_ADDR_HI          0x10000000000ULL
 
 #define nfp_is_br(_insn)                               \
        (((_insn) & OP_BR_BASE_MASK) == OP_BR_BASE)
@@ -82,30 +91,33 @@ enum br_ctx_signal_state {
        BR_CSS_NONE = 2,
 };
 
-#define OP_BBYTE_BASE  0x0c800000000ULL
-#define OP_BB_A_SRC    0x000000000ffULL
-#define OP_BB_BYTE     0x00000000300ULL
-#define OP_BB_B_SRC    0x0000003fc00ULL
-#define OP_BB_I8       0x00000040000ULL
-#define OP_BB_EQ       0x00000080000ULL
-#define OP_BB_DEFBR    0x00000300000ULL
-#define OP_BB_ADDR_LO  0x007ffc00000ULL
-#define OP_BB_ADDR_HI  0x10000000000ULL
-
-#define OP_BALU_BASE   0x0e800000000ULL
-#define OP_BA_A_SRC    0x000000003ffULL
-#define OP_BA_B_SRC    0x000000ffc00ULL
-#define OP_BA_DEFBR    0x00000300000ULL
-#define OP_BA_ADDR_HI  0x0007fc00000ULL
-
-#define OP_IMMED_A_SRC 0x000000003ffULL
-#define OP_IMMED_B_SRC 0x000000ffc00ULL
-#define OP_IMMED_IMM   0x0000ff00000ULL
-#define OP_IMMED_WIDTH 0x00060000000ULL
-#define OP_IMMED_INV   0x00080000000ULL
-#define OP_IMMED_SHIFT 0x00600000000ULL
-#define OP_IMMED_BASE  0x0f000000000ULL
-#define OP_IMMED_WR_AB 0x20000000000ULL
+#define OP_BBYTE_BASE          0x0c800000000ULL
+#define OP_BB_A_SRC            0x000000000ffULL
+#define OP_BB_BYTE             0x00000000300ULL
+#define OP_BB_B_SRC            0x0000003fc00ULL
+#define OP_BB_I8               0x00000040000ULL
+#define OP_BB_EQ               0x00000080000ULL
+#define OP_BB_DEFBR            0x00000300000ULL
+#define OP_BB_ADDR_LO          0x007ffc00000ULL
+#define OP_BB_ADDR_HI          0x10000000000ULL
+#define OP_BB_SRC_LMEXTN       0x40000000000ULL
+
+#define OP_BALU_BASE           0x0e800000000ULL
+#define OP_BA_A_SRC            0x000000003ffULL
+#define OP_BA_B_SRC            0x000000ffc00ULL
+#define OP_BA_DEFBR            0x00000300000ULL
+#define OP_BA_ADDR_HI          0x0007fc00000ULL
+
+#define OP_IMMED_A_SRC         0x000000003ffULL
+#define OP_IMMED_B_SRC         0x000000ffc00ULL
+#define OP_IMMED_IMM           0x0000ff00000ULL
+#define OP_IMMED_WIDTH         0x00060000000ULL
+#define OP_IMMED_INV           0x00080000000ULL
+#define OP_IMMED_SHIFT         0x00600000000ULL
+#define OP_IMMED_BASE          0x0f000000000ULL
+#define OP_IMMED_WR_AB         0x20000000000ULL
+#define OP_IMMED_SRC_LMEXTN    0x40000000000ULL
+#define OP_IMMED_DST_LMEXTN    0x80000000000ULL
 
 enum immed_width {
        IMMED_WIDTH_ALL = 0,
@@ -119,17 +131,19 @@ enum immed_shift {
        IMMED_SHIFT_2B = 2,
 };
 
-#define OP_SHF_BASE    0x08000000000ULL
-#define OP_SHF_A_SRC   0x000000000ffULL
-#define OP_SHF_SC      0x00000000300ULL
-#define OP_SHF_B_SRC   0x0000003fc00ULL
-#define OP_SHF_I8      0x00000040000ULL
-#define OP_SHF_SW      0x00000080000ULL
-#define OP_SHF_DST     0x0000ff00000ULL
-#define OP_SHF_SHIFT   0x001f0000000ULL
-#define OP_SHF_OP      0x00e00000000ULL
-#define OP_SHF_DST_AB  0x01000000000ULL
-#define OP_SHF_WR_AB   0x20000000000ULL
+#define OP_SHF_BASE            0x08000000000ULL
+#define OP_SHF_A_SRC           0x000000000ffULL
+#define OP_SHF_SC              0x00000000300ULL
+#define OP_SHF_B_SRC           0x0000003fc00ULL
+#define OP_SHF_I8              0x00000040000ULL
+#define OP_SHF_SW              0x00000080000ULL
+#define OP_SHF_DST             0x0000ff00000ULL
+#define OP_SHF_SHIFT           0x001f0000000ULL
+#define OP_SHF_OP              0x00e00000000ULL
+#define OP_SHF_DST_AB          0x01000000000ULL
+#define OP_SHF_WR_AB           0x20000000000ULL
+#define OP_SHF_SRC_LMEXTN      0x40000000000ULL
+#define OP_SHF_DST_LMEXTN      0x80000000000ULL
 
 enum shf_op {
        SHF_OP_NONE = 0,
@@ -144,14 +158,16 @@ enum shf_sc {
        SHF_SC_R_DSHF = 3,
 };
 
-#define OP_ALU_A_SRC   0x000000003ffULL
-#define OP_ALU_B_SRC   0x000000ffc00ULL
-#define OP_ALU_DST     0x0003ff00000ULL
-#define OP_ALU_SW      0x00040000000ULL
-#define OP_ALU_OP      0x00f80000000ULL
-#define OP_ALU_DST_AB  0x01000000000ULL
-#define OP_ALU_BASE    0x0a000000000ULL
-#define OP_ALU_WR_AB   0x20000000000ULL
+#define OP_ALU_A_SRC           0x000000003ffULL
+#define OP_ALU_B_SRC           0x000000ffc00ULL
+#define OP_ALU_DST             0x0003ff00000ULL
+#define OP_ALU_SW              0x00040000000ULL
+#define OP_ALU_OP              0x00f80000000ULL
+#define OP_ALU_DST_AB          0x01000000000ULL
+#define OP_ALU_BASE            0x0a000000000ULL
+#define OP_ALU_WR_AB           0x20000000000ULL
+#define OP_ALU_SRC_LMEXTN      0x40000000000ULL
+#define OP_ALU_DST_LMEXTN      0x80000000000ULL
 
 enum alu_op {
        ALU_OP_NONE     = 0x00,
@@ -170,26 +186,28 @@ enum alu_dst_ab {
        ALU_DST_B = 1,
 };
 
-#define OP_LDF_BASE    0x0c000000000ULL
-#define OP_LDF_A_SRC   0x000000000ffULL
-#define OP_LDF_SC      0x00000000300ULL
-#define OP_LDF_B_SRC   0x0000003fc00ULL
-#define OP_LDF_I8      0x00000040000ULL
-#define OP_LDF_SW      0x00000080000ULL
-#define OP_LDF_ZF      0x00000100000ULL
-#define OP_LDF_BMASK   0x0000f000000ULL
-#define OP_LDF_SHF     0x001f0000000ULL
-#define OP_LDF_WR_AB   0x20000000000ULL
-
-#define OP_CMD_A_SRC    0x000000000ffULL
-#define OP_CMD_CTX      0x00000000300ULL
-#define OP_CMD_B_SRC    0x0000003fc00ULL
-#define OP_CMD_TOKEN    0x000000c0000ULL
-#define OP_CMD_XFER     0x00001f00000ULL
-#define OP_CMD_CNT      0x0000e000000ULL
-#define OP_CMD_SIG      0x000f0000000ULL
-#define OP_CMD_TGT_CMD  0x07f00000000ULL
-#define OP_CMD_MODE    0x1c0000000000ULL
+#define OP_LDF_BASE            0x0c000000000ULL
+#define OP_LDF_A_SRC           0x000000000ffULL
+#define OP_LDF_SC              0x00000000300ULL
+#define OP_LDF_B_SRC           0x0000003fc00ULL
+#define OP_LDF_I8              0x00000040000ULL
+#define OP_LDF_SW              0x00000080000ULL
+#define OP_LDF_ZF              0x00000100000ULL
+#define OP_LDF_BMASK           0x0000f000000ULL
+#define OP_LDF_SHF             0x001f0000000ULL
+#define OP_LDF_WR_AB           0x20000000000ULL
+#define OP_LDF_SRC_LMEXTN      0x40000000000ULL
+#define OP_LDF_DST_LMEXTN      0x80000000000ULL
+
+#define OP_CMD_A_SRC           0x000000000ffULL
+#define OP_CMD_CTX             0x00000000300ULL
+#define OP_CMD_B_SRC           0x0000003fc00ULL
+#define OP_CMD_TOKEN           0x000000c0000ULL
+#define OP_CMD_XFER            0x00001f00000ULL
+#define OP_CMD_CNT             0x0000e000000ULL
+#define OP_CMD_SIG             0x000f0000000ULL
+#define OP_CMD_TGT_CMD         0x07f00000000ULL
+#define OP_CMD_MODE           0x1c0000000000ULL
 
 struct cmd_tgt_act {
        u8 token;
@@ -204,6 +222,8 @@ enum cmd_tgt_map {
        __CMD_TGT_MAP_SIZE,
 };
 
+extern const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE];
+
 enum cmd_mode {
        CMD_MODE_40b_AB = 0,
        CMD_MODE_40b_BA = 1,
@@ -215,11 +235,13 @@ enum cmd_ctx_swap {
        CMD_CTX_NO_SWAP = 3,
 };
 
-#define OP_LCSR_BASE   0x0fc00000000ULL
-#define OP_LCSR_A_SRC  0x000000003ffULL
-#define OP_LCSR_B_SRC  0x000000ffc00ULL
-#define OP_LCSR_WRITE  0x00000200000ULL
-#define OP_LCSR_ADDR   0x001ffc00000ULL
+#define OP_LCSR_BASE           0x0fc00000000ULL
+#define OP_LCSR_A_SRC          0x000000003ffULL
+#define OP_LCSR_B_SRC          0x000000ffc00ULL
+#define OP_LCSR_WRITE          0x00000200000ULL
+#define OP_LCSR_ADDR           0x001ffc00000ULL
+#define OP_LCSR_SRC_LMEXTN     0x40000000000ULL
+#define OP_LCSR_DST_LMEXTN     0x80000000000ULL
 
 enum lcsr_wr_src {
        LCSR_WR_AREG,
@@ -227,7 +249,122 @@ enum lcsr_wr_src {
        LCSR_WR_IMM,
 };
 
-#define OP_CARB_BASE   0x0e000000000ULL
-#define OP_CARB_OR     0x00000010000ULL
+#define OP_CARB_BASE           0x0e000000000ULL
+#define OP_CARB_OR             0x00000010000ULL
+
+/* Software register representation, independent of operand type */
+#define NN_REG_TYPE    GENMASK(31, 24)
+#define NN_REG_LM_IDX  GENMASK(23, 22)
+#define NN_REG_LM_IDX_HI       BIT(23)
+#define NN_REG_LM_IDX_LO       BIT(22)
+#define NN_REG_LM_MOD  GENMASK(21, 20)
+#define NN_REG_VAL     GENMASK(7, 0)
+
+enum nfp_bpf_reg_type {
+       NN_REG_GPR_A =  BIT(0),
+       NN_REG_GPR_B =  BIT(1),
+       NN_REG_GPR_BOTH = NN_REG_GPR_A | NN_REG_GPR_B,
+       NN_REG_NNR =    BIT(2),
+       NN_REG_XFER =   BIT(3),
+       NN_REG_IMM =    BIT(4),
+       NN_REG_NONE =   BIT(5),
+       NN_REG_LMEM =   BIT(6),
+};
+
+enum nfp_bpf_lm_mode {
+       NN_LM_MOD_NONE = 0,
+       NN_LM_MOD_INC,
+       NN_LM_MOD_DEC,
+};
+
+#define reg_both(x)    __enc_swreg((x), NN_REG_GPR_BOTH)
+#define reg_a(x)       __enc_swreg((x), NN_REG_GPR_A)
+#define reg_b(x)       __enc_swreg((x), NN_REG_GPR_B)
+#define reg_nnr(x)     __enc_swreg((x), NN_REG_NNR)
+#define reg_xfer(x)    __enc_swreg((x), NN_REG_XFER)
+#define reg_imm(x)     __enc_swreg((x), NN_REG_IMM)
+#define reg_none()     __enc_swreg(0, NN_REG_NONE)
+#define reg_lm(x, off) __enc_swreg_lm((x), NN_LM_MOD_NONE, (off))
+#define reg_lm_inc(x)  __enc_swreg_lm((x), NN_LM_MOD_INC, 0)
+#define reg_lm_dec(x)  __enc_swreg_lm((x), NN_LM_MOD_DEC, 0)
+#define __reg_lm(x, mod, off)  __enc_swreg_lm((x), (mod), (off))
+
+typedef __u32 __bitwise swreg;
+
+static inline swreg __enc_swreg(u16 id, u8 type)
+{
+       return (__force swreg)(id | FIELD_PREP(NN_REG_TYPE, type));
+}
+
+static inline swreg __enc_swreg_lm(u8 id, enum nfp_bpf_lm_mode mode, u8 off)
+{
+       WARN_ON(id > 3 || (off && mode != NN_LM_MOD_NONE));
+
+       return (__force swreg)(FIELD_PREP(NN_REG_TYPE, NN_REG_LMEM) |
+                              FIELD_PREP(NN_REG_LM_IDX, id) |
+                              FIELD_PREP(NN_REG_LM_MOD, mode) |
+                              off);
+}
+
+static inline u32 swreg_raw(swreg reg)
+{
+       return (__force u32)reg;
+}
+
+static inline enum nfp_bpf_reg_type swreg_type(swreg reg)
+{
+       return FIELD_GET(NN_REG_TYPE, swreg_raw(reg));
+}
+
+static inline u16 swreg_value(swreg reg)
+{
+       return FIELD_GET(NN_REG_VAL, swreg_raw(reg));
+}
+
+static inline bool swreg_lm_idx(swreg reg)
+{
+       return FIELD_GET(NN_REG_LM_IDX_LO, swreg_raw(reg));
+}
+
+static inline bool swreg_lmextn(swreg reg)
+{
+       return FIELD_GET(NN_REG_LM_IDX_HI, swreg_raw(reg));
+}
+
+static inline enum nfp_bpf_lm_mode swreg_lm_mode(swreg reg)
+{
+       return FIELD_GET(NN_REG_LM_MOD, swreg_raw(reg));
+}
+
+struct nfp_insn_ur_regs {
+       enum alu_dst_ab dst_ab;
+       u16 dst;
+       u16 areg, breg;
+       bool swap;
+       bool wr_both;
+       bool dst_lmextn;
+       bool src_lmextn;
+};
+
+struct nfp_insn_re_regs {
+       enum alu_dst_ab dst_ab;
+       u8 dst;
+       u8 areg, breg;
+       bool swap;
+       bool wr_both;
+       bool i8;
+       bool dst_lmextn;
+       bool src_lmextn;
+};
+
+int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg,
+                         struct nfp_insn_ur_regs *reg);
+int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg,
+                       struct nfp_insn_re_regs *reg, bool has_imm8);
+
+#define NFP_USTORE_PREFETCH_WINDOW     8
+
+int nfp_ustore_check_valid_no_ecc(u64 insn);
+u64 nfp_ustore_calc_ecc_insn(u64 insn);
 
 #endif
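
The swreg helpers above pack the register type, local-memory index/mode and value into a single 32-bit software-register word. A standalone sketch of the same packing with the shifts written out by hand (constants taken from the defines above; not driver code):

#include <assert.h>
#include <stdint.h>

#define TYPE_SHIFT      24      /* NN_REG_TYPE    GENMASK(31, 24) */
#define LM_IDX_SHIFT    22      /* NN_REG_LM_IDX  GENMASK(23, 22) */
#define LM_MOD_SHIFT    20      /* NN_REG_LM_MOD  GENMASK(21, 20) */

#define TYPE_LMEM       0x40    /* NN_REG_LMEM = BIT(6) */
#define LM_MOD_NONE     0

static uint32_t enc_swreg_lm(uint32_t idx, uint32_t mode, uint32_t off)
{
        return ((uint32_t)TYPE_LMEM << TYPE_SHIFT) |
               ((idx & 3) << LM_IDX_SHIFT) |
               ((mode & 3) << LM_MOD_SHIFT) |
               (off & 0xff);
}

int main(void)
{
        /* reg_lm(1, 0), as used by the BPF JIT's pv_len() macro */
        assert(enc_swreg_lm(1, LM_MOD_NONE, 0) == 0x40400000);
        return 0;
}
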
index b0a452b..782d452 100644 (file)
  * @NFP_NET_CFG_BPF_ADDR:      DMA address of the buffer with JITed BPF code
  */
 #define NFP_NET_CFG_BPF_ABI            0x0080
-#define   NFP_NET_BPF_ABI              1
+#define   NFP_NET_BPF_ABI              2
 #define NFP_NET_CFG_BPF_CAP            0x0081
 #define   NFP_NET_BPF_CAP_RELO         (1 << 0) /* seamless reload */
 #define NFP_NET_CFG_BPF_MAX_LEN                0x0082
index 8f6ccc0..6e15d3c 100644 (file)
@@ -2308,7 +2308,7 @@ static int qed_dcbnl_ieee_setapp(struct qed_dev *cdev, struct dcb_app *app)
 
        DP_VERBOSE(hwfn, QED_MSG_DCB, "selector = %d protocol = %d pri = %d\n",
                   app->selector, app->protocol, app->priority);
-       if (app->priority < 0 || app->priority >= QED_MAX_PFC_PRIORITIES) {
+       if (app->priority >= QED_MAX_PFC_PRIORITIES) {
                DP_INFO(hwfn, "Invalid priority %d\n", app->priority);
                return -EINVAL;
        }
index 8fc9c81..b2b1f87 100644 (file)
@@ -1415,7 +1415,12 @@ int qed_iwarp_alloc(struct qed_hwfn *p_hwfn)
 
 void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn)
 {
+       struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+
        qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1);
+       kfree(iwarp_info->mpa_bufs);
+       kfree(iwarp_info->partial_fpdus);
+       kfree(iwarp_info->mpa_intermediate_buf);
 }
 
 int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams)
@@ -1713,6 +1718,569 @@ qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn,
        return 0;
 }
 
+static struct qed_iwarp_fpdu *qed_iwarp_get_curr_fpdu(struct qed_hwfn *p_hwfn,
+                                                     u16 cid)
+{
+       struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+       struct qed_iwarp_fpdu *partial_fpdu;
+       u32 idx;
+
+       idx = cid - qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_IWARP);
+       if (idx >= iwarp_info->max_num_partial_fpdus) {
+               DP_ERR(p_hwfn, "Invalid cid %x max_num_partial_fpdus=%x\n", cid,
+                      iwarp_info->max_num_partial_fpdus);
+               return NULL;
+       }
+
+       partial_fpdu = &iwarp_info->partial_fpdus[idx];
+
+       return partial_fpdu;
+}
+
+enum qed_iwarp_mpa_pkt_type {
+       QED_IWARP_MPA_PKT_PACKED,
+       QED_IWARP_MPA_PKT_PARTIAL,
+       QED_IWARP_MPA_PKT_UNALIGNED
+};
+
+#define QED_IWARP_INVALID_FPDU_LENGTH 0xffff
+#define QED_IWARP_MPA_FPDU_LENGTH_SIZE (2)
+#define QED_IWARP_MPA_CRC32_DIGEST_SIZE (4)
+
+/* Pad to multiple of 4 */
+#define QED_IWARP_PDU_DATA_LEN_WITH_PAD(data_len) ALIGN(data_len, 4)
+#define QED_IWARP_FPDU_LEN_WITH_PAD(_mpa_len)                             \
+       (QED_IWARP_PDU_DATA_LEN_WITH_PAD((_mpa_len) +                      \
+                                        QED_IWARP_MPA_FPDU_LENGTH_SIZE) + \
+                                        QED_IWARP_MPA_CRC32_DIGEST_SIZE)
+
+/* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */
+#define QED_IWARP_MAX_BDS_PER_FPDU 3
+
+static const char * const pkt_type_str[] = {
+       "QED_IWARP_MPA_PKT_PACKED",
+       "QED_IWARP_MPA_PKT_PARTIAL",
+       "QED_IWARP_MPA_PKT_UNALIGNED"
+};
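
A quick standalone check of the FPDU padding arithmetic defined above: the 2-byte MPA length field is added, the result is rounded up to a multiple of 4, and the 4-byte CRC32 digest is appended (plain C, assert-only):

#include <assert.h>

#define ALIGN4(x)               (((x) + 3) & ~3)
#define FPDU_LEN_WITH_PAD(l)    (ALIGN4((l) + 2) + 4)   /* 2B length, 4B CRC */

int main(void)
{
        assert(FPDU_LEN_WITH_PAD(21) == 28);
        assert(FPDU_LEN_WITH_PAD(22) == 28);
        assert(FPDU_LEN_WITH_PAD(23) == 32);
        return 0;
}
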
+
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+                     struct qed_iwarp_fpdu *fpdu,
+                     struct qed_iwarp_ll2_buff *buf);
+
+static enum qed_iwarp_mpa_pkt_type
+qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn,
+                      struct qed_iwarp_fpdu *fpdu,
+                      u16 tcp_payload_len, u8 *mpa_data)
+{
+       enum qed_iwarp_mpa_pkt_type pkt_type;
+       u16 mpa_len;
+
+       if (fpdu->incomplete_bytes) {
+               pkt_type = QED_IWARP_MPA_PKT_UNALIGNED;
+               goto out;
+       }
+
+       /* Special case of one byte remaining...
+        * the lower byte will be read in the next packet.
+        */
+       if (tcp_payload_len == 1) {
+               fpdu->fpdu_length = *mpa_data << BITS_PER_BYTE;
+               pkt_type = QED_IWARP_MPA_PKT_PARTIAL;
+               goto out;
+       }
+
+       mpa_len = ntohs(*((u16 *)(mpa_data)));
+       fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len);
+
+       if (fpdu->fpdu_length <= tcp_payload_len)
+               pkt_type = QED_IWARP_MPA_PKT_PACKED;
+       else
+               pkt_type = QED_IWARP_MPA_PKT_PARTIAL;
+
+out:
+       DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                  "MPA_ALIGN: %s: fpdu_length=0x%x tcp_payload_len:0x%x\n",
+                  pkt_type_str[pkt_type], fpdu->fpdu_length, tcp_payload_len);
+
+       return pkt_type;
+}
+
+static void
+qed_iwarp_init_fpdu(struct qed_iwarp_ll2_buff *buf,
+                   struct qed_iwarp_fpdu *fpdu,
+                   struct unaligned_opaque_data *pkt_data,
+                   u16 tcp_payload_size, u8 placement_offset)
+{
+       fpdu->mpa_buf = buf;
+       fpdu->pkt_hdr = buf->data_phys_addr + placement_offset;
+       fpdu->pkt_hdr_size = pkt_data->tcp_payload_offset;
+       fpdu->mpa_frag = buf->data_phys_addr + pkt_data->first_mpa_offset;
+       fpdu->mpa_frag_virt = (u8 *)(buf->data) + pkt_data->first_mpa_offset;
+
+       if (tcp_payload_size == 1)
+               fpdu->incomplete_bytes = QED_IWARP_INVALID_FPDU_LENGTH;
+       else if (tcp_payload_size < fpdu->fpdu_length)
+               fpdu->incomplete_bytes = fpdu->fpdu_length - tcp_payload_size;
+       else
+               fpdu->incomplete_bytes = 0;     /* complete fpdu */
+
+       fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes;
+}
+
+static int
+qed_iwarp_cp_pkt(struct qed_hwfn *p_hwfn,
+                struct qed_iwarp_fpdu *fpdu,
+                struct unaligned_opaque_data *pkt_data,
+                struct qed_iwarp_ll2_buff *buf, u16 tcp_payload_size)
+{
+       u8 *tmp_buf = p_hwfn->p_rdma_info->iwarp.mpa_intermediate_buf;
+       int rc;
+
+       /* The data from the partial packet stored in the fpdu needs to be
+        * copied to the new buf; for this the data currently placed on the
+        * buf also needs to be moved. The buffer is assumed to be big enough
+        * since fpdu_length <= mss, and an intermediate buffer is used since
+        * the new data may need to be copied to an overlapping location.
+        */
+       if ((fpdu->mpa_frag_len + tcp_payload_size) > (u16)buf->buff_size) {
+               DP_ERR(p_hwfn,
+                      "MPA ALIGN: Unexpected: buffer is not large enough for split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n",
+                      buf->buff_size, fpdu->mpa_frag_len,
+                      tcp_payload_size, fpdu->incomplete_bytes);
+               return -EINVAL;
+       }
+
+       DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                  "MPA ALIGN Copying fpdu: [%p, %d] [%p, %d]\n",
+                  fpdu->mpa_frag_virt, fpdu->mpa_frag_len,
+                  (u8 *)(buf->data) + pkt_data->first_mpa_offset,
+                  tcp_payload_size);
+
+       memcpy(tmp_buf, fpdu->mpa_frag_virt, fpdu->mpa_frag_len);
+       memcpy(tmp_buf + fpdu->mpa_frag_len,
+              (u8 *)(buf->data) + pkt_data->first_mpa_offset,
+              tcp_payload_size);
+
+       rc = qed_iwarp_recycle_pkt(p_hwfn, fpdu, fpdu->mpa_buf);
+       if (rc)
+               return rc;
+
+       /* If we managed to post the buffer, copy the data to the new buffer;
+        * otherwise this will occur in the next round...
+        */
+       memcpy((u8 *)(buf->data), tmp_buf,
+              fpdu->mpa_frag_len + tcp_payload_size);
+
+       fpdu->mpa_buf = buf;
+       /* fpdu->pkt_hdr remains as is */
+       /* fpdu->mpa_frag is overridden with new buf */
+       fpdu->mpa_frag = buf->data_phys_addr;
+       fpdu->mpa_frag_virt = buf->data;
+       fpdu->mpa_frag_len += tcp_payload_size;
+
+       fpdu->incomplete_bytes -= tcp_payload_size;
+
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA ALIGN: split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n",
+                  buf->buff_size, fpdu->mpa_frag_len, tcp_payload_size,
+                  fpdu->incomplete_bytes);
+
+       return 0;
+}
+
+static void
+qed_iwarp_update_fpdu_length(struct qed_hwfn *p_hwfn,
+                            struct qed_iwarp_fpdu *fpdu, u8 *mpa_data)
+{
+       u16 mpa_len;
+
+       /* Update incomplete packets if needed */
+       if (fpdu->incomplete_bytes == QED_IWARP_INVALID_FPDU_LENGTH) {
+               /* Missing lower byte is now available */
+               mpa_len = fpdu->fpdu_length | *mpa_data;
+               fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len);
+               fpdu->mpa_frag_len = fpdu->fpdu_length;
+               /* one byte of hdr */
+               fpdu->incomplete_bytes = fpdu->fpdu_length - 1;
+               DP_VERBOSE(p_hwfn,
+                          QED_MSG_RDMA,
+                          "MPA_ALIGN: Partial header mpa_len=%x fpdu_length=%x incomplete_bytes=%x\n",
+                          mpa_len, fpdu->fpdu_length, fpdu->incomplete_bytes);
+       }
+}
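
The split two-byte MPA length case handled by qed_iwarp_mpa_classify() and qed_iwarp_update_fpdu_length() above can be illustrated with a tiny standalone sketch (hypothetical byte values): the high byte is parked shifted left when only one byte of the header arrives, and the low byte is ORed in from the next segment.

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint8_t first_seg[1] = { 0x01 };   /* only the high length byte */
        uint8_t next_seg[1]  = { 0x2c };   /* low byte arrives later */
        uint16_t fpdu_length;

        fpdu_length = (uint16_t)first_seg[0] << 8;   /* partial header */
        fpdu_length |= next_seg[0];                  /* completed next packet */

        assert(fpdu_length == 0x012c);
        return 0;
}
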
+
+#define QED_IWARP_IS_RIGHT_EDGE(_curr_pkt) \
+       (GET_FIELD((_curr_pkt)->flags,     \
+                  UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE))
+
+/* This function recycles a buffer using the ll2 drop option. The mechanism
+ * ensures that all buffers posted to tx before this one have completed. The
+ * buffer sent here is passed as a cookie in the tx completion function and
+ * can then be reposted to the rx chain when done. The flow that requires
+ * this is the one where an FPDU splits over more than 3 tcp segments: the
+ * driver then needs to re-post an rx buffer instead of the one received,
+ * but it can't simply repost the buffer it copied from, since that buffer
+ * may originally have been a packed FPDU which is partially posted to FW.
+ * The driver needs to ensure FW is done with it.
+ */
+static int
+qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn,
+                     struct qed_iwarp_fpdu *fpdu,
+                     struct qed_iwarp_ll2_buff *buf)
+{
+       struct qed_ll2_tx_pkt_info tx_pkt;
+       u8 ll2_handle;
+       int rc;
+
+       memset(&tx_pkt, 0, sizeof(tx_pkt));
+       tx_pkt.num_of_bds = 1;
+       tx_pkt.tx_dest = QED_LL2_TX_DEST_DROP;
+       tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2;
+       tx_pkt.first_frag = fpdu->pkt_hdr;
+       tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+       buf->piggy_buf = NULL;
+       tx_pkt.cookie = buf;
+
+       ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+       rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+       if (rc)
+               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                          "Can't drop packet rc=%d\n", rc);
+
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA_ALIGN: send drop tx packet [%lx, 0x%x], buf=%p, rc=%d\n",
+                  (unsigned long int)tx_pkt.first_frag,
+                  tx_pkt.first_frag_len, buf, rc);
+
+       return rc;
+}
+
+static int
+qed_iwarp_win_right_edge(struct qed_hwfn *p_hwfn, struct qed_iwarp_fpdu *fpdu)
+{
+       struct qed_ll2_tx_pkt_info tx_pkt;
+       u8 ll2_handle;
+       int rc;
+
+       memset(&tx_pkt, 0, sizeof(tx_pkt));
+       tx_pkt.num_of_bds = 1;
+       tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+       tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2;
+
+       tx_pkt.first_frag = fpdu->pkt_hdr;
+       tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+       tx_pkt.enable_ip_cksum = true;
+       tx_pkt.enable_l4_cksum = true;
+       tx_pkt.calc_ip_len = true;
+       /* vlan overload with enum iwarp_ll2_tx_queues */
+       tx_pkt.vlan = IWARP_LL2_ALIGNED_RIGHT_TRIMMED_TX_QUEUE;
+
+       ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+       rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+       if (rc)
+               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                          "Can't send right edge rc=%d\n", rc);
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA_ALIGN: Sent right edge FPDU num_bds=%d [%lx, 0x%x], rc=%d\n",
+                  tx_pkt.num_of_bds,
+                  (unsigned long int)tx_pkt.first_frag,
+                  tx_pkt.first_frag_len, rc);
+
+       return rc;
+}
+
+static int
+qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn,
+                   struct qed_iwarp_fpdu *fpdu,
+                   struct unaligned_opaque_data *curr_pkt,
+                   struct qed_iwarp_ll2_buff *buf,
+                   u16 tcp_payload_size, enum qed_iwarp_mpa_pkt_type pkt_type)
+{
+       struct qed_ll2_tx_pkt_info tx_pkt;
+       u8 ll2_handle;
+       int rc;
+
+       memset(&tx_pkt, 0, sizeof(tx_pkt));
+
+       /* An unaligned packet means it's split over two tcp segments. So the
+        * complete packet requires 3 bds: one for the header, one for the
+        * part of the fpdu in the first tcp segment, and the last fragment
+        * points to the remainder of the fpdu. A packed pdu requires only
+        * two bds, one for the header and one for the data.
+        */
+       tx_pkt.num_of_bds = (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED) ? 3 : 2;
+       tx_pkt.tx_dest = QED_LL2_TX_DEST_LB;
+       tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; /* offset in words */
+
+       /* Send the mpa_buf only with the last fpdu (in case of packed) */
+       if (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED ||
+           tcp_payload_size <= fpdu->fpdu_length)
+               tx_pkt.cookie = fpdu->mpa_buf;
+
+       tx_pkt.first_frag = fpdu->pkt_hdr;
+       tx_pkt.first_frag_len = fpdu->pkt_hdr_size;
+       tx_pkt.enable_ip_cksum = true;
+       tx_pkt.enable_l4_cksum = true;
+       tx_pkt.calc_ip_len = true;
+       /* vlan overload with enum iwarp_ll2_tx_queues */
+       tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE;
+
+       /* Special case of an unaligned packet that is not packed: we need to
+        * send both buffers as a cookie so both get released.
+        */
+       if (tcp_payload_size == fpdu->incomplete_bytes)
+               fpdu->mpa_buf->piggy_buf = buf;
+
+       ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle;
+
+       /* Set first fragment to header */
+       rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true);
+       if (rc)
+               goto out;
+
+       /* Set second fragment to first part of packet */
+       rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, ll2_handle,
+                                              fpdu->mpa_frag,
+                                              fpdu->mpa_frag_len);
+       if (rc)
+               goto out;
+
+       if (!fpdu->incomplete_bytes)
+               goto out;
+
+       /* Set third fragment to second part of the packet */
+       rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn,
+                                              ll2_handle,
+                                              buf->data_phys_addr +
+                                              curr_pkt->first_mpa_offset,
+                                              fpdu->incomplete_bytes);
+out:
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "MPA_ALIGN: Sent FPDU num_bds=%d first_frag_len=%x, mpa_frag_len=0x%x, incomplete_bytes:0x%x rc=%d\n",
+                  tx_pkt.num_of_bds,
+                  tx_pkt.first_frag_len,
+                  fpdu->mpa_frag_len,
+                  fpdu->incomplete_bytes, rc);
+
+       return rc;
+}
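
To make the 2-versus-3 BD layout in the comment above concrete, here is a small standalone sketch of how the fragment lengths relate. The struct and function are purely illustrative; only the split (header BD, first-part BD, optional remainder BD) follows the code above.

#include <stdio.h>

/* Illustrative fragment plan for one FPDU, mirroring the comment above:
 * a header BD, a first-part BD, and (only if unaligned) a remainder BD.
 */
struct frag_plan {
	unsigned int num_bds;
	unsigned int hdr_len;		/* rebuilt packet header */
	unsigned int first_part_len;	/* FPDU bytes in the first tcp segment */
	unsigned int remainder_len;	/* FPDU bytes in the next tcp segment */
};

static struct frag_plan plan_fpdu(unsigned int hdr_len, unsigned int fpdu_len,
				  unsigned int bytes_in_first_segment)
{
	struct frag_plan p = { .hdr_len = hdr_len };

	if (bytes_in_first_segment >= fpdu_len) {
		/* "Packed": the whole FPDU sits in one segment, 2 BDs. */
		p.num_bds = 2;
		p.first_part_len = fpdu_len;
	} else {
		/* "Unaligned": the FPDU spans two segments, 3 BDs. */
		p.num_bds = 3;
		p.first_part_len = bytes_in_first_segment;
		p.remainder_len = fpdu_len - bytes_in_first_segment;
	}
	return p;
}

int main(void)
{
	struct frag_plan p = plan_fpdu(54, 1500, 900);

	printf("bds=%u hdr=%u first=%u rest=%u\n",
	       p.num_bds, p.hdr_len, p.first_part_len, p.remainder_len);
	return 0;
}
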
+
+static void
+qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn,
+                      struct unaligned_opaque_data *curr_pkt,
+                      u32 opaque_data0, u32 opaque_data1)
+{
+       u64 opaque_data;
+
+       opaque_data = HILO_64(opaque_data1, opaque_data0);
+       *curr_pkt = *((struct unaligned_opaque_data *)&opaque_data);
+
+       curr_pkt->first_mpa_offset = curr_pkt->tcp_payload_offset +
+                                    le16_to_cpu(curr_pkt->first_mpa_offset);
+       curr_pkt->cid = le32_to_cpu(curr_pkt->cid);
+}
+
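
The helper above reinterprets two 32-bit completion words as a packed structure. A minimal standalone illustration of that pattern follows; the struct layout is invented for the example and is not the firmware's unaligned_opaque_data, and the real driver additionally applies le16/le32 conversions after unpacking.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Invented example layout; NOT the firmware's unaligned_opaque_data. */
struct example_opaque {
	uint32_t cid;
	uint16_t first_mpa_offset;
	uint8_t  tcp_payload_offset;
	uint8_t  flags;
};

int main(void)
{
	uint32_t data0 = 0x00000123;			/* low 32 bits */
	uint32_t data1 = 0x08040042;			/* high 32 bits */
	uint64_t packed = ((uint64_t)data1 << 32) | data0; /* HILO-style merge */
	struct example_opaque pkt;

	/* Copy instead of pointer-casting to stay strict-aliasing safe.
	 * The result is host-endian; the driver's version also performs
	 * le16/le32 conversions on the unpacked fields.
	 */
	memcpy(&pkt, &packed, sizeof(pkt));

	printf("cid=0x%x mpa_off=%u tcp_off=%u flags=0x%x\n",
	       (unsigned int)pkt.cid, (unsigned int)pkt.first_mpa_offset,
	       (unsigned int)pkt.tcp_payload_offset, (unsigned int)pkt.flags);
	return 0;
}
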
+/* This function is called when an unaligned or incomplete MPA packet arrives.
+ * The driver needs to align the packet, perhaps using previous data, and send
+ * it down to FW once it is aligned.
+ */
+static int
+qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn,
+                         struct qed_iwarp_ll2_mpa_buf *mpa_buf)
+{
+       struct unaligned_opaque_data *curr_pkt = &mpa_buf->data;
+       struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf;
+       enum qed_iwarp_mpa_pkt_type pkt_type;
+       struct qed_iwarp_fpdu *fpdu;
+       int rc = -EINVAL;
+       u8 *mpa_data;
+
+       fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, curr_pkt->cid & 0xffff);
+       if (!fpdu) { /* something corrupt with cid, post rx back */
+               DP_ERR(p_hwfn, "Invalid cid, drop and post back to rx cid=%x\n",
+                      curr_pkt->cid);
+               goto err;
+       }
+
+       do {
+               mpa_data = ((u8 *)(buf->data) + curr_pkt->first_mpa_offset);
+
+               pkt_type = qed_iwarp_mpa_classify(p_hwfn, fpdu,
+                                                 mpa_buf->tcp_payload_len,
+                                                 mpa_data);
+
+               switch (pkt_type) {
+               case QED_IWARP_MPA_PKT_PARTIAL:
+                       qed_iwarp_init_fpdu(buf, fpdu,
+                                           curr_pkt,
+                                           mpa_buf->tcp_payload_len,
+                                           mpa_buf->placement_offset);
+
+                       if (!QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) {
+                               mpa_buf->tcp_payload_len = 0;
+                               break;
+                       }
+
+                       rc = qed_iwarp_win_right_edge(p_hwfn, fpdu);
+
+                       if (rc) {
+                               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                                          "Can't send FPDU:reset rc=%d\n", rc);
+                               memset(fpdu, 0, sizeof(*fpdu));
+                               break;
+                       }
+
+                       mpa_buf->tcp_payload_len = 0;
+                       break;
+               case QED_IWARP_MPA_PKT_PACKED:
+                       qed_iwarp_init_fpdu(buf, fpdu,
+                                           curr_pkt,
+                                           mpa_buf->tcp_payload_len,
+                                           mpa_buf->placement_offset);
+
+                       rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+                                                mpa_buf->tcp_payload_len,
+                                                pkt_type);
+                       if (rc) {
+                               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                                          "Can't send FPDU:reset rc=%d\n", rc);
+                               memset(fpdu, 0, sizeof(*fpdu));
+                               break;
+                       }
+
+                       mpa_buf->tcp_payload_len -= fpdu->fpdu_length;
+                       curr_pkt->first_mpa_offset += fpdu->fpdu_length;
+                       break;
+               case QED_IWARP_MPA_PKT_UNALIGNED:
+                       qed_iwarp_update_fpdu_length(p_hwfn, fpdu, mpa_data);
+                       if (mpa_buf->tcp_payload_len < fpdu->incomplete_bytes) {
+                               /* special handling of fpdu split over more
+                                * than 2 segments
+                                */
+                               if (QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) {
+                                       rc = qed_iwarp_win_right_edge(p_hwfn,
+                                                                     fpdu);
+                                       /* packet will be re-processed later */
+                                       if (rc)
+                                               return rc;
+                               }
+
+                               rc = qed_iwarp_cp_pkt(p_hwfn, fpdu, curr_pkt,
+                                                     buf,
+                                                     mpa_buf->tcp_payload_len);
+                               if (rc) /* packet will be re-processed later */
+                                       return rc;
+
+                               mpa_buf->tcp_payload_len = 0;
+                               break;
+                       }
+
+                       rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf,
+                                                mpa_buf->tcp_payload_len,
+                                                pkt_type);
+                       if (rc) {
+                               DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
+                                          "Can't send FPDU:delay rc=%d\n", rc);
+                               /* don't reset fpdu -> we need it for next
+                                * classify
+                                */
+                               break;
+                       }
+
+                       mpa_buf->tcp_payload_len -= fpdu->incomplete_bytes;
+                       curr_pkt->first_mpa_offset += fpdu->incomplete_bytes;
+                       /* The framed PDU was sent - no more incomplete bytes */
+                       fpdu->incomplete_bytes = 0;
+                       break;
+               }
+       } while (mpa_buf->tcp_payload_len && !rc);
+
+       return rc;
+
+err:
+       qed_iwarp_ll2_post_rx(p_hwfn,
+                             buf,
+                             p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle);
+       return rc;
+}
+
+static void qed_iwarp_process_pending_pkts(struct qed_hwfn *p_hwfn)
+{
+       struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+       struct qed_iwarp_ll2_mpa_buf *mpa_buf = NULL;
+       int rc;
+
+       while (!list_empty(&iwarp_info->mpa_buf_pending_list)) {
+               mpa_buf = list_first_entry(&iwarp_info->mpa_buf_pending_list,
+                                          struct qed_iwarp_ll2_mpa_buf,
+                                          list_entry);
+
+               rc = qed_iwarp_process_mpa_pkt(p_hwfn, mpa_buf);
+
+               /* busy means break and continue processing later, don't
+                * remove the buf from the pending list.
+                */
+               if (rc == -EBUSY)
+                       break;
+
+               list_del(&mpa_buf->list_entry);
+               list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_list);
+
+               if (rc) {       /* different error, don't continue */
+                       DP_NOTICE(p_hwfn, "process pkts failed rc=%d\n", rc);
+                       break;
+               }
+       }
+}
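
The -EBUSY handling above (leave the buffer at the head of the pending list and retry on a later tx completion) is a generic pattern. A rough standalone sketch, with an invented processing function and a plain array standing in for the kernel list, might look like this:

#include <errno.h>
#include <stdio.h>

#define N_PENDING 4

/* Stand-in for the per-buffer processing: fails with -EBUSY while the
 * (imaginary) tx ring is full, succeeds otherwise. Purely illustrative.
 */
static int process_one(int id, int *tx_ring_space)
{
	if (*tx_ring_space == 0)
		return -EBUSY;
	(*tx_ring_space)--;
	printf("processed buffer %d\n", id);
	return 0;
}

static void process_pending(int *pending, int *count, int *tx_ring_space)
{
	while (*count) {
		int rc = process_one(pending[0], tx_ring_space);

		/* Busy: stop here and keep the buffer queued for a later try. */
		if (rc == -EBUSY)
			break;

		/* Completed (or hard error): pop the head of the queue. */
		for (int i = 1; i < *count; i++)
			pending[i - 1] = pending[i];
		(*count)--;

		if (rc) {
			printf("hard error %d, stop\n", rc);
			break;
		}
	}
}

int main(void)
{
	int pending[N_PENDING] = { 1, 2, 3, 4 };
	int count = N_PENDING;
	int tx_space = 2;

	process_pending(pending, &count, &tx_space);	/* handles 1 and 2 */
	tx_space = 2;					/* tx completions arrive */
	process_pending(pending, &count, &tx_space);	/* handles 3 and 4 */
	return 0;
}
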
+
+static void
+qed_iwarp_ll2_comp_mpa_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
+{
+       struct qed_iwarp_ll2_mpa_buf *mpa_buf;
+       struct qed_iwarp_info *iwarp_info;
+       struct qed_hwfn *p_hwfn = cxt;
+
+       iwarp_info = &p_hwfn->p_rdma_info->iwarp;
+       mpa_buf = list_first_entry(&iwarp_info->mpa_buf_list,
+                                  struct qed_iwarp_ll2_mpa_buf, list_entry);
+       if (!mpa_buf) {
+               DP_ERR(p_hwfn, "No free mpa buf\n");
+               goto err;
+       }
+
+       list_del(&mpa_buf->list_entry);
+       qed_iwarp_mpa_get_data(p_hwfn, &mpa_buf->data,
+                              data->opaque_data_0, data->opaque_data_1);
+
+       DP_VERBOSE(p_hwfn,
+                  QED_MSG_RDMA,
+                  "LL2 MPA CompRx payload_len:0x%x\tfirst_mpa_offset:0x%x\ttcp_payload_offset:0x%x\tflags:0x%x\tcid:0x%x\n",
+                  data->length.packet_length, mpa_buf->data.first_mpa_offset,
+                  mpa_buf->data.tcp_payload_offset, mpa_buf->data.flags,
+                  mpa_buf->data.cid);
+
+       mpa_buf->ll2_buf = data->cookie;
+       mpa_buf->tcp_payload_len = data->length.packet_length -
+                                  mpa_buf->data.first_mpa_offset;
+       mpa_buf->data.first_mpa_offset += data->u.placement_offset;
+       mpa_buf->placement_offset = data->u.placement_offset;
+
+       list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_pending_list);
+
+       qed_iwarp_process_pending_pkts(p_hwfn);
+       return;
+err:
+       qed_iwarp_ll2_post_rx(p_hwfn, data->cookie,
+                             iwarp_info->ll2_mpa_handle);
+}
+
 static void
 qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data)
 {
@@ -1855,10 +2423,25 @@ static void qed_iwarp_ll2_comp_tx_pkt(void *cxt, u8 connection_handle,
                                      bool b_last_fragment, bool b_last_packet)
 {
        struct qed_iwarp_ll2_buff *buffer = cookie;
+       struct qed_iwarp_ll2_buff *piggy;
        struct qed_hwfn *p_hwfn = cxt;
 
+       if (!buffer)            /* can happen in packed mpa unaligned... */
+               return;
+
        /* this was originally an rx packet, post it back */
+       piggy = buffer->piggy_buf;
+       if (piggy) {
+               buffer->piggy_buf = NULL;
+               qed_iwarp_ll2_post_rx(p_hwfn, piggy, connection_handle);
+       }
+
        qed_iwarp_ll2_post_rx(p_hwfn, buffer, connection_handle);
+
+       if (connection_handle == p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle)
+               qed_iwarp_process_pending_pkts(p_hwfn);
+
+       return;
 }
 
 static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
@@ -1871,12 +2454,44 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle,
        if (!buffer)
                return;
 
+       if (buffer->piggy_buf) {
+               dma_free_coherent(&p_hwfn->cdev->pdev->dev,
+                                 buffer->piggy_buf->buff_size,
+                                 buffer->piggy_buf->data,
+                                 buffer->piggy_buf->data_phys_addr);
+
+               kfree(buffer->piggy_buf);
+       }
+
        dma_free_coherent(&p_hwfn->cdev->pdev->dev, buffer->buff_size,
                          buffer->data, buffer->data_phys_addr);
 
        kfree(buffer);
 }
 
+/* The only slowpath for iwarp ll2 is the unaligned flush. When this
+ * completion is received, the driver needs to reset the FPDU.
+ */
+void
+qed_iwarp_ll2_slowpath(void *cxt,
+                      u8 connection_handle,
+                      u32 opaque_data_0, u32 opaque_data_1)
+{
+       struct unaligned_opaque_data unalign_data;
+       struct qed_hwfn *p_hwfn = cxt;
+       struct qed_iwarp_fpdu *fpdu;
+
+       qed_iwarp_mpa_get_data(p_hwfn, &unalign_data,
+                              opaque_data_0, opaque_data_1);
+
+       DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "(0x%x) Flush fpdu\n",
+                  unalign_data.cid);
+
+       fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, (u16)unalign_data.cid);
+       if (fpdu)
+               memset(fpdu, 0, sizeof(*fpdu));
+}
+
 static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 {
        struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp;
@@ -1902,6 +2517,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
                iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
        }
 
+       if (iwarp_info->ll2_mpa_handle != QED_IWARP_HANDLE_INVAL) {
+               rc = qed_ll2_terminate_connection(p_hwfn,
+                                                 iwarp_info->ll2_mpa_handle);
+               if (rc)
+                       DP_INFO(p_hwfn, "Failed to terminate mpa connection\n");
+
+               qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_mpa_handle);
+               iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL;
+       }
+
        qed_llh_remove_mac_filter(p_hwfn,
                                  p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr);
        return rc;
@@ -1953,12 +2578,15 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn,
        struct qed_iwarp_info *iwarp_info;
        struct qed_ll2_acquire_data data;
        struct qed_ll2_cbs cbs;
+       u32 mpa_buff_size;
        u16 n_ooo_bufs;
        int rc = 0;
+       int i;
 
        iwarp_info = &p_hwfn->p_rdma_info->iwarp;
        iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL;
        iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL;
+       iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL;
 
        iwarp_info->max_mtu = params->max_mtu;
 
@@ -2029,6 +2657,68 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn,
        if (rc)
                goto err;
 
+       /* Start Unaligned MPA connection */
+       cbs.rx_comp_cb = qed_iwarp_ll2_comp_mpa_pkt;
+       cbs.slowpath_cb = qed_iwarp_ll2_slowpath;
+
+       memset(&data, 0, sizeof(data));
+       data.input.conn_type = QED_LL2_TYPE_IWARP;
+       data.input.mtu = params->max_mtu;
+       /* FW requires that once a packet arrives OOO, there must be at
+        * least 2 rx buffers available on the unaligned connection
+        * to handle the case that it is a partial fpdu.
+        */
+       data.input.rx_num_desc = n_ooo_bufs * 2;
+       data.input.tx_num_desc = data.input.rx_num_desc;
+       data.input.tx_max_bds_per_packet = QED_IWARP_MAX_BDS_PER_FPDU;
+       data.p_connection_handle = &iwarp_info->ll2_mpa_handle;
+       data.input.secondary_queue = true;
+       data.cbs = &cbs;
+
+       rc = qed_ll2_acquire_connection(p_hwfn, &data);
+       if (rc)
+               goto err;
+
+       rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_mpa_handle);
+       if (rc)
+               goto err;
+
+       mpa_buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu);
+       rc = qed_iwarp_ll2_alloc_buffers(p_hwfn,
+                                        data.input.rx_num_desc,
+                                        mpa_buff_size,
+                                        iwarp_info->ll2_mpa_handle);
+       if (rc)
+               goto err;
+
+       iwarp_info->partial_fpdus = kcalloc((u16)p_hwfn->p_rdma_info->num_qps,
+                                           sizeof(*iwarp_info->partial_fpdus),
+                                           GFP_KERNEL);
+       if (!iwarp_info->partial_fpdus)
+               goto err;
+
+       iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps;
+
+       iwarp_info->mpa_intermediate_buf = kzalloc(mpa_buff_size, GFP_KERNEL);
+       if (!iwarp_info->mpa_intermediate_buf)
+               goto err;
+
+       /* The mpa_bufs array serves for pending RX packets received on the
+        * mpa ll2 that don't have room on the tx ring and require later
+        * processing. We can't fail on allocation of such a struct, therefore
+        * we allocate enough to take care of all rx packets.
+        */
+       iwarp_info->mpa_bufs = kcalloc(data.input.rx_num_desc,
+                                      sizeof(*iwarp_info->mpa_bufs),
+                                      GFP_KERNEL);
+       if (!iwarp_info->mpa_bufs)
+               goto err;
+
+       INIT_LIST_HEAD(&iwarp_info->mpa_buf_pending_list);
+       INIT_LIST_HEAD(&iwarp_info->mpa_buf_list);
+       for (i = 0; i < data.input.rx_num_desc; i++)
+               list_add_tail(&iwarp_info->mpa_bufs[i].list_entry,
+                             &iwarp_info->mpa_buf_list);
        return rc;
 err:
        qed_iwarp_ll2_stop(p_hwfn, p_ptt);
index 9e2bfde..c1ecd74 100644 (file)
@@ -55,15 +55,43 @@ enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state);
 #define QED_IWARP_HANDLE_INVAL         (0xff)
 
 struct qed_iwarp_ll2_buff {
+       struct qed_iwarp_ll2_buff *piggy_buf;
        void *data;
        dma_addr_t data_phys_addr;
        u32 buff_size;
 };
 
+struct qed_iwarp_ll2_mpa_buf {
+       struct list_head list_entry;
+       struct qed_iwarp_ll2_buff *ll2_buf;
+       struct unaligned_opaque_data data;
+       u16 tcp_payload_len;
+       u8 placement_offset;
+};
+
+/* In some cases an fpdu will arrive with only one byte of the header. In this
+ * case the fpdu_length will be partial (contain only the higher byte) and
+ * incomplete_bytes will contain the invalid value below.
+ */
+#define QED_IWARP_INVALID_INCOMPLETE_BYTES 0xffff
+
+struct qed_iwarp_fpdu {
+       struct qed_iwarp_ll2_buff *mpa_buf;
+       void *mpa_frag_virt;
+       dma_addr_t mpa_frag;
+       dma_addr_t pkt_hdr;
+       u16 mpa_frag_len;
+       u16 fpdu_length;
+       u16 incomplete_bytes;
+       u8 pkt_hdr_size;
+};
+
 struct qed_iwarp_info {
        struct list_head listen_list;   /* qed_iwarp_listener */
        struct list_head ep_list;       /* qed_iwarp_ep */
        struct list_head ep_free_list;  /* pre-allocated ep's */
+       struct list_head mpa_buf_list;  /* list of mpa_bufs */
+       struct list_head mpa_buf_pending_list;
        spinlock_t iw_lock;     /* for iwarp resources */
        spinlock_t qp_lock;     /* for teardown races */
        u32 rcv_wnd_scale;
@@ -73,9 +101,14 @@ struct qed_iwarp_info {
        u8 tcp_flags;
        u8 ll2_syn_handle;
        u8 ll2_ooo_handle;
+       u8 ll2_mpa_handle;
        u8 peer2peer;
        enum mpa_negotiation_mode mpa_rev;
        enum mpa_rtr_type rtr_type;
+       struct qed_iwarp_fpdu *partial_fpdus;
+       struct qed_iwarp_ll2_mpa_buf *mpa_bufs;
+       u8 *mpa_intermediate_buf;
+       u16 max_num_partial_fpdus;
 };
 
 enum qed_iwarp_ep_state {
index 250afa5..047f556 100644 (file)
@@ -423,6 +423,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn,
 }
 
 static int
+qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn,
+                       struct qed_ll2_info *p_ll2_conn,
+                       union core_rx_cqe_union *p_cqe,
+                       unsigned long *p_lock_flags)
+{
+       struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue;
+       struct core_rx_slow_path_cqe *sp_cqe;
+
+       sp_cqe = &p_cqe->rx_cqe_sp;
+       if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) {
+               DP_NOTICE(p_hwfn,
+                         "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n",
+                         sp_cqe->ramrod_cmd_id);
+               return -EINVAL;
+       }
+
+       if (!p_ll2_conn->cbs.slowpath_cb) {
+               DP_NOTICE(p_hwfn,
+                         "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n");
+               return -EINVAL;
+       }
+
+       spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags);
+
+       p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie,
+                                   p_ll2_conn->my_id,
+                                   le32_to_cpu(sp_cqe->opaque_data.data[0]),
+                                   le32_to_cpu(sp_cqe->opaque_data.data[1]));
+
+       spin_lock_irqsave(&p_rx->lock, *p_lock_flags);
+
+       return 0;
+}
+
+static int
 qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn,
                              struct qed_ll2_info *p_ll2_conn,
                              union core_rx_cqe_union *p_cqe,
@@ -495,8 +530,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie)
 
                switch (cqe->rx_cqe_sp.type) {
                case CORE_RX_CQE_TYPE_SLOW_PATH:
-                       DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n");
-                       rc = -EINVAL;
+                       rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn,
+                                                    cqe, &flags);
                        break;
                case CORE_RX_CQE_TYPE_GSI_OFFLOAD:
                case CORE_RX_CQE_TYPE_REGULAR:
@@ -894,7 +929,7 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn,
        p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg;
        p_ramrod->inner_vlan_removal_en = p_ll2_conn->input.rx_vlan_removal_en;
        p_ramrod->queue_id = p_ll2_conn->queue_id;
-       p_ramrod->main_func_queue = (conn_type == QED_LL2_TYPE_OOO) ? 0 : 1;
+       p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0;
 
        if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) &&
            p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) &&
@@ -1105,6 +1140,7 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
                                         struct qed_ll2_info *p_ll2_info)
 {
        struct qed_ll2_tx_packet *p_descq;
+       u32 desc_size;
        u32 capacity;
        int rc = 0;
 
@@ -1122,13 +1158,17 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn,
                goto out;
 
        capacity = qed_chain_get_capacity(&p_ll2_info->tx_queue.txq_chain);
-       p_descq = kcalloc(capacity, sizeof(struct qed_ll2_tx_packet),
-                         GFP_KERNEL);
+       /* First element is part of the packet, rest are flexibly added */
+       desc_size = (sizeof(*p_descq) +
+                    (p_ll2_info->input.tx_max_bds_per_packet - 1) *
+                    sizeof(p_descq->bds_set));
+
+       p_descq = kcalloc(capacity, desc_size, GFP_KERNEL);
        if (!p_descq) {
                rc = -ENOMEM;
                goto out;
        }
-       p_ll2_info->tx_queue.descq_array = p_descq;
+       p_ll2_info->tx_queue.descq_mem = p_descq;
 
        DP_VERBOSE(p_hwfn, QED_MSG_LL2,
                   "Allocated LL2 Txq [Type %08x] with 0x%08x buffers\n",
@@ -1209,6 +1249,7 @@ qed_ll2_set_cbs(struct qed_ll2_info *p_ll2_info, const struct qed_ll2_cbs *cbs)
        p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb;
        p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb;
        p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb;
+       p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb;
        p_ll2_info->cbs.cookie = cbs->cookie;
 
        return 0;
@@ -1260,6 +1301,11 @@ int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data)
 
        p_ll2_info->tx_dest = (data->input.tx_dest == QED_LL2_TX_DEST_NW) ?
                              CORE_TX_DEST_NW : CORE_TX_DEST_LB;
+       if (data->input.conn_type == QED_LL2_TYPE_OOO ||
+           data->input.secondary_queue)
+               p_ll2_info->main_func_queue = false;
+       else
+               p_ll2_info->main_func_queue = true;
 
        /* Correct maximum number of Tx BDs */
        p_tx_max = &p_ll2_info->input.tx_max_bds_per_packet;
@@ -1359,11 +1405,13 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
 {
        struct qed_hwfn *p_hwfn = cxt;
        struct qed_ll2_info *p_ll2_conn;
+       struct qed_ll2_tx_packet *p_pkt;
        struct qed_ll2_rx_queue *p_rx;
        struct qed_ll2_tx_queue *p_tx;
        struct qed_ptt *p_ptt;
        int rc = -EINVAL;
        u32 i, capacity;
+       u32 desc_size;
        u8 qid;
 
        p_ptt = qed_ptt_acquire(p_hwfn);
@@ -1397,9 +1445,15 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle)
        INIT_LIST_HEAD(&p_tx->sending_descq);
        spin_lock_init(&p_tx->lock);
        capacity = qed_chain_get_capacity(&p_tx->txq_chain);
-       for (i = 0; i < capacity; i++)
-               list_add_tail(&p_tx->descq_array[i].list_entry,
-                             &p_tx->free_descq);
+       /* First element is part of the packet, rest are flexibly added */
+       desc_size = (sizeof(*p_pkt) +
+                    (p_ll2_conn->input.tx_max_bds_per_packet - 1) *
+                    sizeof(p_pkt->bds_set));
+
+       for (i = 0; i < capacity; i++) {
+               p_pkt = p_tx->descq_mem + desc_size * i;
+               list_add_tail(&p_pkt->list_entry, &p_tx->free_descq);
+       }
        p_tx->cur_completing_bd_idx = 0;
        p_tx->bds_idx = 0;
        p_tx->b_completing_packet = false;
@@ -1579,11 +1633,28 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
        roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? CORE_ROCE
                                                             : CORE_RROCE;
 
-       tx_dest = (pkt->tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW
-                                                      : CORE_TX_DEST_LB;
+       switch (pkt->tx_dest) {
+       case QED_LL2_TX_DEST_NW:
+               tx_dest = CORE_TX_DEST_NW;
+               break;
+       case QED_LL2_TX_DEST_LB:
+               tx_dest = CORE_TX_DEST_LB;
+               break;
+       case QED_LL2_TX_DEST_DROP:
+               tx_dest = CORE_TX_DEST_DROP;
+               break;
+       default:
+               tx_dest = CORE_TX_DEST_LB;
+               break;
+       }
 
        start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain);
-       start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
+       if (QED_IS_IWARP_PERSONALITY(p_hwfn) &&
+           p_ll2->input.conn_type == QED_LL2_TYPE_OOO)
+               start_bd->nw_vlan_or_lb_echo =
+                   cpu_to_le16(IWARP_LL2_IN_ORDER_TX_QUEUE);
+       else
+               start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan);
        SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W,
                  cpu_to_le16(pkt->l4_hdr_offset_w));
        SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest);
@@ -1591,6 +1662,9 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn,
        SET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD, 0x1);
        SET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS, pkt->num_of_bds);
        SET_FIELD(bd_data, CORE_TX_BD_DATA_ROCE_FLAV, roce_flavor);
+       SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_CSUM, !!(pkt->enable_ip_cksum));
+       SET_FIELD(bd_data, CORE_TX_BD_DATA_L4_CSUM, !!(pkt->enable_l4_cksum));
+       SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_LEN, !!(pkt->calc_ip_len));
        start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data);
        DMA_REGPAIR_LE(start_bd->addr, pkt->first_frag);
        start_bd->nbytes = cpu_to_le16(pkt->first_frag_len);
@@ -1698,7 +1772,7 @@ int qed_ll2_prepare_tx_packet(void *cxt,
        p_tx = &p_ll2_conn->tx_queue;
        p_tx_chain = &p_tx->txq_chain;
 
-       if (pkt->num_of_bds > CORE_LL2_TX_MAX_BDS_PER_PACKET)
+       if (pkt->num_of_bds > p_ll2_conn->input.tx_max_bds_per_packet)
                return -EIO;
 
        spin_lock_irqsave(&p_tx->lock, flags);
@@ -1858,7 +1932,7 @@ void qed_ll2_release_connection(void *cxt, u8 connection_handle)
                qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index);
        }
 
-       kfree(p_ll2_conn->tx_queue.descq_array);
+       kfree(p_ll2_conn->tx_queue.descq_mem);
        qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain);
 
        kfree(p_ll2_conn->rx_queue.descq_array);
index a822528..f658170 100644 (file)
@@ -63,17 +63,14 @@ struct qed_ll2_rx_packet {
 struct qed_ll2_tx_packet {
        struct list_head list_entry;
        u16 bd_used;
-       u16 vlan;
-       u16 l4_hdr_offset_w;
-       u8 bd_flags;
        bool notify_fw;
        void *cookie;
-
+       /* Flexible Array of bds_set determined by max_bds_per_packet */
        struct {
                struct core_tx_bd *txq_bd;
                dma_addr_t tx_frag;
                u16 frag_len;
-       } bds_set[ETH_TX_MAX_BDS_PER_NON_LSO_PACKET];
+       } bds_set[1];
 };
 
 struct qed_ll2_rx_queue {
@@ -101,7 +98,7 @@ struct qed_ll2_tx_queue {
        struct list_head active_descq;
        struct list_head free_descq;
        struct list_head sending_descq;
-       struct qed_ll2_tx_packet *descq_array;
+       void *descq_mem; /* memory for variable sized qed_ll2_tx_packet */
        struct qed_ll2_tx_packet *cur_send_packet;
        struct qed_ll2_tx_packet cur_completing_packet;
        u16 cur_completing_bd_idx;
@@ -124,6 +121,7 @@ struct qed_ll2_info {
        bool b_active;
        enum core_tx_dest tx_dest;
        u8 tx_stats_en;
+       bool main_func_queue;
        struct qed_ll2_rx_queue rx_queue;
        struct qed_ll2_tx_queue tx_queue;
        struct qed_ll2_cbs cbs;
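
The descq_array to descq_mem change in this file replaces a fixed-size bds_set[] with per-connection sizing: the element size is computed at runtime and elements are addressed by byte offset. A standalone sketch of that indexing trick is below; the struct is a simplified stand-in, not the real qed_ll2_tx_packet.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for qed_ll2_tx_packet: one bds_set element lives in
 * the struct, the rest are appended based on the connection's max BD count.
 */
struct tx_pkt {
	int bd_used;
	struct {
		uint64_t frag;
		uint16_t frag_len;
	} bds_set[1];
};

int main(void)
{
	unsigned int capacity = 8, max_bds = 3;
	/* First bds_set element is part of the struct, add the remainder. */
	size_t desc_size = sizeof(struct tx_pkt) +
			   (max_bds - 1) * sizeof(((struct tx_pkt *)0)->bds_set[0]);
	unsigned char *descq_mem = calloc(capacity, desc_size);

	if (!descq_mem)
		return 1;

	for (unsigned int i = 0; i < capacity; i++) {
		/* Element i lives at a byte offset, not at descq[i]. */
		struct tx_pkt *p = (struct tx_pkt *)(descq_mem + i * desc_size);

		p->bd_used = 0;
	}

	printf("desc_size=%zu total=%zu\n", desc_size, capacity * desc_size);
	free(descq_mem);
	return 0;
}
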
index 866444b..2c6d7c6 100644 (file)
 #define NSS_COMMON_CLK_SRC_CTRL_RGMII(x)       1
 #define NSS_COMMON_CLK_SRC_CTRL_SGMII(x)       ((x >= 2) ? 1 : 0)
 
-#define NSS_COMMON_MACSEC_CTL                  0x28
-#define NSS_COMMON_MACSEC_CTL_EXT_BYPASS_EN(x) (1 << x)
-
 #define NSS_COMMON_GMAC_CTL(x)                 (0x30 + (x * 4))
 #define NSS_COMMON_GMAC_CTL_CSYS_REQ           BIT(19)
 #define NSS_COMMON_GMAC_CTL_PHY_IFACE_SEL      BIT(16)
 #define NSS_COMMON_GMAC_CTL_IFG_LIMIT_OFFSET   8
 #define NSS_COMMON_GMAC_CTL_IFG_OFFSET         0
-#define NSS_COMMON_GMAC_CTL_IFG_MASK           0x3f
 
 #define NSS_COMMON_CLK_DIV_RGMII_1000          1
 #define NSS_COMMON_CLK_DIV_RGMII_100           9
@@ -68,9 +64,6 @@
 #define NSS_COMMON_CLK_DIV_SGMII_100           4
 #define NSS_COMMON_CLK_DIV_SGMII_10            49
 
-#define QSGMII_PCS_MODE_CTL                    0x68
-#define QSGMII_PCS_MODE_CTL_AUTONEG_EN(x)      BIT((x * 8) + 7)
-
 #define QSGMII_PCS_CAL_LCKDT_CTL               0x120
 #define QSGMII_PCS_CAL_LCKDT_CTL_RST           BIT(19)
 
 #define QSGMII_PHY_TX_DRIVER_EN                        BIT(3)
 #define QSGMII_PHY_QSGMII_EN                   BIT(7)
 #define QSGMII_PHY_PHASE_LOOP_GAIN_OFFSET      12
-#define QSGMII_PHY_PHASE_LOOP_GAIN_MASK                0x7
 #define QSGMII_PHY_RX_DC_BIAS_OFFSET           18
-#define QSGMII_PHY_RX_DC_BIAS_MASK             0x3
 #define QSGMII_PHY_RX_INPUT_EQU_OFFSET         20
-#define QSGMII_PHY_RX_INPUT_EQU_MASK           0x3
 #define QSGMII_PHY_CDR_PI_SLEW_OFFSET          22
-#define QSGMII_PHY_CDR_PI_SLEW_MASK            0x3
 #define QSGMII_PHY_TX_DRV_AMP_OFFSET           28
-#define QSGMII_PHY_TX_DRV_AMP_MASK             0xf
 
 struct ipq806x_gmac {
        struct platform_device *pdev;
@@ -217,7 +205,7 @@ static int ipq806x_gmac_of_parse(struct ipq806x_gmac *gmac)
         * code and keep it consistent with the Linux convention, we'll number
         * them from 0 to 3 here.
         */
-       if (gmac->id < 0 || gmac->id > 3) {
+       if (gmac->id > 3) {
                dev_err(dev, "invalid gmac id\n");
                return -EINVAL;
        }
index 6f550e1..a81335e 100644 (file)
@@ -704,6 +704,14 @@ struct netvsc_reconfig {
        u32 event;
 };
 
+/* L4 hash bits for different protocols */
+#define HV_TCP4_L4HASH 1
+#define HV_TCP6_L4HASH 2
+#define HV_UDP4_L4HASH 4
+#define HV_UDP6_L4HASH 8
+#define HV_DEFAULT_L4HASH (HV_TCP4_L4HASH | HV_TCP6_L4HASH | HV_UDP4_L4HASH | \
+                          HV_UDP6_L4HASH)
+
 /* The context of the netvsc device  */
 struct net_device_context {
        /* point back to our device context */
@@ -726,10 +734,9 @@ struct net_device_context {
        u32 tx_send_table[VRSS_SEND_TAB_SIZE];
 
        /* Ethtool settings */
-       bool udp4_l4_hash;
-       bool udp6_l4_hash;
        u8 duplex;
        u32 speed;
+       u32 l4_hash; /* L4 hash settings */
        struct netvsc_ethtool_stats eth_stats;
 
        /* State to manage the associated VF interface. */
index dfb9864..44746de 100644 (file)
@@ -203,7 +203,7 @@ static inline u32 netvsc_get_hash(
        const struct net_device_context *ndc)
 {
        struct flow_keys flow;
-       u32 hash;
+       u32 hash, pkt_proto = 0;
        static u32 hashrnd __read_mostly;
 
        net_get_random_once(&hashrnd, sizeof(hashrnd));
@@ -211,11 +211,25 @@ static inline u32 netvsc_get_hash(
        if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
                return 0;
 
-       if (flow.basic.ip_proto == IPPROTO_TCP ||
-           (flow.basic.ip_proto == IPPROTO_UDP &&
-            ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) ||
-             (flow.basic.n_proto == htons(ETH_P_IPV6) &&
-              ndc->udp6_l4_hash)))) {
+       switch (flow.basic.ip_proto) {
+       case IPPROTO_TCP:
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       pkt_proto = HV_TCP4_L4HASH;
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       pkt_proto = HV_TCP6_L4HASH;
+
+               break;
+
+       case IPPROTO_UDP:
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       pkt_proto = HV_UDP4_L4HASH;
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       pkt_proto = HV_UDP6_L4HASH;
+
+               break;
+       }
+
+       if (pkt_proto & ndc->l4_hash) {
                return skb_get_hash(skb);
        } else {
                if (flow.basic.n_proto == htons(ETH_P_IP))
@@ -898,8 +912,7 @@ static void netvsc_init_settings(struct net_device *dev)
 {
        struct net_device_context *ndc = netdev_priv(dev);
 
-       ndc->udp4_l4_hash = true;
-       ndc->udp6_l4_hash = true;
+       ndc->l4_hash = HV_DEFAULT_L4HASH;
 
        ndc->speed = SPEED_UNKNOWN;
        ndc->duplex = DUPLEX_FULL;
@@ -1245,23 +1258,32 @@ static int
 netvsc_get_rss_hash_opts(struct net_device_context *ndc,
                         struct ethtool_rxnfc *info)
 {
+       const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
        info->data = RXH_IP_SRC | RXH_IP_DST;
 
        switch (info->flow_type) {
        case TCP_V4_FLOW:
+               if (ndc->l4_hash & HV_TCP4_L4HASH)
+                       info->data |= l4_flag;
+
+               break;
+
        case TCP_V6_FLOW:
-               info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+               if (ndc->l4_hash & HV_TCP6_L4HASH)
+                       info->data |= l4_flag;
+
                break;
 
        case UDP_V4_FLOW:
-               if (ndc->udp4_l4_hash)
-                       info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+               if (ndc->l4_hash & HV_UDP4_L4HASH)
+                       info->data |= l4_flag;
 
                break;
 
        case UDP_V6_FLOW:
-               if (ndc->udp6_l4_hash)
-                       info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+               if (ndc->l4_hash & HV_UDP6_L4HASH)
+                       info->data |= l4_flag;
 
                break;
 
@@ -1302,23 +1324,51 @@ static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
 {
        if (info->data == (RXH_IP_SRC | RXH_IP_DST |
                           RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
-               if (info->flow_type == UDP_V4_FLOW)
-                       ndc->udp4_l4_hash = true;
-               else if (info->flow_type == UDP_V6_FLOW)
-                       ndc->udp6_l4_hash = true;
-               else
+               switch (info->flow_type) {
+               case TCP_V4_FLOW:
+                       ndc->l4_hash |= HV_TCP4_L4HASH;
+                       break;
+
+               case TCP_V6_FLOW:
+                       ndc->l4_hash |= HV_TCP6_L4HASH;
+                       break;
+
+               case UDP_V4_FLOW:
+                       ndc->l4_hash |= HV_UDP4_L4HASH;
+                       break;
+
+               case UDP_V6_FLOW:
+                       ndc->l4_hash |= HV_UDP6_L4HASH;
+                       break;
+
+               default:
                        return -EOPNOTSUPP;
+               }
 
                return 0;
        }
 
        if (info->data == (RXH_IP_SRC | RXH_IP_DST)) {
-               if (info->flow_type == UDP_V4_FLOW)
-                       ndc->udp4_l4_hash = false;
-               else if (info->flow_type == UDP_V6_FLOW)
-                       ndc->udp6_l4_hash = false;
-               else
+               switch (info->flow_type) {
+               case TCP_V4_FLOW:
+                       ndc->l4_hash &= ~HV_TCP4_L4HASH;
+                       break;
+
+               case TCP_V6_FLOW:
+                       ndc->l4_hash &= ~HV_TCP6_L4HASH;
+                       break;
+
+               case UDP_V4_FLOW:
+                       ndc->l4_hash &= ~HV_UDP4_L4HASH;
+                       break;
+
+               case UDP_V6_FLOW:
+                       ndc->l4_hash &= ~HV_UDP6_L4HASH;
+                       break;
+
+               default:
                        return -EOPNOTSUPP;
+               }
 
                return 0;
        }
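
The netvsc change collapses the per-protocol booleans into one l4_hash bitmask, so the receive path can test a single bit and the ethtool handlers can set or clear it. A compact standalone sketch of that pattern follows; the HV_* values are copied from the header hunk above, everything else is invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Bits as defined in the header hunk above. */
#define HV_TCP4_L4HASH 1
#define HV_TCP6_L4HASH 2
#define HV_UDP4_L4HASH 4
#define HV_UDP6_L4HASH 8
#define HV_DEFAULT_L4HASH (HV_TCP4_L4HASH | HV_TCP6_L4HASH | \
			   HV_UDP4_L4HASH | HV_UDP6_L4HASH)

enum proto { TCP4, TCP6, UDP4, UDP6 };

static unsigned int proto_bit(enum proto p)
{
	switch (p) {
	case TCP4: return HV_TCP4_L4HASH;
	case TCP6: return HV_TCP6_L4HASH;
	case UDP4: return HV_UDP4_L4HASH;
	case UDP6: return HV_UDP6_L4HASH;
	}
	return 0;
}

int main(void)
{
	unsigned int l4_hash = HV_DEFAULT_L4HASH;

	/* ethtool-style disable of UDP4 L4 hashing ... */
	l4_hash &= ~proto_bit(UDP4);

	/* ... and the per-packet check used on the receive path. */
	bool use_l4_hash = proto_bit(UDP4) & l4_hash;

	printf("udp4 l4 hash enabled: %d\n", use_l4_hash);
	return 0;
}
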
index cd931cf..e2cf8ff 100644 (file)
@@ -366,6 +366,11 @@ config REALTEK_PHY
        ---help---
          Supports the Realtek 821x PHY.
 
+config RENESAS_PHY
+       tristate "Driver for Renesas PHYs"
+       ---help---
+         Supports the Renesas PHYs uPD60620 and uPD60620A.
+
 config ROCKCHIP_PHY
         tristate "Driver for Rockchip Ethernet PHYs"
         ---help---
index 416df92..1404ad3 100644 (file)
@@ -72,6 +72,7 @@ obj-$(CONFIG_MICROSEMI_PHY)   += mscc.o
 obj-$(CONFIG_NATIONAL_PHY)     += national.o
 obj-$(CONFIG_QSEMI_PHY)                += qsemi.o
 obj-$(CONFIG_REALTEK_PHY)      += realtek.o
+obj-$(CONFIG_RENESAS_PHY)      += uPD60620.o
 obj-$(CONFIG_ROCKCHIP_PHY)     += rockchip.o
 obj-$(CONFIG_SMSC_PHY)         += smsc.o
 obj-$(CONFIG_STE10XP)          += ste10Xp.o
diff --git a/drivers/net/phy/uPD60620.c b/drivers/net/phy/uPD60620.c
new file mode 100644 (file)
index 0000000..96b3347
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+ * Driver for the Renesas PHY uPD60620.
+ *
+ * Copyright (C) 2015 Softing Industrial Automation GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/phy.h>
+
+#define UPD60620_PHY_ID    0xb8242824
+
+/* Extended Registers and values */
+/* PHY Special Control/Status    */
+#define PHY_PHYSCR         0x1F      /* PHY.31 */
+#define PHY_PHYSCR_10MB    0x0004    /* PHY speed = 10mb */
+#define PHY_PHYSCR_100MB   0x0008    /* PHY speed = 100mb */
+#define PHY_PHYSCR_DUPLEX  0x0010    /* PHY Duplex */
+
+/* PHY Special Modes */
+#define PHY_SPM            0x12      /* PHY.18 */
+
+/* Init PHY */
+
+static int upd60620_config_init(struct phy_device *phydev)
+{
+       /* Enable support for passive HUBs (could be a strap option) */
+       /* PHYMODE: All speeds, HD in parallel detect */
+       return phy_write(phydev, PHY_SPM, 0x0180 | phydev->mdio.addr);
+}
+
+/* Get PHY status from common registers */
+
+static int upd60620_read_status(struct phy_device *phydev)
+{
+       int phy_state;
+
+       /* Read negotiated state */
+       phy_state = phy_read(phydev, MII_BMSR);
+       if (phy_state < 0)
+               return phy_state;
+
+       phydev->link = 0;
+       phydev->lp_advertising = 0;
+       phydev->pause = 0;
+       phydev->asym_pause = 0;
+
+       if (phy_state & (BMSR_ANEGCOMPLETE | BMSR_LSTATUS)) {
+               phy_state = phy_read(phydev, PHY_PHYSCR);
+               if (phy_state < 0)
+                       return phy_state;
+
+               if (phy_state & (PHY_PHYSCR_10MB | PHY_PHYSCR_100MB)) {
+                       phydev->link = 1;
+                       phydev->speed = SPEED_10;
+                       phydev->duplex = DUPLEX_HALF;
+
+                       if (phy_state & PHY_PHYSCR_100MB)
+                               phydev->speed = SPEED_100;
+                       if (phy_state & PHY_PHYSCR_DUPLEX)
+                               phydev->duplex = DUPLEX_FULL;
+
+                       phy_state = phy_read(phydev, MII_LPA);
+                       if (phy_state < 0)
+                               return phy_state;
+
+                       phydev->lp_advertising
+                               = mii_lpa_to_ethtool_lpa_t(phy_state);
+
+                       if (phydev->duplex == DUPLEX_FULL) {
+                               if (phy_state & LPA_PAUSE_CAP)
+                                       phydev->pause = 1;
+                               if (phy_state & LPA_PAUSE_ASYM)
+                                       phydev->asym_pause = 1;
+                       }
+               }
+       }
+       return 0;
+}
+
+MODULE_DESCRIPTION("Renesas uPD60620 PHY driver");
+MODULE_AUTHOR("Bernd Edlinger <bernd.edlinger@hotmail.de>");
+MODULE_LICENSE("GPL");
+
+static struct phy_driver upd60620_driver[1] = { {
+       .phy_id         = UPD60620_PHY_ID,
+       .phy_id_mask    = 0xfffffffe,
+       .name           = "Renesas uPD60620",
+       .features       = PHY_BASIC_FEATURES,
+       .flags          = 0,
+       .config_init    = upd60620_config_init,
+       .config_aneg    = genphy_config_aneg,
+       .read_status    = upd60620_read_status,
+} };
+
+module_phy_driver(upd60620_driver);
+
+static struct mdio_device_id __maybe_unused upd60620_tbl[] = {
+       { UPD60620_PHY_ID, 0xfffffffe },
+       { }
+};
+
+MODULE_DEVICE_TABLE(mdio, upd60620_tbl);
index c3f77e3..e365866 100644 (file)
@@ -1339,7 +1339,17 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64)
 
 static int ppp_dev_init(struct net_device *dev)
 {
+       struct ppp *ppp;
+
        netdev_lockdep_set_classes(dev);
+
+       ppp = netdev_priv(dev);
+       /* Let the netdevice take a reference on the ppp file. This ensures
+        * that ppp_destroy_interface() won't run before the device gets
+        * unregistered.
+        */
+       atomic_inc(&ppp->file.refcnt);
+
        return 0;
 }
 
@@ -1362,6 +1372,15 @@ static void ppp_dev_uninit(struct net_device *dev)
        wake_up_interruptible(&ppp->file.rwait);
 }
 
+static void ppp_dev_priv_destructor(struct net_device *dev)
+{
+       struct ppp *ppp;
+
+       ppp = netdev_priv(dev);
+       if (atomic_dec_and_test(&ppp->file.refcnt))
+               ppp_destroy_interface(ppp);
+}
+
 static const struct net_device_ops ppp_netdev_ops = {
        .ndo_init        = ppp_dev_init,
        .ndo_uninit      = ppp_dev_uninit,
@@ -1387,6 +1406,7 @@ static void ppp_setup(struct net_device *dev)
        dev->tx_queue_len = 3;
        dev->type = ARPHRD_PPP;
        dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+       dev->priv_destructor = ppp_dev_priv_destructor;
        netif_keep_dst(dev);
 }
 
index 29c7e2e..52ea80b 100644 (file)
@@ -560,6 +560,7 @@ static const struct driver_info wwan_info = {
 #define NVIDIA_VENDOR_ID       0x0955
 #define HP_VENDOR_ID           0x03f0
 #define MICROSOFT_VENDOR_ID    0x045e
+#define UBLOX_VENDOR_ID                0x1546
 
 static const struct usb_device_id      products[] = {
 /* BLACKLIST !!
@@ -869,6 +870,18 @@ static const struct usb_device_id  products[] = {
                                      USB_CDC_PROTO_NONE),
        .driver_info = (unsigned long)&zte_cdc_info,
 }, {
+       /* U-blox TOBY-L2 */
+       USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM,
+                                     USB_CDC_SUBCLASS_ETHERNET,
+                                     USB_CDC_PROTO_NONE),
+       .driver_info = (unsigned long)&wwan_info,
+}, {
+       /* U-blox SARA-U2 */
+       USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM,
+                                     USB_CDC_SUBCLASS_ETHERNET,
+                                     USB_CDC_PROTO_NONE),
+       .driver_info = (unsigned long)&wwan_info,
+}, {
        USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET,
                        USB_CDC_PROTO_NONE),
        .driver_info = (unsigned long) &cdc_info,
index bb2aad0..5a14cc7 100644 (file)
@@ -2136,7 +2136,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
        struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
        if (a == &dev_attr_uuid.attr) {
-               if (uuid_is_null(&ns->uuid) ||
+               if (uuid_is_null(&ns->uuid) &&
                    !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
                        return 0;
        }
index cb73bc8..3f5a04c 100644 (file)
@@ -94,7 +94,7 @@ struct nvme_dev {
        struct mutex shutdown_lock;
        bool subsystem;
        void __iomem *cmb;
-       dma_addr_t cmb_dma_addr;
+       pci_bus_addr_t cmb_bus_addr;
        u64 cmb_size;
        u32 cmbsz;
        u32 cmbloc;
@@ -1226,7 +1226,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
        if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
                unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
                                                      dev->ctrl.page_size);
-               nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
+               nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
                nvmeq->sq_cmds_io = dev->cmb + offset;
        } else {
                nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
@@ -1527,7 +1527,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        resource_size_t bar_size;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        void __iomem *cmb;
-       dma_addr_t dma_addr;
+       int bar;
 
        dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
        if (!(NVME_CMB_SZ(dev->cmbsz)))
@@ -1540,7 +1540,8 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
        size = szu * NVME_CMB_SZ(dev->cmbsz);
        offset = szu * NVME_CMB_OFST(dev->cmbloc);
-       bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc));
+       bar = NVME_CMB_BIR(dev->cmbloc);
+       bar_size = pci_resource_len(pdev, bar);
 
        if (offset > bar_size)
                return NULL;
@@ -1553,12 +1554,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
        if (size > bar_size - offset)
                size = bar_size - offset;
 
-       dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset;
-       cmb = ioremap_wc(dma_addr, size);
+       cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
        if (!cmb)
                return NULL;
 
-       dev->cmb_dma_addr = dma_addr;
+       dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
        dev->cmb_size = size;
        return cmb;
 }
index 785fb42..2799a6b 100644 (file)
@@ -3767,7 +3767,7 @@ static int ibmvscsis_write_pending(struct se_cmd *se_cmd)
         */
        if ((vscsi->flags & (CLIENT_FAILED | RESPONSE_Q_DOWN))) {
                pr_err("write_pending failed since: %d\n", vscsi->flags);
-               return 0;
+               return -EIO;
        }
 
        rc = srp_transfer_data(cmd, &vio_iu(iue)->srp.cmd, ibmvscsis_rdma,
index bd4605a..c62e8d1 100644 (file)
@@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup);
 /**
  * iscsi_session_teardown - destroy session, host, and cls_session
  * @cls_session: iscsi session
- *
- * The driver must have called iscsi_remove_session before
- * calling this.
  */
 void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 {
@@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
 
        iscsi_pool_free(&session->cmdpool);
 
+       iscsi_remove_session(cls_session);
+
        kfree(session->password);
        kfree(session->password_in);
        kfree(session->username);
@@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session)
        kfree(session->portal_type);
        kfree(session->discovery_parent_type);
 
-       iscsi_destroy_session(cls_session);
+       iscsi_free_session(cls_session);
+
        iscsi_host_dec_session_cnt(shost);
        module_put(owner);
 }
index e7818af..15590a0 100644 (file)
@@ -956,6 +956,9 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
        if (*bflags & BLIST_NO_DIF)
                sdev->no_dif = 1;
 
+       if (*bflags & BLIST_UNMAP_LIMIT_WS)
+               sdev->unmap_limit_for_ws = 1;
+
        sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
 
        if (*bflags & BLIST_TRY_VPD_PAGES)
index 0190aef..7404d26 100644 (file)
@@ -2211,22 +2211,6 @@ void iscsi_free_session(struct iscsi_cls_session *session)
 EXPORT_SYMBOL_GPL(iscsi_free_session);
 
 /**
- * iscsi_destroy_session - destroy iscsi session
- * @session: iscsi_session
- *
- * Can be called by a LLD or iscsi_transport. There must not be
- * any running connections.
- */
-int iscsi_destroy_session(struct iscsi_cls_session *session)
-{
-       iscsi_remove_session(session);
-       ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n");
-       iscsi_free_session(session);
-       return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_destroy_session);
-
-/**
  * iscsi_create_conn - create iscsi class connection
  * @session: iscsi cls session
  * @dd_size: private driver data size
index fb9f8b5..d175c5c 100644 (file)
@@ -715,13 +715,21 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
                break;
 
        case SD_LBP_WS16:
-               max_blocks = min_not_zero(sdkp->max_ws_blocks,
-                                         (u32)SD_MAX_WS16_BLOCKS);
+               if (sdkp->device->unmap_limit_for_ws)
+                       max_blocks = sdkp->max_unmap_blocks;
+               else
+                       max_blocks = sdkp->max_ws_blocks;
+
+               max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS);
                break;
 
        case SD_LBP_WS10:
-               max_blocks = min_not_zero(sdkp->max_ws_blocks,
-                                         (u32)SD_MAX_WS10_BLOCKS);
+               if (sdkp->device->unmap_limit_for_ws)
+                       max_blocks = sdkp->max_unmap_blocks;
+               else
+                       max_blocks = sdkp->max_ws_blocks;
+
+               max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS);
                break;
 
        case SD_LBP_ZERO:
@@ -3099,8 +3107,6 @@ static int sd_revalidate_disk(struct gendisk *disk)
                sd_read_security(sdkp, buffer);
        }
 
-       sdkp->first_scan = 0;
-
        /*
         * We now have all cache related info, determine how we deal
         * with flush requests.
@@ -3115,7 +3121,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
        q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max);
 
        /*
-        * Use the device's preferred I/O size for reads and writes
+        * Determine the device's preferred I/O size for reads and writes
         * unless the reported value is unreasonably small, large, or
         * garbage.
         */
@@ -3129,8 +3135,19 @@ static int sd_revalidate_disk(struct gendisk *disk)
                rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
                                      (sector_t)BLK_DEF_MAX_SECTORS);
 
-       /* Combine with controller limits */
-       q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q));
+       /* Do not exceed controller limit */
+       rw_max = min(rw_max, queue_max_hw_sectors(q));
+
+       /*
+        * Only update max_sectors if previously unset or if the current value
+        * exceeds the capabilities of the hardware.
+        */
+       if (sdkp->first_scan ||
+           q->limits.max_sectors > q->limits.max_dev_sectors ||
+           q->limits.max_sectors > q->limits.max_hw_sectors)
+               q->limits.max_sectors = rw_max;
+
+       sdkp->first_scan = 0;
 
        set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity));
        sd_config_write_same(sdkp);
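
The sd_config_discard() change picks the unmap limit or the write-same limit and then clamps it with min_not_zero(), which treats zero as "no limit". A userspace equivalent of that clamping, with illustrative values, is:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's min_not_zero(): zero means "no limit". */
static uint32_t min_not_zero_u32(uint32_t a, uint32_t b)
{
	if (a == 0)
		return b;
	if (b == 0)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	uint32_t max_unmap_blocks = 0;		/* device reported no UNMAP limit */
	uint32_t max_ws_blocks = 0xfffff;	/* illustrative WRITE SAME limit */
	uint32_t hard_cap = 0x7fffff;		/* illustrative command ceiling */
	int unmap_limit_for_ws = 1;

	/* Prefer the unmap limit when asked to, else the write-same limit,
	 * then clamp against the hard cap, mirroring the hunk above.
	 */
	uint32_t max_blocks = unmap_limit_for_ws ? max_unmap_blocks
						 : max_ws_blocks;

	max_blocks = min_not_zero_u32(max_blocks, hard_cap);
	printf("max_blocks=%u\n", (unsigned int)max_blocks);
	return 0;
}
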
index 0e79eeb..419a7a9 100644 (file)
@@ -1144,5 +1144,5 @@ static void __exit nhi_unload(void)
        tb_domain_exit();
 }
 
-module_init(nhi_init);
+fs_initcall(nhi_init);
 module_exit(nhi_unload);
index f2d06f6..1380275 100644 (file)
@@ -1487,6 +1487,9 @@ int tb_register_property_dir(const char *key, struct tb_property_dir *dir)
 {
        int ret;
 
+       if (WARN_ON(!xdomain_property_dir))
+               return -EAGAIN;
+
        if (!key || strlen(key) > 8)
                return -EINVAL;
 
index 58585ec..68677d9 100644 (file)
@@ -436,8 +436,8 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
        struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
        struct vhost_virtqueue *vq = &nvq->vq;
 
-       return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV
-               == nvq->done_idx;
+       return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
+              min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
 }
 
 /* Expects to be always run from workqueue - which acts as
@@ -480,11 +480,6 @@ static void handle_tx(struct vhost_net *net)
                if (zcopy)
                        vhost_zerocopy_signal_used(net, vq);
 
-               /* If more outstanding DMAs, queue the work.
-                * Handle upend_idx wrap around
-                */
-               if (unlikely(vhost_exceeds_maxpend(net)))
-                       break;
 
                head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
                                                ARRAY_SIZE(vq->iov),
@@ -519,8 +514,7 @@ static void handle_tx(struct vhost_net *net)
                len = msg_data_left(&msg);
 
                zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
-                                  && (nvq->upend_idx + 1) % UIO_MAXIOV !=
-                                     nvq->done_idx
+                                  && !vhost_exceeds_maxpend(net)
                                   && vhost_net_tx_select_zcopy(net);
 
                /* use msg_control to pass vhost zerocopy ubuf info to skb */
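
The reworked vhost_exceeds_maxpend() counts outstanding zerocopy buffers in a circular index space and compares the count against a cap of min(VHOST_MAX_PEND, vq->num / 4). A self-contained version of the arithmetic, with invented sizes, is below.

#include <stdio.h>

#define RING_SIZE 64	/* stands in for UIO_MAXIOV */

/* Entries outstanding between done_idx (tail) and upend_idx (head) when
 * both indices wrap modulo RING_SIZE.
 */
static unsigned int outstanding(unsigned int upend_idx, unsigned int done_idx)
{
	return (upend_idx + RING_SIZE - done_idx) % RING_SIZE;
}

int main(void)
{
	unsigned int vq_num = 32, max_pend = 8;
	/* Cap is the smaller of a fixed limit and a quarter of the vq size. */
	unsigned int cap = max_pend < (vq_num >> 2) ? max_pend : (vq_num >> 2);
	unsigned int pend = outstanding(5, 60);	/* head wrapped past the tail */

	printf("pend=%u cap=%u exceeds=%d\n", pend, cap, pend > cap);
	return 0;
}
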
index 899ddae..8fc6903 100644 (file)
@@ -722,7 +722,7 @@ struct btrfs_delayed_root;
  * Indicate that a whole-filesystem exclusive operation is running
  * (device replace, resize, device add/delete, balance)
  */
-#define BTRFS_FS_EXCL_OP                       14
+#define BTRFS_FS_EXCL_OP                       16
 
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
index 12ab19a..970190c 100644 (file)
@@ -2801,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
                }
        }
 
-       bio = btrfs_bio_alloc(bdev, sector << 9);
+       bio = btrfs_bio_alloc(bdev, (u64)sector << 9);
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;
index 84edfc6..f23c820 100644 (file)
@@ -734,12 +734,13 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                        inode = req->r_inode;
                        ihold(inode);
                } else {
-                       /* req->r_dentry is non-null for LSSNAP request.
-                        * fall-thru */
-                       WARN_ON_ONCE(!req->r_dentry);
+                       /* req->r_dentry is non-null for LSSNAP request */
+                       rcu_read_lock();
+                       inode = get_nonsnap_parent(req->r_dentry);
+                       rcu_read_unlock();
+                       dout("__choose_mds using snapdir's parent %p\n", inode);
                }
-       }
-       if (!inode && req->r_dentry) {
+       } else if (req->r_dentry) {
                /* ignore race with rename; old or new d_parent is okay */
                struct dentry *parent;
                struct inode *dir;
index 1ffc8b4..7fc0b85 100644 (file)
@@ -374,12 +374,10 @@ static int build_snap_context(struct ceph_snap_realm *realm,
             realm->ino, realm, snapc, snapc->seq,
             (unsigned int) snapc->num_snaps);
 
-       if (realm->cached_context) {
-               ceph_put_snap_context(realm->cached_context);
-               /* queue realm for cap_snap creation */
-               list_add_tail(&realm->dirty_item, dirty_realms);
-       }
+       ceph_put_snap_context(realm->cached_context);
        realm->cached_context = snapc;
+       /* queue realm for cap_snap creation */
+       list_add_tail(&realm->dirty_item, dirty_realms);
        return 0;
 
 fail:
index 54059b1..3b601f1 100644 (file)
@@ -468,7 +468,9 @@ static inline int may_write_real(struct file *file)
 
        /* File refers to upper, writable layer? */
        upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
-       if (upperdentry && file_inode(file) == d_inode(upperdentry))
+       if (upperdentry &&
+           (file_inode(file) == d_inode(upperdentry) ||
+            file_inode(file) == d_inode(dentry)))
                return 0;
 
        /* Lower layer: can't write to real file, sorry... */
index efebe6c..22880ef 100644 (file)
@@ -218,7 +218,6 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
 static void pnfs_init_server(struct nfs_server *server)
 {
        rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
-       rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
 }
 
 #else
@@ -888,6 +887,7 @@ struct nfs_server *nfs_alloc_server(void)
        ida_init(&server->openowner_id);
        ida_init(&server->lockowner_id);
        pnfs_init_server(server);
+       rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC");
 
        return server;
 }
index 44c638b..508126e 100644 (file)
@@ -745,7 +745,8 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 
        dprintk("--> %s\n", __func__);
-       nfs4_fl_put_deviceid(fl->dsaddr);
+       if (fl->dsaddr != NULL)
+               nfs4_fl_put_deviceid(fl->dsaddr);
        /* This assumes a single RW lseg */
        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_filelayout *flo;
index dd5d27d..30426c1 100644 (file)
@@ -274,7 +274,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
        ssize_t ret;
 
        ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
-       if (ret <= 0)
+       if (ret < 0)
                return ERR_PTR(ret);
 
        rkey = request_key(&key_type_id_resolver, desc, "");
index 6c61e2b..f90090e 100644 (file)
@@ -8399,8 +8399,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
                lo = NFS_I(inode)->layout;
                /* If the open stateid was bad, then recover it. */
                if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
-                   nfs4_stateid_match_other(&lgp->args.stateid,
-                                       &lgp->args.ctx->state->stateid)) {
+                   !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
                        spin_unlock(&inode->i_lock);
                        exception->state = lgp->args.ctx->state;
                        exception->stateid = &lgp->args.stateid;
index 37c8af0..14ed979 100644 (file)
@@ -1842,8 +1842,8 @@ static void encode_create_session(struct xdr_stream *xdr,
         * Assumes OPEN is the biggest non-idempotent compound.
         * 2 is the verifier.
         */
-       max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
-                             RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
+       max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2)
+                               * XDR_UNIT + RPC_MAX_AUTH_SIZE;
 
        encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
        p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
index aad97b3..c441f93 100644 (file)
@@ -561,10 +561,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
                c->tmpfile = true;
                err = ovl_copy_up_locked(c);
        } else {
-               err = -EIO;
-               if (lock_rename(c->workdir, c->destdir) != NULL) {
-                       pr_err("overlayfs: failed to lock workdir+upperdir\n");
-               } else {
+               err = ovl_lock_rename_workdir(c->workdir, c->destdir);
+               if (!err) {
                        err = ovl_copy_up_locked(c);
                        unlock_rename(c->workdir, c->destdir);
                }
index 3309b19..cc961a3 100644 (file)
@@ -216,26 +216,6 @@ out_unlock:
        return err;
 }
 
-static int ovl_lock_rename_workdir(struct dentry *workdir,
-                                  struct dentry *upperdir)
-{
-       /* Workdir should not be the same as upperdir */
-       if (workdir == upperdir)
-               goto err;
-
-       /* Workdir should not be subdir of upperdir and vice versa */
-       if (lock_rename(workdir, upperdir) != NULL)
-               goto err_unlock;
-
-       return 0;
-
-err_unlock:
-       unlock_rename(workdir, upperdir);
-err:
-       pr_err("overlayfs: failed to lock workdir+upperdir\n");
-       return -EIO;
-}
-
 static struct dentry *ovl_clear_empty(struct dentry *dentry,
                                      struct list_head *list)
 {
index c3addd1..654bea1 100644 (file)
@@ -506,6 +506,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry,
 
        index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
        if (IS_ERR(index)) {
+               err = PTR_ERR(index);
                pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n"
                                    "overlayfs: mount with '-o index=off' to disable inodes index.\n",
                                    d_inode(origin)->i_ino, name.len, name.name,
index d4e8c1a..c706a6f 100644 (file)
@@ -235,6 +235,7 @@ bool ovl_inuse_trylock(struct dentry *dentry);
 void ovl_inuse_unlock(struct dentry *dentry);
 int ovl_nlink_start(struct dentry *dentry, bool *locked);
 void ovl_nlink_end(struct dentry *dentry, bool locked);
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
 
 static inline bool ovl_is_impuredir(struct dentry *dentry)
 {
index 878a750..25d9b5a 100644 (file)
@@ -37,6 +37,9 @@ struct ovl_fs {
        bool noxattr;
        /* sb common to all layers */
        struct super_block *same_sb;
+       /* Did we take the inuse lock? */
+       bool upperdir_locked;
+       bool workdir_locked;
 };
 
 /* private information held for every overlayfs dentry */
index 62e9b22..0f85ee9 100644 (file)
@@ -988,6 +988,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
                         struct path *lowerstack, unsigned int numlower)
 {
        int err;
+       struct dentry *index = NULL;
        struct inode *dir = dentry->d_inode;
        struct path path = { .mnt = mnt, .dentry = dentry };
        LIST_HEAD(list);
@@ -1007,8 +1008,6 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
 
        inode_lock_nested(dir, I_MUTEX_PARENT);
        list_for_each_entry(p, &list, l_node) {
-               struct dentry *index;
-
                if (p->name[0] == '.') {
                        if (p->len == 1)
                                continue;
@@ -1018,6 +1017,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
                index = lookup_one_len(p->name, dentry, p->len);
                if (IS_ERR(index)) {
                        err = PTR_ERR(index);
+                       index = NULL;
                        break;
                }
                err = ovl_verify_index(index, lowerstack, numlower);
@@ -1029,7 +1029,9 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
                                break;
                }
                dput(index);
+               index = NULL;
        }
+       dput(index);
        inode_unlock(dir);
 out:
        ovl_cache_free(&list);
index fd5ea4f..092d150 100644 (file)
@@ -211,9 +211,10 @@ static void ovl_put_super(struct super_block *sb)
 
        dput(ufs->indexdir);
        dput(ufs->workdir);
-       ovl_inuse_unlock(ufs->workbasedir);
+       if (ufs->workdir_locked)
+               ovl_inuse_unlock(ufs->workbasedir);
        dput(ufs->workbasedir);
-       if (ufs->upper_mnt)
+       if (ufs->upper_mnt && ufs->upperdir_locked)
                ovl_inuse_unlock(ufs->upper_mnt->mnt_root);
        mntput(ufs->upper_mnt);
        for (i = 0; i < ufs->numlower; i++)
@@ -881,9 +882,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                        goto out_put_upperpath;
 
                err = -EBUSY;
-               if (!ovl_inuse_trylock(upperpath.dentry)) {
-                       pr_err("overlayfs: upperdir is in-use by another mount\n");
+               if (ovl_inuse_trylock(upperpath.dentry)) {
+                       ufs->upperdir_locked = true;
+               } else if (ufs->config.index) {
+                       pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
                        goto out_put_upperpath;
+               } else {
+                       pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
                }
 
                err = ovl_mount_dir(ufs->config.workdir, &workpath);
@@ -901,9 +906,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                }
 
                err = -EBUSY;
-               if (!ovl_inuse_trylock(workpath.dentry)) {
-                       pr_err("overlayfs: workdir is in-use by another mount\n");
+               if (ovl_inuse_trylock(workpath.dentry)) {
+                       ufs->workdir_locked = true;
+               } else if (ufs->config.index) {
+                       pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
                        goto out_put_workpath;
+               } else {
+                       pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
                }
 
                ufs->workbasedir = workpath.dentry;
@@ -1156,11 +1165,13 @@ out_put_lowerpath:
 out_free_lowertmp:
        kfree(lowertmp);
 out_unlock_workdentry:
-       ovl_inuse_unlock(workpath.dentry);
+       if (ufs->workdir_locked)
+               ovl_inuse_unlock(workpath.dentry);
 out_put_workpath:
        path_put(&workpath);
 out_unlock_upperdentry:
-       ovl_inuse_unlock(upperpath.dentry);
+       if (ufs->upperdir_locked)
+               ovl_inuse_unlock(upperpath.dentry);
 out_put_upperpath:
        path_put(&upperpath);
 out_free_config:
index 1177945..b9b239f 100644 (file)
@@ -430,7 +430,7 @@ void ovl_inuse_unlock(struct dentry *dentry)
        }
 }
 
-/* Called must hold OVL_I(inode)->oi_lock */
+/* Caller must hold OVL_I(inode)->lock */
 static void ovl_cleanup_index(struct dentry *dentry)
 {
        struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode;
@@ -469,6 +469,9 @@ static void ovl_cleanup_index(struct dentry *dentry)
        err = PTR_ERR(index);
        if (!IS_ERR(index))
                err = ovl_cleanup(dir, index);
+       else
+               index = NULL;
+
        inode_unlock(dir);
        if (err)
                goto fail;
@@ -557,3 +560,22 @@ void ovl_nlink_end(struct dentry *dentry, bool locked)
                mutex_unlock(&OVL_I(d_inode(dentry))->lock);
        }
 }
+
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
+{
+       /* Workdir should not be the same as upperdir */
+       if (workdir == upperdir)
+               goto err;
+
+       /* Workdir should not be subdir of upperdir and vice versa */
+       if (lock_rename(workdir, upperdir) != NULL)
+               goto err_unlock;
+
+       return 0;
+
+err_unlock:
+       unlock_rename(workdir, upperdir);
+err:
+       pr_err("overlayfs: failed to lock workdir+upperdir\n");
+       return -EIO;
+}
index bc6c6e1..e9db7fc 100644 (file)
@@ -2122,11 +2122,31 @@ xfs_swap_extents(
                ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
                tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
                tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+       }
+
+       /* Swap the cow forks. */
+       if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+               xfs_extnum_t    extnum;
+
+               ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+               ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS);
+
+               extnum = ip->i_cnextents;
+               ip->i_cnextents = tip->i_cnextents;
+               tip->i_cnextents = extnum;
+
                cowfp = ip->i_cowfp;
                ip->i_cowfp = tip->i_cowfp;
                tip->i_cowfp = cowfp;
-               xfs_inode_set_cowblocks_tag(ip);
-               xfs_inode_set_cowblocks_tag(tip);
+
+               if (ip->i_cowfp && ip->i_cnextents)
+                       xfs_inode_set_cowblocks_tag(ip);
+               else
+                       xfs_inode_clear_cowblocks_tag(ip);
+               if (tip->i_cowfp && tip->i_cnextents)
+                       xfs_inode_set_cowblocks_tag(tip);
+               else
+                       xfs_inode_clear_cowblocks_tag(tip);
        }
 
        xfs_trans_log_inode(tp, ip,  src_log_flags);
index 3246815..37e603b 100644 (file)
@@ -736,7 +736,13 @@ xfs_reflink_end_cow(
        /* If there is a hole at end_fsb - 1 go to the previous extent */
        if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
            got.br_startoff > end_fsb) {
-               ASSERT(idx > 0);
+               /*
+                * In case of racing, overlapping AIO writes, no COW extents
+                * might be left by the time I/O completes for the loser of
+                * the race.  In that case we are done.
+                */
+               if (idx <= 0)
+                       goto out_cancel;
                xfs_iext_get_extent(ifp, --idx, &got);
        }
 
@@ -809,6 +815,7 @@ next_extent:
 
 out_defer:
        xfs_defer_cancel(&dfops);
+out_cancel:
        xfs_trans_cancel(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out:
index a67daea..4373125 100644 (file)
@@ -56,7 +56,7 @@ struct bpf_map {
        struct work_struct work;
        atomic_t usercnt;
        struct bpf_map *inner_map_meta;
-       u8 name[BPF_OBJ_NAME_LEN];
+       char name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
@@ -189,7 +189,7 @@ struct bpf_prog_aux {
        struct bpf_prog *prog;
        struct user_struct *user;
        u64 load_time; /* ns since boottime */
-       u8 name[BPF_OBJ_NAME_LEN];
+       char name[BPF_OBJ_NAME_LEN];
        union {
                struct work_struct work;
                struct rcu_head rcu;
@@ -407,6 +407,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+       return -EOPNOTSUPP;
+}
+
 static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
                                                       u32 key)
 {
index b8d200f..f00ef75 100644 (file)
@@ -115,6 +115,21 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+#define BPF_VERIFIER_TMP_LOG_SIZE      1024
+
+struct bpf_verifer_log {
+       u32 level;
+       char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
+       char __user *ubuf;
+       u32 len_used;
+       u32 len_total;
+};
+
+static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+{
+       return log->len_used >= log->len_total - 1;
+}
+
 struct bpf_verifier_env;
 struct bpf_ext_analyzer_ops {
        int (*insn_hook)(struct bpf_verifier_env *env,
@@ -139,6 +154,8 @@ struct bpf_verifier_env {
        bool allow_ptr_leaks;
        bool seen_direct_write;
        struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+
+       struct bpf_verifer_log log;
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
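bpf_verifier_log_full() deliberately leaves one byte of headroom so the user-visible log always stays NUL-terminated. A hedged sketch of how an append routine might use the structure above; the helper name and error handling are invented, and the real formatting code lives in kernel/bpf/verifier.c rather than this hunk:

/* Hypothetical append helper: format into the fixed-size kernel buffer,
 * then copy the result out to the user buffer tracked by len_used/len_total.
 */
static void verifier_vlog(struct bpf_verifer_log *log, const char *fmt,
                          va_list args)
{
        unsigned int n;

        if (!log->level || bpf_verifier_log_full(log))
                return;

        n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
        if (n > log->len_total - log->len_used - 1) {
                n = log->len_total - log->len_used - 1;
                log->kbuf[n] = '\0';            /* keep the copy terminated */
        }

        if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
                log->len_used += n;
}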
index 3cd18ac..02639eb 100644 (file)
@@ -49,6 +49,7 @@ struct br_ip_list {
 #define BR_MULTICAST_TO_UNICAST        BIT(12)
 #define BR_VLAN_TUNNEL         BIT(13)
 #define BR_BCAST_FLOOD         BIT(14)
+#define BR_NEIGH_SUPPRESS      BIT(15)
 
 #define BR_DEFAULT_AGEING_TIME (300 * HZ)
 
@@ -63,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
 bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto);
 bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
+bool br_multicast_router(const struct net_device *dev);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
                                             struct list_head *br_ip_list)
@@ -83,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev)
 {
        return false;
 }
+static inline bool br_multicast_router(const struct net_device *dev)
+{
+       return false;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
index bbcdb0a..a118ee4 100644 (file)
@@ -10,5 +10,5 @@
 
 #include <uapi/linux/if_phonet.h>
 
-extern struct header_ops phonet_header_ops;
+extern const struct header_ops phonet_header_ops;
 #endif
index f3f2d07..9a43763 100644 (file)
@@ -316,7 +316,7 @@ struct mmc_host {
 #define MMC_CAP_UHS_SDR50      (1 << 18)       /* Host supports UHS SDR50 mode */
 #define MMC_CAP_UHS_SDR104     (1 << 19)       /* Host supports UHS SDR104 mode */
 #define MMC_CAP_UHS_DDR50      (1 << 20)       /* Host supports UHS DDR50 mode */
-#define MMC_CAP_NO_BOUNCE_BUFF (1 << 21)       /* Disable bounce buffers on host */
+/* (1 << 21) is free for reuse */
 #define MMC_CAP_DRIVER_TYPE_A  (1 << 23)       /* Host supports Driver Type A */
 #define MMC_CAP_DRIVER_TYPE_C  (1 << 24)       /* Host supports Driver Type C */
 #define MMC_CAP_DRIVER_TYPE_D  (1 << 25)       /* Host supports Driver Type D */
index 2c2a551..528b24c 100644 (file)
@@ -108,9 +108,10 @@ struct ebt_table {
 
 #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \
                     ~(__alignof__(struct _xt_align)-1))
-extern struct ebt_table *ebt_register_table(struct net *net,
-                                           const struct ebt_table *table,
-                                           const struct nf_hook_ops *);
+extern int ebt_register_table(struct net *net,
+                             const struct ebt_table *table,
+                             const struct nf_hook_ops *ops,
+                             struct ebt_table **res);
 extern void ebt_unregister_table(struct net *net, struct ebt_table *table,
                                 const struct nf_hook_ops *);
 extern unsigned int ebt_do_table(struct sk_buff *skb,
index a36abe2..27e249e 100644 (file)
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
+void lockup_detector_soft_poweroff(void);
+void lockup_detector_cleanup(void);
+bool is_hardlockup(void);
+
+extern int watchdog_user_enabled;
+extern int nmi_watchdog_user_enabled;
+extern int soft_watchdog_user_enabled;
+extern int watchdog_thresh;
+extern unsigned long watchdog_enabled;
+
+extern struct cpumask watchdog_cpumask;
+extern unsigned long *watchdog_cpumask_bits;
+#ifdef CONFIG_SMP
+extern int sysctl_softlockup_all_cpu_backtrace;
+extern int sysctl_hardlockup_all_cpu_backtrace;
 #else
-static inline void lockup_detector_init(void)
-{
-}
-#endif
+#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
+#endif /* !CONFIG_SMP */
+
+#else /* CONFIG_LOCKUP_DETECTOR */
+static inline void lockup_detector_init(void) { }
+static inline void lockup_detector_soft_poweroff(void) { }
+static inline void lockup_detector_cleanup(void) { }
+#endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 extern void touch_softlockup_watchdog_sched(void);
@@ -24,29 +44,17 @@ extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
-extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
 #else
-static inline void touch_softlockup_watchdog_sched(void)
-{
-}
-static inline void touch_softlockup_watchdog(void)
-{
-}
-static inline void touch_softlockup_watchdog_sync(void)
-{
-}
-static inline void touch_all_softlockup_watchdogs(void)
-{
-}
+static inline void touch_softlockup_watchdog_sched(void) { }
+static inline void touch_softlockup_watchdog(void) { }
+static inline void touch_softlockup_watchdog_sync(void) { }
+static inline void touch_all_softlockup_watchdogs(void) { }
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 void reset_hung_task_detector(void);
 #else
-static inline void reset_hung_task_detector(void)
-{
-}
+static inline void reset_hung_task_detector(void) { }
 #endif
 
 /*
@@ -54,12 +62,12 @@ static inline void reset_hung_task_detector(void)
  * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
  * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
  *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
+ * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and
+ * 'soft_watchdog_user_enabled' are variables that are only used as an
+ * 'interface' between the parameters in /proc/sys/kernel and the internal
+ * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
+ * handled differently because its value is not boolean, and the lockup
+ * detectors are 'suspended' while 'watchdog_thresh' is equal to zero.
  */
 #define NMI_WATCHDOG_ENABLED_BIT   0
 #define SOFT_WATCHDOG_ENABLED_BIT  1
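As the comment above says, both detectors share the single 'watchdog_enabled' word, one bit each, while the *_user_enabled variables only mirror the sysctl knobs. A tiny illustrative sketch of testing those bits (the helper names are invented and do not exist in the kernel):

/* Illustrative only. */
static inline bool nmi_watchdog_bit_set(unsigned long enabled)
{
        return enabled & (1UL << NMI_WATCHDOG_ENABLED_BIT);
}

static inline bool soft_watchdog_bit_set(unsigned long enabled)
{
        return enabled & (1UL << SOFT_WATCHDOG_ENABLED_BIT);
}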
@@ -73,17 +81,41 @@ extern unsigned int hardlockup_panic;
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+# define NMI_WATCHDOG_SYSCTL_PERM      0644
+#else
+# define NMI_WATCHDOG_SYSCTL_PERM      0444
+#endif
+
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void arch_touch_nmi_watchdog(void);
+extern void hardlockup_detector_perf_stop(void);
+extern void hardlockup_detector_perf_restart(void);
+extern void hardlockup_detector_perf_disable(void);
+extern void hardlockup_detector_perf_enable(void);
+extern void hardlockup_detector_perf_cleanup(void);
+extern int hardlockup_detector_perf_init(void);
 #else
-#if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline void hardlockup_detector_perf_stop(void) { }
+static inline void hardlockup_detector_perf_restart(void) { }
+static inline void hardlockup_detector_perf_disable(void) { }
+static inline void hardlockup_detector_perf_enable(void) { }
+static inline void hardlockup_detector_perf_cleanup(void) { }
+# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
 static inline void arch_touch_nmi_watchdog(void) {}
+# else
+static inline int hardlockup_detector_perf_init(void) { return 0; }
+# endif
 #endif
-#endif
+
+void watchdog_nmi_stop(void);
+void watchdog_nmi_start(void);
+int watchdog_nmi_probe(void);
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
- * 
+ *
  * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
@@ -153,22 +185,6 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 #endif
 
-#ifdef CONFIG_LOCKUP_DETECTOR
-extern int nmi_watchdog_enabled;
-extern int watchdog_user_enabled;
-extern int watchdog_thresh;
-extern unsigned long watchdog_enabled;
-extern struct cpumask watchdog_cpumask;
-extern unsigned long *watchdog_cpumask_bits;
-extern int __read_mostly watchdog_suspended;
-#ifdef CONFIG_SMP
-extern int sysctl_softlockup_all_cpu_backtrace;
-extern int sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
-#endif
-
 #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
     defined(CONFIG_HARDLOCKUP_DETECTOR)
 void watchdog_update_hrtimer_threshold(u64 period);
@@ -176,7 +192,6 @@ void watchdog_update_hrtimer_threshold(u64 period);
 static inline void watchdog_update_hrtimer_threshold(u64 period) { }
 #endif
 
-extern bool is_hardlockup(void);
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
@@ -188,18 +203,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
                                void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
                                 void __user *, size_t *, loff_t *);
-extern int lockup_detector_suspend(void);
-extern void lockup_detector_resume(void);
-#else
-static inline int lockup_detector_suspend(void)
-{
-       return 0;
-}
-
-static inline void lockup_detector_resume(void)
-{
-}
-#endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include <asm/nmi.h>
index 9c98aaa..7247249 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/jump_label.h>
 
 bool __do_once_start(bool *done, unsigned long *flags);
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
                    unsigned long *flags);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
@@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key,
        ({                                                                   \
                bool ___ret = false;                                         \
                static bool ___done = false;                                 \
-               static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \
-               if (static_key_true(&___once_key)) {                         \
+               static DEFINE_STATIC_KEY_TRUE(___once_key);                  \
+               if (static_branch_unlikely(&___once_key)) {                  \
                        unsigned long ___flags;                              \
                        ___ret = __do_once_start(&___done, &___flags);       \
                        if (unlikely(___ret)) {                              \
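With the key now declared via DEFINE_STATIC_KEY_TRUE(), the post-initialization fast path of DO_ONCE() is a patched-out branch rather than a load and test. A hedged usage sketch; the seeding callback and state are invented, and get_random_once() is the canonical in-tree user:

#include <linux/once.h>
#include <linux/random.h>

static u32 example_seed;                        /* invented state */

static void example_seed_init(u32 *seed)
{
        get_random_bytes(seed, sizeof(*seed));
}

static u32 example_hash(u32 key)
{
        /* The callback runs exactly once; later calls skip the dead branch. */
        DO_ONCE(example_seed_init, &example_seed);
        return key ^ example_seed;
}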
index 8e22f24..79b18a2 100644 (file)
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
        struct pt_regs *regs;
        struct perf_sample_data *data;
+       struct perf_event *event;
 };
 
 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+                         u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);
 
@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
        return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+                                       u64 *enabled, u64 *running)
 {
        return -EINVAL;
 }
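perf_event_read_local() now hands back the enabled and running times in the same call; callers that only want the raw counter can pass NULL for the extra pointers, as the bpf arraymap change elsewhere in this merge does. A hedged call-site sketch (the surrounding function and the scaling step are invented):

/* Invented caller; only the perf_event_read_local() signature is from the hunk. */
static int read_scaled_count(struct perf_event *event, u64 *scaled)
{
        u64 value, enabled, running;
        int err;

        err = perf_event_read_local(event, &value, &enabled, &running);
        if (err)
                return err;

        /* estimate the full-speed count when the event was multiplexed */
        *scaled = running ? div64_u64(value * enabled, running) : value;
        return 0;
}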
index 89fa0bb..e755954 100644 (file)
@@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type {
 enum qed_ll2_tx_dest {
        QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */
        QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */
+       QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */
        QED_LL2_TX_DEST_MAX
 };
 
@@ -150,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt,
                                     dma_addr_t first_frag_addr,
                                     bool b_last_fragment, bool b_last_packet);
 
+typedef
+void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle,
+                           u32 opaque_data_0, u32 opaque_data_1);
+
 struct qed_ll2_cbs {
        qed_ll2_complete_rx_packet_cb rx_comp_cb;
        qed_ll2_release_rx_packet_cb rx_release_cb;
        qed_ll2_complete_tx_packet_cb tx_comp_cb;
        qed_ll2_release_tx_packet_cb tx_release_cb;
+       qed_ll2_slowpath_cb slowpath_cb;
        void *cookie;
 };
 
@@ -171,6 +177,7 @@ struct qed_ll2_acquire_data_inputs {
        enum qed_ll2_tx_dest tx_dest;
        enum qed_ll2_error_handle ai_err_packet_too_big;
        enum qed_ll2_error_handle ai_err_no_buf;
+       bool secondary_queue;
        u8 gsi_enable;
 };
 
index 01a9859..03634ec 100644 (file)
@@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
        return __skb_grow(skb, len);
 }
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                       \
                     skb != (struct sk_buff *)(queue);                          \
@@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)                                             \
+               for (skb = skb_rb_first(root); skb != NULL;                     \
+                    skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)                                              \
+               for (; skb != NULL;                                             \
+                    skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)                                    \
+               for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);      \
+                    skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)                              \
                for (tmp = skb->next;                                           \
                     skb != (struct sk_buff *)(queue);                          \
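The rb-tree variants mirror the existing skb_queue_walk() helpers for skbs kept in an rb_root instead of a linked list. A hedged sketch of a read-only walker (the function itself is invented):

/* Invented helper: total payload bytes queued in an skb rb-tree. */
static unsigned int skb_rbtree_total_len(const struct rb_root *root)
{
        struct sk_buff *skb;
        unsigned int len = 0;

        skb_rbtree_walk(skb, root)
                len += skb->len;

        return len;
}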
index 12910cf..c149aa7 100644 (file)
@@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 }
 
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-                                        const struct cpumask *);
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+                                         const struct cpumask *);
 
 #endif
index 06a6765..204c19e 100644 (file)
@@ -101,7 +101,7 @@ struct dst_entry {
        union {
                struct dst_entry        *next;
                struct rtable __rcu     *rt_next;
-               struct rt6_info         *rt6_next;
+               struct rt6_info __rcu   *rt6_next;
                struct dn_route __rcu   *dn_next;
        };
 };
index 9fba2eb..87a0bb8 100644 (file)
@@ -87,6 +87,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 void metadata_dst_free(struct metadata_dst *);
 struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
                                        gfp_t flags);
+void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst);
 struct metadata_dst __percpu *
 metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags);
 
index d060d71..10c9138 100644 (file)
 #define FIB6_TABLE_HASHSZ 1
 #endif
 
+#define RT6_DEBUG 2
+
+#if RT6_DEBUG >= 3
+#define RT6_TRACE(x...) pr_debug(x)
+#else
+#define RT6_TRACE(x...) do { ; } while (0)
+#endif
+
 struct rt6_info;
 
 struct fib6_config {
@@ -60,25 +68,30 @@ struct fib6_config {
 };
 
 struct fib6_node {
-       struct fib6_node        *parent;
-       struct fib6_node        *left;
-       struct fib6_node        *right;
+       struct fib6_node __rcu  *parent;
+       struct fib6_node __rcu  *left;
+       struct fib6_node __rcu  *right;
 #ifdef CONFIG_IPV6_SUBTREES
-       struct fib6_node        *subtree;
+       struct fib6_node __rcu  *subtree;
 #endif
-       struct rt6_info         *leaf;
+       struct rt6_info __rcu   *leaf;
 
        __u16                   fn_bit;         /* bit key */
        __u16                   fn_flags;
        int                     fn_sernum;
-       struct rt6_info         *rr_ptr;
+       struct rt6_info __rcu   *rr_ptr;
        struct rcu_head         rcu;
 };
 
+struct fib6_gc_args {
+       int                     timeout;
+       int                     more;
+};
+
 #ifndef CONFIG_IPV6_SUBTREES
 #define FIB6_SUBTREE(fn)       NULL
 #else
-#define FIB6_SUBTREE(fn)       ((fn)->subtree)
+#define FIB6_SUBTREE(fn)       (rcu_dereference_protected((fn)->subtree, 1))
 #endif
 
 struct mx6_config {
@@ -98,6 +111,22 @@ struct rt6key {
 
 struct fib6_table;
 
+struct rt6_exception_bucket {
+       struct hlist_head       chain;
+       int                     depth;
+};
+
+struct rt6_exception {
+       struct hlist_node       hlist;
+       struct rt6_info         *rt6i;
+       unsigned long           stamp;
+       struct rcu_head         rcu;
+};
+
+#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
+#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
+#define FIB6_MAX_DEPTH 5
+
 struct rt6_info {
        struct dst_entry                dst;
 
@@ -134,14 +163,25 @@ struct rt6_info {
 
        struct inet6_dev                *rt6i_idev;
        struct rt6_info * __percpu      *rt6i_pcpu;
+       struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
        u32                             rt6i_metric;
        u32                             rt6i_pmtu;
        /* more non-fragment space at head required */
        unsigned short                  rt6i_nfheader_len;
        u8                              rt6i_protocol;
+       u8                              exception_bucket_flushed:1,
+                                       unused:7;
 };
 
+#define for_each_fib6_node_rt_rcu(fn)                                  \
+       for (rt = rcu_dereference((fn)->leaf); rt;                      \
+            rt = rcu_dereference(rt->dst.rt6_next))
+
+#define for_each_fib6_walker_rt(w)                                     \
+       for (rt = (w)->leaf; rt;                                        \
+            rt = rcu_dereference_protected(rt->dst.rt6_next, 1))
+
 static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
 {
        return ((struct rt6_info *)dst)->rt6i_idev;
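for_each_fib6_node_rt_rcu() expects a local 'struct rt6_info *rt' and, because leaf and rt6_next are now __rcu pointers, must run inside an RCU read-side critical section; the walker variant is for callers that already hold the table lock. A hedged sketch (the counting function is invented):

/* Invented example: count the routes hanging off a single fib6 node. */
static unsigned int fib6_node_route_count(struct fib6_node *fn)
{
        struct rt6_info *rt;
        unsigned int n = 0;

        rcu_read_lock();
        for_each_fib6_node_rt_rcu(fn)
                n++;
        rcu_read_unlock();

        return n;
}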
@@ -188,6 +228,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
 
        if (fn) {
                *cookie = fn->fn_sernum;
+               /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */
+               smp_rmb();
                status = true;
        }
 
@@ -248,7 +290,6 @@ struct fib6_walker {
        struct fib6_node *root, *node;
        struct rt6_info *leaf;
        enum fib6_walk_state state;
-       bool prune;
        unsigned int skip;
        unsigned int count;
        int (*func)(struct fib6_walker *);
@@ -256,12 +297,15 @@ struct fib6_walker {
 };
 
 struct rt6_statistics {
-       __u32           fib_nodes;
-       __u32           fib_route_nodes;
-       __u32           fib_rt_alloc;           /* permanent routes     */
-       __u32           fib_rt_entries;         /* rt entries in table  */
-       __u32           fib_rt_cache;           /* cache routes         */
-       __u32           fib_discarded_routes;
+       __u32           fib_nodes;              /* all fib6 nodes */
+       __u32           fib_route_nodes;        /* intermediate nodes */
+       __u32           fib_rt_entries;         /* rt entries in fib table */
+       __u32           fib_rt_cache;           /* cached rt entries in exception table */
+       __u32           fib_discarded_routes;   /* total number of routes deleted */
+
+       /* The following stats are not protected by any lock */
+       atomic_t        fib_rt_alloc;           /* total number of routes alloced */
+       atomic_t        fib_rt_uncache;         /* rt entries in uncached list */
 };
 
 #define RTN_TL_ROOT    0x0001
@@ -277,7 +321,7 @@ struct rt6_statistics {
 struct fib6_table {
        struct hlist_node       tb6_hlist;
        u32                     tb6_id;
-       rwlock_t                tb6_lock;
+       spinlock_t              tb6_lock;
        struct fib6_node        tb6_root;
        struct inet_peer_base   tb6_peers;
        unsigned int            flags;
@@ -325,7 +369,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root,
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
-                             const struct in6_addr *saddr, int src_len);
+                             const struct in6_addr *saddr, int src_len,
+                             bool exact_match);
 
 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
                    void *arg);
@@ -358,6 +403,8 @@ void __net_exit fib6_notifier_exit(struct net *net);
 unsigned int fib6_tables_seq_read(struct net *net);
 int fib6_tables_dump(struct net *net, struct notifier_block *nb);
 
+void fib6_update_sernum(struct rt6_info *rt);
+
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 int fib6_rules_init(void);
 void fib6_rules_cleanup(void);
index ee96f40..a0087fb 100644 (file)
@@ -95,6 +95,11 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack);
 int ip6_ins_rt(struct rt6_info *);
 int ip6_del_rt(struct rt6_info *);
 
+void rt6_flush_exceptions(struct rt6_info *rt);
+int rt6_remove_exception_rt(struct rt6_info *rt);
+void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
+                       unsigned long now);
+
 static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
                                      const struct in6_addr *daddr,
                                      unsigned int prefs,
index 6eac5cf..3cda3b5 100644 (file)
@@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl)
 
 void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info);
 
-int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
-                              struct icmp6hdr *thdr, int len);
+void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+                               struct icmp6hdr *thdr, int len);
 
 int ip6_ra_control(struct sock *sk, int sel);
 
index 039cc29..51e1a2a 100644 (file)
@@ -108,8 +108,10 @@ struct phonet_protocol {
        int                     sock_type;
 };
 
-int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp);
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp);
+int phonet_proto_register(unsigned int protocol,
+               const struct phonet_protocol *pp);
+void phonet_proto_unregister(unsigned int protocol,
+               const struct phonet_protocol *pp);
 
 int phonet_sysctl_init(void);
 void phonet_sysctl_exit(void);
index a6b9a8d..4827094 100644 (file)
@@ -60,7 +60,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/cgroup-defs.h>
-
+#include <linux/rbtree.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
 #include <linux/poll.h>
@@ -397,7 +397,10 @@ struct sock {
        int                     sk_wmem_queued;
        refcount_t              sk_wmem_alloc;
        unsigned long           sk_tsq_flags;
-       struct sk_buff          *sk_send_head;
+       union {
+               struct sk_buff  *sk_send_head;
+               struct rb_root  tcp_rtx_queue;
+       };
        struct sk_buff_head     sk_write_queue;
        __s32                   sk_peek_off;
        int                     sk_write_pending;
index d767b79..d756fbe 100644 (file)
@@ -51,6 +51,7 @@ enum switchdev_attr_id {
        SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
        SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
        SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
+       SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
 };
 
 struct switchdev_attr {
index 3b16f35..5a95e58 100644 (file)
@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+enum tcp_queue {
+       TCP_FRAG_IN_WRITE_QUEUE,
+       TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                struct sk_buff *skb, u32 len,
+                unsigned int mss_now, gfp_t gfp);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
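tcp_fragment() callers now have to say which queue the skb sits on, so the helper can fix up the right structure after the split. A hedged call-site sketch; the wrapper is invented, and only the signature and enum come from the hunk above:

/* Invented wrapper around the new signature. */
static int split_for_retransmit(struct sock *sk, struct sk_buff *skb,
                                unsigned int mss_now)
{
        if (skb->len <= mss_now)
                return 0;

        /* this skb lives on the retransmit rb-tree, not the write queue */
        return tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, mss_now,
                            mss_now, GFP_ATOMIC);
}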
@@ -1606,19 +1612,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
        skb->_skb_refdst = _save;               \
 }
 
-/* write queue abstraction */
-static inline void tcp_write_queue_purge(struct sock *sk)
-{
-       struct sk_buff *skb;
+void tcp_write_queue_purge(struct sock *sk);
 
-       tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-       while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
-               tcp_skb_tsorted_anchor_cleanup(skb);
-               sk_wmem_free_skb(sk, skb);
-       }
-       INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
-       sk_mem_reclaim(sk);
-       tcp_clear_all_retrans_hints(tcp_sk(sk));
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
+{
+       return skb_rb_first(&sk->tcp_rtx_queue);
 }
 
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
@@ -1643,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
        return skb_queue_prev(&sk->sk_write_queue, skb);
 }
 
-#define tcp_for_write_queue(skb, sk)                                   \
-       skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk)                              \
-       skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)                    \
        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 static inline struct sk_buff *tcp_send_head(const struct sock *sk)
 {
-       return sk->sk_send_head;
+       return skb_peek(&sk->sk_write_queue);
 }
 
 static inline bool tcp_skb_is_last(const struct sock *sk,
@@ -1663,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
        return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-       if (tcp_skb_is_last(sk, skb))
-               sk->sk_send_head = NULL;
-       else
-               sk->sk_send_head = tcp_write_queue_next(sk, skb);
+       return skb_queue_empty(&sk->sk_write_queue);
+}
+
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
+{
+       return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+       return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-       if (sk->sk_send_head == skb_unlinked) {
-               sk->sk_send_head = NULL;
+       if (tcp_write_queue_empty(sk))
                tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-       }
+
        if (tcp_sk(sk)->highest_sack == skb_unlinked)
                tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-       sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
        __skb_queue_tail(&sk->sk_write_queue, skb);
@@ -1696,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
        __tcp_add_write_queue_tail(sk, skb);
 
        /* Queue it, remembering where we must start sending. */
-       if (sk->sk_send_head == NULL) {
-               sk->sk_send_head = skb;
+       if (sk->sk_write_queue.next == skb) {
                tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
                if (tcp_sk(sk)->highest_sack == NULL)
@@ -1710,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
        __skb_queue_head(&sk->sk_write_queue, skb);
 }
 
-/* Insert buff after skb on the write queue of sk.  */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
-                                               struct sk_buff *buff,
-                                               struct sock *sk)
-{
-       __skb_queue_after(&sk->sk_write_queue, skb, buff);
-}
-
 /* Insert new before skb on the write queue of sk.  */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                  struct sk_buff *skb,
                                                  struct sock *sk)
 {
        __skb_queue_before(&sk->sk_write_queue, skb, new);
-
-       if (sk->sk_send_head == skb)
-               sk->sk_send_head = new;
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
-       list_del(&skb->tcp_tsorted_anchor);
-       tcp_skb_tsorted_anchor_cleanup(skb);
        __skb_unlink(skb, &sk->sk_write_queue);
 }
 
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
 {
-       return skb_queue_empty(&sk->sk_write_queue);
+       tcp_skb_tsorted_anchor_cleanup(skb);
+       rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+       list_del(&skb->tcp_tsorted_anchor);
+       tcp_rtx_queue_unlink(skb, sk);
+       sk_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
@@ -1767,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
 
 static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
 {
-       tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
-                                               tcp_write_queue_next(sk, skb);
+       struct sk_buff *next = skb_rb_next(skb);
+
+       tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
@@ -1778,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
 
 static inline void tcp_highest_sack_reset(struct sock *sk)
 {
-       tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+       struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+       tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
 }
 
 /* Called when old skb is about to be deleted (to be combined with new skb) */
@@ -1948,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
-       const struct sk_buff *skb = tcp_write_queue_head(sk);
+       const struct sk_buff *skb = tcp_rtx_queue_head(sk);
        u32 rto = inet_csk(sk)->icsk_rto;
        u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
 
index 82e93ee..67c5a9f 100644 (file)
@@ -192,6 +192,7 @@ struct scsi_device {
        unsigned no_dif:1;      /* T10 PI (DIF) should be disabled */
        unsigned broken_fua:1;          /* Don't set FUA bit */
        unsigned lun_in_cdb:1;          /* Store LUN bits in CDB[1] */
+       unsigned unmap_limit_for_ws:1;  /* Use the UNMAP limit for WRITE SAME */
 
        atomic_t disk_events_disable_depth; /* disable depth for disk events */
 
index 9592570..36b0301 100644 (file)
@@ -29,5 +29,6 @@
 #define BLIST_TRY_VPD_PAGES    0x10000000 /* Attempt to read VPD pages */
 #define BLIST_NO_RSOC          0x20000000 /* don't try to issue RSOC */
 #define BLIST_MAX_1024         0x40000000 /* maximum 1024 sector cdb length */
+#define BLIST_UNMAP_LIMIT_WS   0x80000000 /* Use UNMAP limit for WRITE SAME */
 
 #endif
index 6183d20..b266d2a 100644 (file)
@@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost,
                                                unsigned int target_id);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
-extern int iscsi_destroy_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess,
                                                int dd_size, uint32_t cid);
 extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn);
index 6082faf..6db9e1d 100644 (file)
@@ -230,7 +230,7 @@ union bpf_attr {
                __u32   numa_node;      /* numa node (effective only if
                                         * BPF_F_NUMA_NODE is set).
                                         */
-               __u8    map_name[BPF_OBJ_NAME_LEN];
+               char    map_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
                __aligned_u64   log_buf;        /* user supplied buffer */
                __u32           kern_version;   /* checked when prog_type=kprobe */
                __u32           prog_flags;
-               __u8            prog_name[BPF_OBJ_NAME_LEN];
+               char            prog_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -641,6 +641,21 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: A positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
+ *
+ * int bpf_perf_prog_read_value(ctx, buf, buf_size)
+ *     read perf prog attached perf event counter and enabled/running time
+ *     @ctx: pointer to ctx
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)          \
        FN(unspec),                     \
@@ -697,7 +712,9 @@ union bpf_attr {
        FN(redirect_map),               \
        FN(sk_redirect_map),            \
        FN(sock_map_update),            \
-       FN(xdp_adjust_meta),
+       FN(xdp_adjust_meta),            \
+       FN(perf_event_read_value),      \
+       FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +758,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX             (1ULL << 1)
 #define BPF_F_DONT_FRAGMENT            (1ULL << 2)
 
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK               0xffffffffULL
 #define BPF_F_CURRENT_CPU              BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -869,7 +888,7 @@ struct bpf_prog_info {
        __u32 created_by_uid;
        __u32 nr_map_ids;
        __aligned_u64 map_ids;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -879,7 +898,7 @@ struct bpf_map_info {
        __u32 value_size;
        __u32 max_entries;
        __u32 map_flags;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
@@ -934,4 +953,10 @@ enum {
 #define TCP_BPF_IW             1001    /* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP  1002    /* Set sndcwnd_clamp */
 
+struct bpf_perf_event_value {
+       __u64 counter;
+       __u64 enabled;
+       __u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
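A hedged sketch of a tracing program using the new bpf_perf_event_read_value() helper. The map-definition macros, section names, bpf_helpers.h wrapper and probe target follow the sample conventions of the era and are assumptions; only the helper, BPF_F_CURRENT_CPU and struct bpf_perf_event_value come from this header:

#include <uapi/linux/bpf.h>
#include <uapi/linux/ptrace.h>
#include "bpf_helpers.h"        /* helper wrappers from samples/bpf (assumed) */

struct bpf_map_def SEC("maps") counters = {
        .type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size    = sizeof(int),
        .value_size  = sizeof(__u32),
        .max_entries = 64,              /* one slot per possible CPU (assumed) */
};

SEC("kprobe/some_traced_function")      /* hypothetical probe target */
int read_counter(struct pt_regs *ctx)
{
        struct bpf_perf_event_value v = {};

        /* counter for the current CPU plus its enabled/running times */
        if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
                                      &v, sizeof(v)))
                return 0;

        /* v.counter, v.enabled and v.running now hold one snapshot */
        return 0;
}

char _license[] SEC("license") = "GPL";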
index cd580fc..b037e0a 100644 (file)
@@ -327,6 +327,7 @@ enum {
        IFLA_BRPORT_VLAN_TUNNEL,
        IFLA_BRPORT_BCAST_FLOOD,
        IFLA_BRPORT_GROUP_FWD_MASK,
+       IFLA_BRPORT_NEIGH_SUPPRESS,
        __IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
index 2e52088..a2f48c0 100644 (file)
@@ -84,6 +84,7 @@ enum tunnel_encap_types {
        TUNNEL_ENCAP_NONE,
        TUNNEL_ENCAP_FOU,
        TUNNEL_ENCAP_GUE,
+       TUNNEL_ENCAP_MPLS,
 };
 
 #define TUNNEL_ENCAP_FLAG_CSUM         (1<<0)
index b97725a..da161b5 100644 (file)
@@ -23,6 +23,7 @@ enum xt_bpf_modes {
        XT_BPF_MODE_FD_PINNED,
        XT_BPF_MODE_FD_ELF,
 };
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
 
 struct xt_bpf_info_v1 {
        __u16 mode;
index 156ee4c..0cd6f88 100644 (file)
@@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr {
        OVS_TUNNEL_KEY_ATTR_IPV6_SRC,           /* struct in6_addr src IPv6 address. */
        OVS_TUNNEL_KEY_ATTR_IPV6_DST,           /* struct in6_addr dst IPv6 address. */
        OVS_TUNNEL_KEY_ATTR_PAD,
+       OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,        /* be32 ERSPAN index. */
        __OVS_TUNNEL_KEY_ATTR_MAX
 };
 
@@ -806,6 +807,7 @@ struct ovs_action_push_eth {
  * packet.
  * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
  * packet.
+ * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -835,6 +837,7 @@ enum ovs_action_attr {
        OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
        OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
        OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
+       OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
 
        __OVS_ACTION_ATTR_MAX,        /* Nothing past this will be accepted
                                       * from userspace. */
index 897daa0..53fb09f 100644 (file)
@@ -2,6 +2,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
index 98c0f00..68d8666 100644 (file)
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 
        ee = ERR_PTR(-EOPNOTSUPP);
        event = perf_file->private_data;
-       if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+       if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
                goto err_out;
 
        ee = bpf_event_entry_gen(perf_file, map_file);
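
For reference, a sketch of the extended perf_event_read_local() interface this call site now targets; the prototype below comes from the perf side of this series and is reproduced here as an assumption. Callers that only need the counter, like the one above, pass NULL for the timing out-parameters:

	/* assumed prototype after the accompanying perf change */
	int perf_event_read_local(struct perf_event *event, u64 *value,
				  u64 *enabled, u64 *running);

	err = perf_event_read_local(event, &value, NULL, NULL);	  /* counter only */
	err = perf_event_read_local(event, &value, &enabled, &running);  /* with times */
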
index c6be15a..248961a 100644 (file)
@@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
+       const char *end = sym + KSYM_NAME_LEN;
+
        BUILD_BUG_ON(sizeof("bpf_prog_") +
-                    sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+                    sizeof(prog->tag) * 2 +
+                    /* name has been null terminated.
+                     * We should need +1 for the '_' preceding
+                     * the name.  However, the null character
+                     * is double counted between the name and the
+                     * sizeof("bpf_prog_") above, so we omit
+                     * the +1 here.
+                     */
+                    sizeof(prog->aux->name) > KSYM_NAME_LEN);
 
        sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
        sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
-       *sym = 0;
+       if (prog->aux->name[0])
+               snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+       else
+               *sym = 0;
 }
 
 static __always_inline unsigned long
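
With this change the generated ksym still starts with "bpf_prog_" plus the 16 hex characters of the program tag and, when the program was loaded with a name, gains a "_<name>" suffix. Illustrative symbols (made-up tag) as they might appear in /proc/kallsyms:

	bpf_prog_8937c8e14d9cf8e5		(program loaded without a name)
	bpf_prog_8937c8e14d9cf8e5_my_filter	(program loaded with name "my_filter")
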
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644 (file)
index 0000000..e682850
--- /dev/null
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+       __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+       BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+       if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+               return func_id_str[id];
+       else
+               return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+       [BPF_LD]    = "ld",
+       [BPF_LDX]   = "ldx",
+       [BPF_ST]    = "st",
+       [BPF_STX]   = "stx",
+       [BPF_ALU]   = "alu",
+       [BPF_JMP]   = "jmp",
+       [BPF_RET]   = "BUG",
+       [BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+       [BPF_ADD >> 4]  = "+=",
+       [BPF_SUB >> 4]  = "-=",
+       [BPF_MUL >> 4]  = "*=",
+       [BPF_DIV >> 4]  = "/=",
+       [BPF_OR  >> 4]  = "|=",
+       [BPF_AND >> 4]  = "&=",
+       [BPF_LSH >> 4]  = "<<=",
+       [BPF_RSH >> 4]  = ">>=",
+       [BPF_NEG >> 4]  = "neg",
+       [BPF_MOD >> 4]  = "%=",
+       [BPF_XOR >> 4]  = "^=",
+       [BPF_MOV >> 4]  = "=",
+       [BPF_ARSH >> 4] = "s>>=",
+       [BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+       [BPF_W >> 3]  = "u32",
+       [BPF_H >> 3]  = "u16",
+       [BPF_B >> 3]  = "u8",
+       [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+       [BPF_JA >> 4]   = "jmp",
+       [BPF_JEQ >> 4]  = "==",
+       [BPF_JGT >> 4]  = ">",
+       [BPF_JLT >> 4]  = "<",
+       [BPF_JGE >> 4]  = ">=",
+       [BPF_JLE >> 4]  = "<=",
+       [BPF_JSET >> 4] = "&",
+       [BPF_JNE >> 4]  = "!=",
+       [BPF_JSGT >> 4] = "s>",
+       [BPF_JSLT >> 4] = "s<",
+       [BPF_JSGE >> 4] = "s>=",
+       [BPF_JSLE >> 4] = "s<=",
+       [BPF_CALL >> 4] = "call",
+       [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+                              struct bpf_verifier_env *env,
+                              const struct bpf_insn *insn)
+{
+       verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+               BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+               insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+                   const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+       u8 class = BPF_CLASS(insn->code);
+
+       if (class == BPF_ALU || class == BPF_ALU64) {
+               if (BPF_OP(insn->code) == BPF_END) {
+                       if (class == BPF_ALU64)
+                               verbose(env, "BUG_alu64_%02x\n", insn->code);
+                       else
+                               print_bpf_end_insn(verbose, env, insn);
+               } else if (BPF_OP(insn->code) == BPF_NEG) {
+                       verbose(env, "(%02x) r%d = %s-r%d\n",
+                               insn->code, insn->dst_reg,
+                               class == BPF_ALU ? "(u32) " : "",
+                               insn->dst_reg);
+               } else if (BPF_SRC(insn->code) == BPF_X) {
+                       verbose(env, "(%02x) %sr%d %s %sr%d\n",
+                               insn->code, class == BPF_ALU ? "(u32) " : "",
+                               insn->dst_reg,
+                               bpf_alu_string[BPF_OP(insn->code) >> 4],
+                               class == BPF_ALU ? "(u32) " : "",
+                               insn->src_reg);
+               } else {
+                       verbose(env, "(%02x) %sr%d %s %s%d\n",
+                               insn->code, class == BPF_ALU ? "(u32) " : "",
+                               insn->dst_reg,
+                               bpf_alu_string[BPF_OP(insn->code) >> 4],
+                               class == BPF_ALU ? "(u32) " : "",
+                               insn->imm);
+               }
+       } else if (class == BPF_STX) {
+               if (BPF_MODE(insn->code) == BPF_MEM)
+                       verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->dst_reg,
+                               insn->off, insn->src_reg);
+               else if (BPF_MODE(insn->code) == BPF_XADD)
+                       verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->dst_reg, insn->off,
+                               insn->src_reg);
+               else
+                       verbose(env, "BUG_%02x\n", insn->code);
+       } else if (class == BPF_ST) {
+               if (BPF_MODE(insn->code) != BPF_MEM) {
+                       verbose(env, "BUG_st_%02x\n", insn->code);
+                       return;
+               }
+               verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+                       insn->code,
+                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                       insn->dst_reg,
+                       insn->off, insn->imm);
+       } else if (class == BPF_LDX) {
+               if (BPF_MODE(insn->code) != BPF_MEM) {
+                       verbose(env, "BUG_ldx_%02x\n", insn->code);
+                       return;
+               }
+               verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+                       insn->code, insn->dst_reg,
+                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                       insn->src_reg, insn->off);
+       } else if (class == BPF_LD) {
+               if (BPF_MODE(insn->code) == BPF_ABS) {
+                       verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->imm);
+               } else if (BPF_MODE(insn->code) == BPF_IND) {
+                       verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+                               insn->code,
+                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+                               insn->src_reg, insn->imm);
+               } else if (BPF_MODE(insn->code) == BPF_IMM &&
+                          BPF_SIZE(insn->code) == BPF_DW) {
+                       /* At this point, we already made sure that the second
+                        * part of the ldimm64 insn is accessible.
+                        */
+                       u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+                       bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+                       if (map_ptr && !allow_ptr_leaks)
+                               imm = 0;
+
+                       verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+                               insn->dst_reg, (unsigned long long)imm);
+               } else {
+                       verbose(env, "BUG_ld_%02x\n", insn->code);
+                       return;
+               }
+       } else if (class == BPF_JMP) {
+               u8 opcode = BPF_OP(insn->code);
+
+               if (opcode == BPF_CALL) {
+                       verbose(env, "(%02x) call %s#%d\n", insn->code,
+                               func_id_name(insn->imm), insn->imm);
+               } else if (insn->code == (BPF_JMP | BPF_JA)) {
+                       verbose(env, "(%02x) goto pc%+d\n",
+                               insn->code, insn->off);
+               } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+                       verbose(env, "(%02x) exit\n", insn->code);
+               } else if (BPF_SRC(insn->code) == BPF_X) {
+                       verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+                               insn->code, insn->dst_reg,
+                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
+                               insn->src_reg, insn->off);
+               } else {
+                       verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+                               insn->code, insn->dst_reg,
+                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
+                               insn->imm, insn->off);
+               }
+       } else {
+               verbose(env, "(%02x) %s\n",
+                       insn->code, bpf_class_string[class]);
+       }
+}
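
As a concrete illustration (not taken from the patch), a trivial two-instruction program

	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),

would be rendered by print_bpf_insn() roughly as:

	(b7) r0 = 0
	(95) exit

The per-instruction "idx: " prefix seen in verifier logs is printed by the verifier before it calls print_bpf_insn(), not by this file.
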
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644 (file)
index 0000000..8de977e
--- /dev/null
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+                                 const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+                   const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
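
A hypothetical sketch (not part of this series) of another in-kernel consumer reusing the now-shared disassembler through the bpf_insn_print_cb type; the env argument is only passed through to the callback, so a callback that ignores it can take NULL:

	#include <linux/printk.h>
	#include "disasm.h"

	static void dump_insn_cb(struct bpf_verifier_env *env, const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		vprintk(fmt, args);
		va_end(args);
	}

	static void dump_insn(const struct bpf_insn *insn)
	{
		print_bpf_insn(dump_insn_cb, NULL, insn, false);
	}
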
index e833ed9..be1dde9 100644 (file)
@@ -363,6 +363,7 @@ out:
        putname(pname);
        return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
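
The new export presumably allows modular users, such as the netfilter xt_bpf match gaining XT_BPF_MODE_PATH_PINNED earlier in this series, to resolve a pinned BPF object by path; that linkage is an inference, not stated in the hunk itself.
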
index 0048cb2..d124e70 100644 (file)
@@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 {
        const char *end = src + BPF_OBJ_NAME_LEN;
 
+       memset(dst, 0, BPF_OBJ_NAME_LEN);
+
        /* Copy all isalnum() and '_' char */
        while (src < end && *src) {
                if (!isalnum(*src) && *src != '_')
@@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
        if (src == end)
                return -EINVAL;
 
-       /* '\0' terminates dst */
-       *dst = 0;
-
        return 0;
 }
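
Zeroing the destination up front means a short name is fully NUL-padded rather than merely NUL-terminated, regardless of how the caller allocated the buffer; this matters now that the name is copied verbatim into bpf_prog_info/bpf_map_info and appended to the program's ksym. A sketch of the effect, assuming BPF_OBJ_NAME_LEN is 16 and an illustrative name "xdp1":

	/* before: dst = "xdp1\0" + whatever happened to be in dst[5..15]   */
	/* after:  dst = "xdp1\0\0\0\0\0\0\0\0\0\0\0\0"      (fully padded) */
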
 
index 52b0223..2cdbcc4 100644 (file)
@@ -21,6 +21,8 @@
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
 
+#include "disasm.h"
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +155,36 @@ struct bpf_call_arg_meta {
        int access_size;
 };
 
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 /* log_level controls verbosity level of eBPF verifier.
  * verbose() is used to dump the verification trace to the log, so the user
  * can figure out what's wrong with the program
  */
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+                                  const char *fmt, ...)
 {
+       struct bpf_verifer_log *log = &env->log;
+       unsigned int n;
        va_list args;
 
-       if (log_level == 0 || log_len >= log_size - 1)
+       if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
                return;
 
        va_start(args, fmt);
-       log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+       n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
        va_end(args);
+
+       WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+                 "verifier log line truncated - local buffer too short\n");
+
+       n = min(log->len_total - log->len_used - 1, n);
+       log->kbuf[n] = '\0';
+
+       if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+               log->len_used += n;
+       else
+               log->ubuf = NULL;
 }
 
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
@@ -197,23 +207,8 @@ static const char * const reg_type_str[] = {
        [PTR_TO_PACKET_END]     = "pkt_end",
 };
 
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
-       __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
-       BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
-       if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
-               return func_id_str[id];
-       else
-               return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+                                struct bpf_verifier_state *state)
 {
        struct bpf_reg_state *reg;
        enum bpf_reg_type t;
@@ -224,21 +219,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
                t = reg->type;
                if (t == NOT_INIT)
                        continue;
-               verbose(" R%d=%s", i, reg_type_str[t]);
+               verbose(env, " R%d=%s", i, reg_type_str[t]);
                if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
                    tnum_is_const(reg->var_off)) {
                        /* reg->off should be 0 for SCALAR_VALUE */
-                       verbose("%lld", reg->var_off.value + reg->off);
+                       verbose(env, "%lld", reg->var_off.value + reg->off);
                } else {
-                       verbose("(id=%d", reg->id);
+                       verbose(env, "(id=%d", reg->id);
                        if (t != SCALAR_VALUE)
-                               verbose(",off=%d", reg->off);
+                               verbose(env, ",off=%d", reg->off);
                        if (type_is_pkt_pointer(t))
-                               verbose(",r=%d", reg->range);
+                               verbose(env, ",r=%d", reg->range);
                        else if (t == CONST_PTR_TO_MAP ||
                                 t == PTR_TO_MAP_VALUE ||
                                 t == PTR_TO_MAP_VALUE_OR_NULL)
-                               verbose(",ks=%d,vs=%d",
+                               verbose(env, ",ks=%d,vs=%d",
                                        reg->map_ptr->key_size,
                                        reg->map_ptr->value_size);
                        if (tnum_is_const(reg->var_off)) {
@@ -246,218 +241,38 @@ static void print_verifier_state(struct bpf_verifier_state *state)
                                 * could be a pointer whose offset is too big
                                 * for reg->off
                                 */
-                               verbose(",imm=%llx", reg->var_off.value);
+                               verbose(env, ",imm=%llx", reg->var_off.value);
                        } else {
                                if (reg->smin_value != reg->umin_value &&
                                    reg->smin_value != S64_MIN)
-                                       verbose(",smin_value=%lld",
+                                       verbose(env, ",smin_value=%lld",
                                                (long long)reg->smin_value);
                                if (reg->smax_value != reg->umax_value &&
                                    reg->smax_value != S64_MAX)
-                                       verbose(",smax_value=%lld",
+                                       verbose(env, ",smax_value=%lld",
                                                (long long)reg->smax_value);
                                if (reg->umin_value != 0)
-                                       verbose(",umin_value=%llu",
+                                       verbose(env, ",umin_value=%llu",
                                                (unsigned long long)reg->umin_value);
                                if (reg->umax_value != U64_MAX)
-                                       verbose(",umax_value=%llu",
+                                       verbose(env, ",umax_value=%llu",
                                                (unsigned long long)reg->umax_value);
                                if (!tnum_is_unknown(reg->var_off)) {
                                        char tn_buf[48];
 
                                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                                       verbose(",var_off=%s", tn_buf);
+                                       verbose(env, ",var_off=%s", tn_buf);
                                }
                        }
-                       verbose(")");
+                       verbose(env, ")");
                }
        }
        for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
                if (state->stack_slot_type[i] == STACK_SPILL)
-                       verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+                       verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
                                reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
        }
-       verbose("\n");
-}
-
-static const char *const bpf_class_string[] = {
-       [BPF_LD]    = "ld",
-       [BPF_LDX]   = "ldx",
-       [BPF_ST]    = "st",
-       [BPF_STX]   = "stx",
-       [BPF_ALU]   = "alu",
-       [BPF_JMP]   = "jmp",
-       [BPF_RET]   = "BUG",
-       [BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
-       [BPF_ADD >> 4]  = "+=",
-       [BPF_SUB >> 4]  = "-=",
-       [BPF_MUL >> 4]  = "*=",
-       [BPF_DIV >> 4]  = "/=",
-       [BPF_OR  >> 4]  = "|=",
-       [BPF_AND >> 4]  = "&=",
-       [BPF_LSH >> 4]  = "<<=",
-       [BPF_RSH >> 4]  = ">>=",
-       [BPF_NEG >> 4]  = "neg",
-       [BPF_MOD >> 4]  = "%=",
-       [BPF_XOR >> 4]  = "^=",
-       [BPF_MOV >> 4]  = "=",
-       [BPF_ARSH >> 4] = "s>>=",
-       [BPF_END >> 4]  = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
-       [BPF_W >> 3]  = "u32",
-       [BPF_H >> 3]  = "u16",
-       [BPF_B >> 3]  = "u8",
-       [BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
-       [BPF_JA >> 4]   = "jmp",
-       [BPF_JEQ >> 4]  = "==",
-       [BPF_JGT >> 4]  = ">",
-       [BPF_JLT >> 4]  = "<",
-       [BPF_JGE >> 4]  = ">=",
-       [BPF_JLE >> 4]  = "<=",
-       [BPF_JSET >> 4] = "&",
-       [BPF_JNE >> 4]  = "!=",
-       [BPF_JSGT >> 4] = "s>",
-       [BPF_JSLT >> 4] = "s<",
-       [BPF_JSGE >> 4] = "s>=",
-       [BPF_JSLE >> 4] = "s<=",
-       [BPF_CALL >> 4] = "call",
-       [BPF_EXIT >> 4] = "exit",
-};
-
-static void print_bpf_end_insn(const struct bpf_verifier_env *env,
-                              const struct bpf_insn *insn)
-{
-       verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
-               BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
-               insn->imm, insn->dst_reg);
-}
-
-static void print_bpf_insn(const struct bpf_verifier_env *env,
-                          const struct bpf_insn *insn)
-{
-       u8 class = BPF_CLASS(insn->code);
-
-       if (class == BPF_ALU || class == BPF_ALU64) {
-               if (BPF_OP(insn->code) == BPF_END) {
-                       if (class == BPF_ALU64)
-                               verbose("BUG_alu64_%02x\n", insn->code);
-                       else
-                               print_bpf_end_insn(env, insn);
-               } else if (BPF_OP(insn->code) == BPF_NEG) {
-                       verbose("(%02x) r%d = %s-r%d\n",
-                               insn->code, insn->dst_reg,
-                               class == BPF_ALU ? "(u32) " : "",
-                               insn->dst_reg);
-               } else if (BPF_SRC(insn->code) == BPF_X) {
-                       verbose("(%02x) %sr%d %s %sr%d\n",
-                               insn->code, class == BPF_ALU ? "(u32) " : "",
-                               insn->dst_reg,
-                               bpf_alu_string[BPF_OP(insn->code) >> 4],
-                               class == BPF_ALU ? "(u32) " : "",
-                               insn->src_reg);
-               } else {
-                       verbose("(%02x) %sr%d %s %s%d\n",
-                               insn->code, class == BPF_ALU ? "(u32) " : "",
-                               insn->dst_reg,
-                               bpf_alu_string[BPF_OP(insn->code) >> 4],
-                               class == BPF_ALU ? "(u32) " : "",
-                               insn->imm);
-               }
-       } else if (class == BPF_STX) {
-               if (BPF_MODE(insn->code) == BPF_MEM)
-                       verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->dst_reg,
-                               insn->off, insn->src_reg);
-               else if (BPF_MODE(insn->code) == BPF_XADD)
-                       verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->dst_reg, insn->off,
-                               insn->src_reg);
-               else
-                       verbose("BUG_%02x\n", insn->code);
-       } else if (class == BPF_ST) {
-               if (BPF_MODE(insn->code) != BPF_MEM) {
-                       verbose("BUG_st_%02x\n", insn->code);
-                       return;
-               }
-               verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
-                       insn->code,
-                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                       insn->dst_reg,
-                       insn->off, insn->imm);
-       } else if (class == BPF_LDX) {
-               if (BPF_MODE(insn->code) != BPF_MEM) {
-                       verbose("BUG_ldx_%02x\n", insn->code);
-                       return;
-               }
-               verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
-                       insn->code, insn->dst_reg,
-                       bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                       insn->src_reg, insn->off);
-       } else if (class == BPF_LD) {
-               if (BPF_MODE(insn->code) == BPF_ABS) {
-                       verbose("(%02x) r0 = *(%s *)skb[%d]\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->imm);
-               } else if (BPF_MODE(insn->code) == BPF_IND) {
-                       verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
-                               insn->code,
-                               bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-                               insn->src_reg, insn->imm);
-               } else if (BPF_MODE(insn->code) == BPF_IMM &&
-                          BPF_SIZE(insn->code) == BPF_DW) {
-                       /* At this point, we already made sure that the second
-                        * part of the ldimm64 insn is accessible.
-                        */
-                       u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
-                       bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
-
-                       if (map_ptr && !env->allow_ptr_leaks)
-                               imm = 0;
-
-                       verbose("(%02x) r%d = 0x%llx\n", insn->code,
-                               insn->dst_reg, (unsigned long long)imm);
-               } else {
-                       verbose("BUG_ld_%02x\n", insn->code);
-                       return;
-               }
-       } else if (class == BPF_JMP) {
-               u8 opcode = BPF_OP(insn->code);
-
-               if (opcode == BPF_CALL) {
-                       verbose("(%02x) call %s#%d\n", insn->code,
-                               func_id_name(insn->imm), insn->imm);
-               } else if (insn->code == (BPF_JMP | BPF_JA)) {
-                       verbose("(%02x) goto pc%+d\n",
-                               insn->code, insn->off);
-               } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-                       verbose("(%02x) exit\n", insn->code);
-               } else if (BPF_SRC(insn->code) == BPF_X) {
-                       verbose("(%02x) if r%d %s r%d goto pc%+d\n",
-                               insn->code, insn->dst_reg,
-                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
-                               insn->src_reg, insn->off);
-               } else {
-                       verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
-                               insn->code, insn->dst_reg,
-                               bpf_jmp_string[BPF_OP(insn->code) >> 4],
-                               insn->imm, insn->off);
-               }
-       } else {
-               verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
-       }
+       verbose(env, "\n");
 }
 
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
@@ -495,7 +310,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
        env->head = elem;
        env->stack_size++;
        if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-               verbose("BPF program is too complex\n");
+               verbose(env, "BPF program is too complex\n");
                goto err;
        }
        return &elem->st;
@@ -533,10 +348,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
        __mark_reg_known(reg, 0);
 }
 
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+                               struct bpf_reg_state *regs, u32 regno)
 {
        if (WARN_ON(regno >= MAX_BPF_REG)) {
-               verbose("mark_reg_known_zero(regs, %u)\n", regno);
+               verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
                /* Something bad happened, let's kill all regs */
                for (regno = 0; regno < MAX_BPF_REG; regno++)
                        __mark_reg_not_init(regs + regno);
@@ -646,10 +462,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
        __mark_reg_unbounded(reg);
 }
 
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+                            struct bpf_reg_state *regs, u32 regno)
 {
        if (WARN_ON(regno >= MAX_BPF_REG)) {
-               verbose("mark_reg_unknown(regs, %u)\n", regno);
+               verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
                /* Something bad happened, let's kill all regs */
                for (regno = 0; regno < MAX_BPF_REG; regno++)
                        __mark_reg_not_init(regs + regno);
@@ -664,10 +481,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
        reg->type = NOT_INIT;
 }
 
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+                             struct bpf_reg_state *regs, u32 regno)
 {
        if (WARN_ON(regno >= MAX_BPF_REG)) {
-               verbose("mark_reg_not_init(regs, %u)\n", regno);
+               verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
                /* Something bad happened, let's kill all regs */
                for (regno = 0; regno < MAX_BPF_REG; regno++)
                        __mark_reg_not_init(regs + regno);
@@ -676,22 +494,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
        __mark_reg_not_init(regs + regno);
 }
 
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+                          struct bpf_reg_state *regs)
 {
        int i;
 
        for (i = 0; i < MAX_BPF_REG; i++) {
-               mark_reg_not_init(regs, i);
+               mark_reg_not_init(env, regs, i);
                regs[i].live = REG_LIVE_NONE;
        }
 
        /* frame pointer */
        regs[BPF_REG_FP].type = PTR_TO_STACK;
-       mark_reg_known_zero(regs, BPF_REG_FP);
+       mark_reg_known_zero(env, regs, BPF_REG_FP);
 
        /* 1st arg to a function */
        regs[BPF_REG_1].type = PTR_TO_CTX;
-       mark_reg_known_zero(regs, BPF_REG_1);
+       mark_reg_known_zero(env, regs, BPF_REG_1);
 }
 
 enum reg_arg_type {
@@ -704,6 +523,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 {
        struct bpf_verifier_state *parent = state->parent;
 
+       if (regno == BPF_REG_FP)
+               /* We don't need to worry about FP liveness because it's read-only */
+               return;
+
        while (parent) {
                /* if read wasn't screened by an earlier write ... */
                if (state->regs[regno].live & REG_LIVE_WRITTEN)
@@ -721,26 +544,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
        struct bpf_reg_state *regs = env->cur_state.regs;
 
        if (regno >= MAX_BPF_REG) {
-               verbose("R%d is invalid\n", regno);
+               verbose(env, "R%d is invalid\n", regno);
                return -EINVAL;
        }
 
        if (t == SRC_OP) {
                /* check whether register used as source operand can be read */
                if (regs[regno].type == NOT_INIT) {
-                       verbose("R%d !read_ok\n", regno);
+                       verbose(env, "R%d !read_ok\n", regno);
                        return -EACCES;
                }
                mark_reg_read(&env->cur_state, regno);
        } else {
                /* check whether register used as dest operand can be written to */
                if (regno == BPF_REG_FP) {
-                       verbose("frame pointer is read only\n");
+                       verbose(env, "frame pointer is read only\n");
                        return -EACCES;
                }
                regs[regno].live |= REG_LIVE_WRITTEN;
                if (t == DST_OP)
-                       mark_reg_unknown(regs, regno);
+                       mark_reg_unknown(env, regs, regno);
        }
        return 0;
 }
@@ -765,7 +588,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+                            struct bpf_verifier_state *state, int off,
                             int size, int value_regno)
 {
        int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -778,7 +602,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
 
                /* register containing pointer is being spilled into stack */
                if (size != BPF_REG_SIZE) {
-                       verbose("invalid size of register spill\n");
+                       verbose(env, "invalid size of register spill\n");
                        return -EACCES;
                }
 
@@ -813,7 +637,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
        }
 }
 
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+                           struct bpf_verifier_state *state, int off, int size,
                            int value_regno)
 {
        u8 *slot_type;
@@ -823,12 +648,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 
        if (slot_type[0] == STACK_SPILL) {
                if (size != BPF_REG_SIZE) {
-                       verbose("invalid size of register spill\n");
+                       verbose(env, "invalid size of register spill\n");
                        return -EACCES;
                }
                for (i = 1; i < BPF_REG_SIZE; i++) {
                        if (slot_type[i] != STACK_SPILL) {
-                               verbose("corrupted spill memory\n");
+                               verbose(env, "corrupted spill memory\n");
                                return -EACCES;
                        }
                }
@@ -844,14 +669,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
        } else {
                for (i = 0; i < size; i++) {
                        if (slot_type[i] != STACK_MISC) {
-                               verbose("invalid read from stack off %d+%d size %d\n",
+                               verbose(env, "invalid read from stack off %d+%d size %d\n",
                                        off, i, size);
                                return -EACCES;
                        }
                }
                if (value_regno >= 0)
                        /* have read misc data from the stack */
-                       mark_reg_unknown(state->regs, value_regno);
+                       mark_reg_unknown(env, state->regs, value_regno);
                return 0;
        }
 }
@@ -863,7 +688,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
        struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
 
        if (off < 0 || size <= 0 || off + size > map->value_size) {
-               verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+               verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
                        map->value_size, off, size);
                return -EACCES;
        }
@@ -882,8 +707,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
         * need to try adding each of min_value and max_value to off
         * to make sure our theoretical access will be safe.
         */
-       if (log_level)
-               print_verifier_state(state);
+       if (env->log.level)
+               print_verifier_state(env, state);
        /* The minimum value is only important with signed
         * comparisons where we can't assume the floor of a
         * value is 0.  If we are using signed variables for our
@@ -891,13 +716,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
         * will have a set floor within our range.
         */
        if (reg->smin_value < 0) {
-               verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+               verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                        regno);
                return -EACCES;
        }
        err = __check_map_access(env, regno, reg->smin_value + off, size);
        if (err) {
-               verbose("R%d min value is outside of the array range\n", regno);
+               verbose(env, "R%d min value is outside of the array range\n",
+                       regno);
                return err;
        }
 
@@ -906,13 +732,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
         * If reg->umax_value + off could overflow, treat that as unbounded too.
         */
        if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-               verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+               verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
                        regno);
                return -EACCES;
        }
        err = __check_map_access(env, regno, reg->umax_value + off, size);
        if (err)
-               verbose("R%d max value is outside of the array range\n", regno);
+               verbose(env, "R%d max value is outside of the array range\n",
+                       regno);
        return err;
 }
 
@@ -951,7 +778,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
        struct bpf_reg_state *reg = &regs[regno];
 
        if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
-               verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+               verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
                        off, size, regno, reg->id, reg->off, reg->range);
                return -EACCES;
        }
@@ -974,13 +801,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
         * detail to prove they're safe.
         */
        if (reg->smin_value < 0) {
-               verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+               verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
                        regno);
                return -EACCES;
        }
        err = __check_packet_access(env, regno, off, size);
        if (err) {
-               verbose("R%d offset is outside of the packet\n", regno);
+               verbose(env, "R%d offset is outside of the packet\n", regno);
                return err;
        }
        return err;
@@ -1016,7 +843,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
                return 0;
        }
 
-       verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+       verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
        return -EACCES;
 }
 
@@ -1034,7 +861,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
        return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
 }
 
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+                                  const struct bpf_reg_state *reg,
                                   int off, int size, bool strict)
 {
        struct tnum reg_off;
@@ -1059,7 +887,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
                char tn_buf[48];
 
                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-               verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+               verbose(env,
+                       "misaligned packet access off %d+%s+%d+%d size %d\n",
                        ip_align, tn_buf, reg->off, off, size);
                return -EACCES;
        }
@@ -1067,7 +896,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
        return 0;
 }
 
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+                                      const struct bpf_reg_state *reg,
                                       const char *pointer_desc,
                                       int off, int size, bool strict)
 {
@@ -1082,7 +912,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
                char tn_buf[48];
 
                tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-               verbose("misaligned %saccess off %s+%d+%d size %d\n",
+               verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
                        pointer_desc, tn_buf, reg->off, off, size);
                return -EACCES;
        }
@@ -1103,7 +933,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
                /* Special case, because of NET_IP_ALIGN. Given metadata sits
                 * right in front, treat it the very same way.
                 */
-               return check_pkt_ptr_alignment(reg, off, size, strict);
+               return check_pkt_ptr_alignment(env, reg, off, size, strict);
        case PTR_TO_MAP_VALUE:
                pointer_desc = "value ";
                break;
@@ -1116,7 +946,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
        default:
                break;
        }
-       return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+       return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+                                          strict);
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1148,20 +979,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
        if (reg->type == PTR_TO_MAP_VALUE) {
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
-                       verbose("R%d leaks addr into map\n", value_regno);
+                       verbose(env, "R%d leaks addr into map\n", value_regno);
                        return -EACCES;
                }
 
                err = check_map_access(env, regno, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
-                       mark_reg_unknown(state->regs, value_regno);
+                       mark_reg_unknown(env, state->regs, value_regno);
 
        } else if (reg->type == PTR_TO_CTX) {
                enum bpf_reg_type reg_type = SCALAR_VALUE;
 
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
-                       verbose("R%d leaks addr into ctx\n", value_regno);
+                       verbose(env, "R%d leaks addr into ctx\n", value_regno);
                        return -EACCES;
                }
                /* ctx accesses must be at a fixed offset, so that we can
@@ -1171,7 +1002,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                        char tn_buf[48];
 
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose("variable ctx access var_off=%s off=%d size=%d",
+                       verbose(env,
+                               "variable ctx access var_off=%s off=%d size=%d",
                                tn_buf, off, size);
                        return -EACCES;
                }
@@ -1183,9 +1015,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                         * case, we know the offset is zero.
                         */
                        if (reg_type == SCALAR_VALUE)
-                               mark_reg_unknown(state->regs, value_regno);
+                               mark_reg_unknown(env, state->regs, value_regno);
                        else
-                               mark_reg_known_zero(state->regs, value_regno);
+                               mark_reg_known_zero(env, state->regs,
+                                                   value_regno);
                        state->regs[value_regno].id = 0;
                        state->regs[value_regno].off = 0;
                        state->regs[value_regno].range = 0;
@@ -1201,13 +1034,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                        char tn_buf[48];
 
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose("variable stack access var_off=%s off=%d size=%d",
+                       verbose(env, "variable stack access var_off=%s off=%d size=%d",
                                tn_buf, off, size);
                        return -EACCES;
                }
                off += reg->var_off.value;
                if (off >= 0 || off < -MAX_BPF_STACK) {
-                       verbose("invalid stack off=%d size=%d\n", off, size);
+                       verbose(env, "invalid stack off=%d size=%d\n", off,
+                               size);
                        return -EACCES;
                }
 
@@ -1218,29 +1052,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                        if (!env->allow_ptr_leaks &&
                            state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
                            size != BPF_REG_SIZE) {
-                               verbose("attempt to corrupt spilled pointer on stack\n");
+                               verbose(env, "attempt to corrupt spilled pointer on stack\n");
                                return -EACCES;
                        }
-                       err = check_stack_write(state, off, size, value_regno);
+                       err = check_stack_write(env, state, off, size,
+                                               value_regno);
                } else {
-                       err = check_stack_read(state, off, size, value_regno);
+                       err = check_stack_read(env, state, off, size,
+                                              value_regno);
                }
        } else if (reg_is_pkt_pointer(reg)) {
                if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
-                       verbose("cannot write into packet\n");
+                       verbose(env, "cannot write into packet\n");
                        return -EACCES;
                }
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
-                       verbose("R%d leaks addr into packet\n", value_regno);
+                       verbose(env, "R%d leaks addr into packet\n",
+                               value_regno);
                        return -EACCES;
                }
                err = check_packet_access(env, regno, off, size);
                if (!err && t == BPF_READ && value_regno >= 0)
-                       mark_reg_unknown(state->regs, value_regno);
+                       mark_reg_unknown(env, state->regs, value_regno);
        } else {
-               verbose("R%d invalid mem access '%s'\n",
-                       regno, reg_type_str[reg->type]);
+               verbose(env, "R%d invalid mem access '%s'\n", regno,
+                       reg_type_str[reg->type]);
                return -EACCES;
        }
 
@@ -1260,7 +1097,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 
        if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
            insn->imm != 0) {
-               verbose("BPF_XADD uses reserved fields\n");
+               verbose(env, "BPF_XADD uses reserved fields\n");
                return -EINVAL;
        }
 
@@ -1275,7 +1112,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
                return err;
 
        if (is_pointer_value(env, insn->src_reg)) {
-               verbose("R%d leaks addr into mem\n", insn->src_reg);
+               verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
                return -EACCES;
        }
 
@@ -1316,7 +1153,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
                    register_is_null(regs[regno]))
                        return 0;
 
-               verbose("R%d type=%s expected=%s\n", regno,
+               verbose(env, "R%d type=%s expected=%s\n", regno,
                        reg_type_str[regs[regno].type],
                        reg_type_str[PTR_TO_STACK]);
                return -EACCES;
@@ -1327,13 +1164,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
                char tn_buf[48];
 
                tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
-               verbose("invalid variable stack read R%d var_off=%s\n",
+               verbose(env, "invalid variable stack read R%d var_off=%s\n",
                        regno, tn_buf);
        }
        off = regs[regno].off + regs[regno].var_off.value;
        if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
            access_size <= 0) {
-               verbose("invalid stack type R%d off=%d access_size=%d\n",
+               verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
                        regno, off, access_size);
                return -EACCES;
        }
@@ -1349,7 +1186,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 
        for (i = 0; i < access_size; i++) {
                if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
-                       verbose("invalid indirect read from stack off %d+%d size %d\n",
+                       verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
                                off, i, access_size);
                        return -EACCES;
                }
@@ -1392,7 +1229,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
        if (arg_type == ARG_ANYTHING) {
                if (is_pointer_value(env, regno)) {
-                       verbose("R%d leaks addr into helper function\n", regno);
+                       verbose(env, "R%d leaks addr into helper function\n",
+                               regno);
                        return -EACCES;
                }
                return 0;
@@ -1400,7 +1238,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
        if (type_is_pkt_pointer(type) &&
            !may_access_direct_pkt_data(env, meta, BPF_READ)) {
-               verbose("helper access to the packet is not allowed\n");
+               verbose(env, "helper access to the packet is not allowed\n");
                return -EACCES;
        }
 
@@ -1438,7 +1276,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                        goto err_type;
                meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
        } else {
-               verbose("unsupported arg_type %d\n", arg_type);
+               verbose(env, "unsupported arg_type %d\n", arg_type);
                return -EFAULT;
        }
 
@@ -1456,7 +1294,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                         * we have to check map_key here. Otherwise it means
                         * that kernel subsystem misconfigured verifier
                         */
-                       verbose("invalid map_ptr to access map->key\n");
+                       verbose(env, "invalid map_ptr to access map->key\n");
                        return -EACCES;
                }
                if (type_is_pkt_pointer(type))
@@ -1472,7 +1310,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                 */
                if (!meta->map_ptr) {
                        /* kernel subsystem misconfigured verifier */
-                       verbose("invalid map_ptr to access map->value\n");
+                       verbose(env, "invalid map_ptr to access map->value\n");
                        return -EACCES;
                }
                if (type_is_pkt_pointer(type))
@@ -1492,7 +1330,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                 */
                if (regno == 0) {
                        /* kernel subsystem misconfigured verifier */
-                       verbose("ARG_CONST_SIZE cannot be first argument\n");
+                       verbose(env,
+                               "ARG_CONST_SIZE cannot be first argument\n");
                        return -EACCES;
                }
 
@@ -1509,7 +1348,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                        meta = NULL;
 
                if (reg->smin_value < 0) {
-                       verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+                       verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
                                regno);
                        return -EACCES;
                }
@@ -1523,7 +1362,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                }
 
                if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-                       verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+                       verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
                                regno);
                        return -EACCES;
                }
@@ -1534,12 +1373,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
        return err;
 err_type:
-       verbose("R%d type=%s expected=%s\n", regno,
+       verbose(env, "R%d type=%s expected=%s\n", regno,
                reg_type_str[type], reg_type_str[expected_type]);
        return -EACCES;
 }
 
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+                                       struct bpf_map *map, int func_id)
 {
        if (!map)
                return 0;
@@ -1552,7 +1392,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
                break;
        case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
                if (func_id != BPF_FUNC_perf_event_read &&
-                   func_id != BPF_FUNC_perf_event_output)
+                   func_id != BPF_FUNC_perf_event_output &&
+                   func_id != BPF_FUNC_perf_event_read_value)
                        goto error;
                break;
        case BPF_MAP_TYPE_STACK_TRACE:
@@ -1595,6 +1436,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
                break;
        case BPF_FUNC_perf_event_read:
        case BPF_FUNC_perf_event_output:
+       case BPF_FUNC_perf_event_read_value:
                if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
                        goto error;
                break;
@@ -1625,7 +1467,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 
        return 0;
 error:
-       verbose("cannot pass map_type %d into func %s#%d\n",
+       verbose(env, "cannot pass map_type %d into func %s#%d\n",
                map->map_type, func_id_name(func_id), func_id);
        return -EINVAL;
 }
@@ -1659,7 +1501,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 
        for (i = 0; i < MAX_BPF_REG; i++)
                if (reg_is_pkt_pointer_any(&regs[i]))
-                       mark_reg_unknown(regs, i);
+                       mark_reg_unknown(env, regs, i);
 
        for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
                if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1681,7 +1523,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
        /* find function prototype */
        if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
-               verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+               verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+                       func_id);
                return -EINVAL;
        }
 
@@ -1689,13 +1532,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
                fn = env->prog->aux->ops->get_func_proto(func_id);
 
        if (!fn) {
-               verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+               verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+                       func_id);
                return -EINVAL;
        }
 
        /* eBPF programs must be GPL compatible to use GPL-ed functions */
        if (!env->prog->gpl_compatible && fn->gpl_only) {
-               verbose("cannot call GPL only function from proprietary program\n");
+               verbose(env, "cannot call GPL only function from proprietary program\n");
                return -EINVAL;
        }
 
@@ -1709,7 +1553,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
         */
        err = check_raw_mode(fn);
        if (err) {
-               verbose("kernel subsystem misconfigured func %s#%d\n",
+               verbose(env, "kernel subsystem misconfigured func %s#%d\n",
                        func_id_name(func_id), func_id);
                return err;
        }
@@ -1742,14 +1586,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
        /* reset caller saved regs */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
-               mark_reg_not_init(regs, caller_saved[i]);
+               mark_reg_not_init(env, regs, caller_saved[i]);
                check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        }
 
        /* update return register (already marked as written above) */
        if (fn->ret_type == RET_INTEGER) {
                /* sets type to SCALAR_VALUE */
-               mark_reg_unknown(regs, BPF_REG_0);
+               mark_reg_unknown(env, regs, BPF_REG_0);
        } else if (fn->ret_type == RET_VOID) {
                regs[BPF_REG_0].type = NOT_INIT;
        } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1757,14 +1601,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
                regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
                /* There is no offset yet applied, variable or fixed */
-               mark_reg_known_zero(regs, BPF_REG_0);
+               mark_reg_known_zero(env, regs, BPF_REG_0);
                regs[BPF_REG_0].off = 0;
                /* remember map_ptr, so that check_map_access()
                 * can check 'value_size' boundary of memory access
                 * to map element returned from bpf_map_lookup_elem()
                 */
                if (meta.map_ptr == NULL) {
-                       verbose("kernel subsystem misconfigured verifier\n");
+                       verbose(env,
+                               "kernel subsystem misconfigured verifier\n");
                        return -EINVAL;
                }
                regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1775,12 +1620,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
                else if (insn_aux->map_ptr != meta.map_ptr)
                        insn_aux->map_ptr = BPF_MAP_PTR_POISON;
        } else {
-               verbose("unknown return type %d of func %s#%d\n",
+               verbose(env, "unknown return type %d of func %s#%d\n",
                        fn->ret_type, func_id_name(func_id), func_id);
                return -EINVAL;
        }
 
-       err = check_map_func_compatibility(meta.map_ptr, func_id);
+       err = check_map_func_compatibility(env, meta.map_ptr, func_id);
        if (err)
                return err;
 
@@ -1839,39 +1684,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
        dst_reg = &regs[dst];
 
        if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: known but bad sbounds\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env,
+                       "verifier internal error: known but bad sbounds\n");
                return -EINVAL;
        }
        if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: known but bad ubounds\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env,
+                       "verifier internal error: known but bad ubounds\n");
                return -EINVAL;
        }
 
        if (BPF_CLASS(insn->code) != BPF_ALU64) {
                /* 32-bit ALU ops on pointers produce (meaningless) scalars */
                if (!env->allow_ptr_leaks)
-                       verbose("R%d 32-bit pointer arithmetic prohibited\n",
+                       verbose(env,
+                               "R%d 32-bit pointer arithmetic prohibited\n",
                                dst);
                return -EACCES;
        }
 
        if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+                       verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == CONST_PTR_TO_MAP) {
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+                       verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
                                dst);
                return -EACCES;
        }
        if (ptr_reg->type == PTR_TO_PACKET_END) {
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+                       verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
                                dst);
                return -EACCES;
        }
@@ -1936,7 +1784,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                if (dst_reg == off_reg) {
                        /* scalar -= pointer.  Creates an unknown scalar */
                        if (!env->allow_ptr_leaks)
-                               verbose("R%d tried to subtract pointer from scalar\n",
+                               verbose(env, "R%d tried to subtract pointer from scalar\n",
                                        dst);
                        return -EACCES;
                }
@@ -1946,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                 */
                if (ptr_reg->type == PTR_TO_STACK) {
                        if (!env->allow_ptr_leaks)
-                               verbose("R%d subtraction from stack pointer prohibited\n",
+                               verbose(env, "R%d subtraction from stack pointer prohibited\n",
                                        dst);
                        return -EACCES;
                }
@@ -2001,13 +1849,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                 * ptr &= ~3 which would reduce min_value by 3.)
                 */
                if (!env->allow_ptr_leaks)
-                       verbose("R%d bitwise operator %s on pointer prohibited\n",
+                       verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
                                dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        default:
                /* other operators (e.g. MUL,LSH) produce non-pointer results */
                if (!env->allow_ptr_leaks)
-                       verbose("R%d pointer arithmetic with %s operator prohibited\n",
+                       verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
                                dst, bpf_alu_string[opcode >> 4]);
                return -EACCES;
        }
@@ -2173,7 +2021,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                        /* Shifts greater than 63 are undefined.  This includes
                         * shifts by a negative number.
                         */
-                       mark_reg_unknown(regs, insn->dst_reg);
+                       mark_reg_unknown(env, regs, insn->dst_reg);
                        break;
                }
                /* We lose all sign bit information (except what we can pick
@@ -2201,7 +2049,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                        /* Shifts greater than 63 are undefined.  This includes
                         * shifts by a negative number.
                         */
-                       mark_reg_unknown(regs, insn->dst_reg);
+                       mark_reg_unknown(env, regs, insn->dst_reg);
                        break;
                }
                /* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2229,7 +2077,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
                __update_reg_bounds(dst_reg);
                break;
        default:
-               mark_reg_unknown(regs, insn->dst_reg);
+               mark_reg_unknown(env, regs, insn->dst_reg);
                break;
        }
 
@@ -2261,12 +2109,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
                                 * an arbitrary scalar.
                                 */
                                if (!env->allow_ptr_leaks) {
-                                       verbose("R%d pointer %s pointer prohibited\n",
+                                       verbose(env, "R%d pointer %s pointer prohibited\n",
                                                insn->dst_reg,
                                                bpf_alu_string[opcode >> 4]);
                                        return -EACCES;
                                }
-                               mark_reg_unknown(regs, insn->dst_reg);
+                               mark_reg_unknown(env, regs, insn->dst_reg);
                                return 0;
                        } else {
                                /* scalar += pointer
@@ -2318,13 +2166,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
        /* Got here implies adding two SCALAR_VALUEs */
        if (WARN_ON_ONCE(ptr_reg)) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: unexpected ptr_reg\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env, "verifier internal error: unexpected ptr_reg\n");
                return -EINVAL;
        }
        if (WARN_ON(!src_reg)) {
-               print_verifier_state(&env->cur_state);
-               verbose("verifier internal error: no src_reg\n");
+               print_verifier_state(env, &env->cur_state);
+               verbose(env, "verifier internal error: no src_reg\n");
                return -EINVAL;
        }
        return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2342,14 +2190,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        if (BPF_SRC(insn->code) != 0 ||
                            insn->src_reg != BPF_REG_0 ||
                            insn->off != 0 || insn->imm != 0) {
-                               verbose("BPF_NEG uses reserved fields\n");
+                               verbose(env, "BPF_NEG uses reserved fields\n");
                                return -EINVAL;
                        }
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
                            (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
                            BPF_CLASS(insn->code) == BPF_ALU64) {
-                               verbose("BPF_END uses reserved fields\n");
+                               verbose(env, "BPF_END uses reserved fields\n");
                                return -EINVAL;
                        }
                }
@@ -2360,7 +2208,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        return err;
 
                if (is_pointer_value(env, insn->dst_reg)) {
-                       verbose("R%d pointer arithmetic prohibited\n",
+                       verbose(env, "R%d pointer arithmetic prohibited\n",
                                insn->dst_reg);
                        return -EACCES;
                }
@@ -2374,7 +2222,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
                if (BPF_SRC(insn->code) == BPF_X) {
                        if (insn->imm != 0 || insn->off != 0) {
-                               verbose("BPF_MOV uses reserved fields\n");
+                               verbose(env, "BPF_MOV uses reserved fields\n");
                                return -EINVAL;
                        }
 
@@ -2384,7 +2232,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                return err;
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-                               verbose("BPF_MOV uses reserved fields\n");
+                               verbose(env, "BPF_MOV uses reserved fields\n");
                                return -EINVAL;
                        }
                }
@@ -2400,14 +2248,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                 * copy register state to dest reg
                                 */
                                regs[insn->dst_reg] = regs[insn->src_reg];
+                               regs[insn->dst_reg].live |= REG_LIVE_WRITTEN;
                        } else {
                                /* R1 = (u32) R2 */
                                if (is_pointer_value(env, insn->src_reg)) {
-                                       verbose("R%d partial copy of pointer\n",
+                                       verbose(env,
+                                               "R%d partial copy of pointer\n",
                                                insn->src_reg);
                                        return -EACCES;
                                }
-                               mark_reg_unknown(regs, insn->dst_reg);
+                               mark_reg_unknown(env, regs, insn->dst_reg);
                                /* high 32 bits are known zero. */
                                regs[insn->dst_reg].var_off = tnum_cast(
                                                regs[insn->dst_reg].var_off, 4);
@@ -2422,14 +2272,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                }
 
        } else if (opcode > BPF_END) {
-               verbose("invalid BPF_ALU opcode %x\n", opcode);
+               verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
                return -EINVAL;
 
        } else {        /* all other ALU ops: and, sub, xor, add, ... */
 
                if (BPF_SRC(insn->code) == BPF_X) {
                        if (insn->imm != 0 || insn->off != 0) {
-                               verbose("BPF_ALU uses reserved fields\n");
+                               verbose(env, "BPF_ALU uses reserved fields\n");
                                return -EINVAL;
                        }
                        /* check src1 operand */
@@ -2438,7 +2288,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                return err;
                } else {
                        if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-                               verbose("BPF_ALU uses reserved fields\n");
+                               verbose(env, "BPF_ALU uses reserved fields\n");
                                return -EINVAL;
                        }
                }
@@ -2450,7 +2300,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
                if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
                    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
-                       verbose("div by zero\n");
+                       verbose(env, "div by zero\n");
                        return -EINVAL;
                }
 
@@ -2459,7 +2309,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                        int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
 
                        if (insn->imm < 0 || insn->imm >= size) {
-                               verbose("invalid shift %d\n", insn->imm);
+                               verbose(env, "invalid shift %d\n", insn->imm);
                                return -EINVAL;
                        }
                }
@@ -2812,13 +2662,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
        int err;
 
        if (opcode > BPF_JSLE) {
-               verbose("invalid BPF_JMP opcode %x\n", opcode);
+               verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
                return -EINVAL;
        }
 
        if (BPF_SRC(insn->code) == BPF_X) {
                if (insn->imm != 0) {
-                       verbose("BPF_JMP uses reserved fields\n");
+                       verbose(env, "BPF_JMP uses reserved fields\n");
                        return -EINVAL;
                }
 
@@ -2828,13 +2678,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
                        return err;
 
                if (is_pointer_value(env, insn->src_reg)) {
-                       verbose("R%d pointer comparison prohibited\n",
+                       verbose(env, "R%d pointer comparison prohibited\n",
                                insn->src_reg);
                        return -EACCES;
                }
        } else {
                if (insn->src_reg != BPF_REG_0) {
-                       verbose("BPF_JMP uses reserved fields\n");
+                       verbose(env, "BPF_JMP uses reserved fields\n");
                        return -EINVAL;
                }
        }
@@ -2946,11 +2796,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
                find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
                                       PTR_TO_PACKET_META);
        } else if (is_pointer_value(env, insn->dst_reg)) {
-               verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+               verbose(env, "R%d pointer comparison prohibited\n",
+                       insn->dst_reg);
                return -EACCES;
        }
-       if (log_level)
-               print_verifier_state(this_branch);
+       if (env->log.level)
+               print_verifier_state(env, this_branch);
        return 0;
 }
 
@@ -2969,11 +2820,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
        int err;
 
        if (BPF_SIZE(insn->code) != BPF_DW) {
-               verbose("invalid BPF_LD_IMM insn\n");
+               verbose(env, "invalid BPF_LD_IMM insn\n");
                return -EINVAL;
        }
        if (insn->off != 0) {
-               verbose("BPF_LD_IMM64 uses reserved fields\n");
+               verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
                return -EINVAL;
        }
 
@@ -3031,14 +2882,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
        int i, err;
 
        if (!may_access_skb(env->prog->type)) {
-               verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+               verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
                return -EINVAL;
        }
 
        if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
            BPF_SIZE(insn->code) == BPF_DW ||
            (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-               verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+               verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
                return -EINVAL;
        }
 
@@ -3048,7 +2899,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
                return err;
 
        if (regs[BPF_REG_6].type != PTR_TO_CTX) {
-               verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+               verbose(env,
+                       "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
                return -EINVAL;
        }
 
@@ -3061,7 +2913,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
        /* reset caller saved regs to unreadable */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
-               mark_reg_not_init(regs, caller_saved[i]);
+               mark_reg_not_init(env, regs, caller_saved[i]);
                check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
        }
 
@@ -3069,7 +2921,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
         * the value fetched from the packet.
         * Already marked as written above.
         */
-       mark_reg_unknown(regs, BPF_REG_0);
+       mark_reg_unknown(env, regs, BPF_REG_0);
        return 0;
 }
 
@@ -3089,22 +2941,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 
        reg = &env->cur_state.regs[BPF_REG_0];
        if (reg->type != SCALAR_VALUE) {
-               verbose("At program exit the register R0 is not a known value (%s)\n",
+               verbose(env, "At program exit the register R0 is not a known value (%s)\n",
                        reg_type_str[reg->type]);
                return -EINVAL;
        }
 
        if (!tnum_in(range, reg->var_off)) {
-               verbose("At program exit the register R0 ");
+               verbose(env, "At program exit the register R0 ");
                if (!tnum_is_unknown(reg->var_off)) {
                        char tn_buf[48];
 
                        tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-                       verbose("has value %s", tn_buf);
+                       verbose(env, "has value %s", tn_buf);
                } else {
-                       verbose("has unknown scalar value");
+                       verbose(env, "has unknown scalar value");
                }
-               verbose(" should have been 0 or 1\n");
+               verbose(env, " should have been 0 or 1\n");
                return -EINVAL;
        }
        return 0;
@@ -3170,7 +3022,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
                return 0;
 
        if (w < 0 || w >= env->prog->len) {
-               verbose("jump out of range from insn %d to %d\n", t, w);
+               verbose(env, "jump out of range from insn %d to %d\n", t, w);
                return -EINVAL;
        }
 
@@ -3187,13 +3039,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
                insn_stack[cur_stack++] = w;
                return 1;
        } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-               verbose("back-edge from insn %d to %d\n", t, w);
+               verbose(env, "back-edge from insn %d to %d\n", t, w);
                return -EINVAL;
        } else if (insn_state[w] == EXPLORED) {
                /* forward- or cross-edge */
                insn_state[t] = DISCOVERED | e;
        } else {
-               verbose("insn state internal bug\n");
+               verbose(env, "insn state internal bug\n");
                return -EFAULT;
        }
        return 0;
@@ -3287,7 +3139,7 @@ peek_stack:
 mark_explored:
        insn_state[t] = EXPLORED;
        if (cur_stack-- <= 0) {
-               verbose("pop stack internal bug\n");
+               verbose(env, "pop stack internal bug\n");
                ret = -EFAULT;
                goto err_free;
        }
@@ -3296,7 +3148,7 @@ mark_explored:
 check_state:
        for (i = 0; i < insn_cnt; i++) {
                if (insn_state[i] != EXPLORED) {
-                       verbose("unreachable insn %d\n", i);
+                       verbose(env, "unreachable insn %d\n", i);
                        ret = -EINVAL;
                        goto err_free;
                }
@@ -3677,7 +3529,7 @@ static int do_check(struct bpf_verifier_env *env)
        int insn_processed = 0;
        bool do_print_state = false;
 
-       init_reg_state(regs);
+       init_reg_state(env, regs);
        state->parent = NULL;
        insn_idx = 0;
        for (;;) {
@@ -3686,7 +3538,7 @@ static int do_check(struct bpf_verifier_env *env)
                int err;
 
                if (insn_idx >= insn_cnt) {
-                       verbose("invalid insn idx %d insn_cnt %d\n",
+                       verbose(env, "invalid insn idx %d insn_cnt %d\n",
                                insn_idx, insn_cnt);
                        return -EFAULT;
                }
@@ -3695,7 +3547,8 @@ static int do_check(struct bpf_verifier_env *env)
                class = BPF_CLASS(insn->code);
 
                if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
-                       verbose("BPF program is too large. Processed %d insn\n",
+                       verbose(env,
+                               "BPF program is too large. Processed %d insn\n",
                                insn_processed);
                        return -E2BIG;
                }
@@ -3705,12 +3558,12 @@ static int do_check(struct bpf_verifier_env *env)
                        return err;
                if (err == 1) {
                        /* found equivalent state, can prune the search */
-                       if (log_level) {
+                       if (env->log.level) {
                                if (do_print_state)
-                                       verbose("\nfrom %d to %d: safe\n",
+                                       verbose(env, "\nfrom %d to %d: safe\n",
                                                prev_insn_idx, insn_idx);
                                else
-                                       verbose("%d: safe\n", insn_idx);
+                                       verbose(env, "%d: safe\n", insn_idx);
                        }
                        goto process_bpf_exit;
                }
@@ -3718,19 +3571,20 @@ static int do_check(struct bpf_verifier_env *env)
                if (need_resched())
                        cond_resched();
 
-               if (log_level > 1 || (log_level && do_print_state)) {
-                       if (log_level > 1)
-                               verbose("%d:", insn_idx);
+               if (env->log.level > 1 || (env->log.level && do_print_state)) {
+                       if (env->log.level > 1)
+                               verbose(env, "%d:", insn_idx);
                        else
-                               verbose("\nfrom %d to %d:",
+                               verbose(env, "\nfrom %d to %d:",
                                        prev_insn_idx, insn_idx);
-                       print_verifier_state(&env->cur_state);
+                       print_verifier_state(env, &env->cur_state);
                        do_print_state = false;
                }
 
-               if (log_level) {
-                       verbose("%d: ", insn_idx);
-                       print_bpf_insn(env, insn);
+               if (env->log.level) {
+                       verbose(env, "%d: ", insn_idx);
+                       print_bpf_insn(verbose, env, insn,
+                                      env->allow_ptr_leaks);
                }
 
                err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
@@ -3786,7 +3640,7 @@ static int do_check(struct bpf_verifier_env *env)
                                 * src_reg == stack|map in some other branch.
                                 * Reject it.
                                 */
-                               verbose("same insn cannot be used with different pointers\n");
+                               verbose(env, "same insn cannot be used with different pointers\n");
                                return -EINVAL;
                        }
 
@@ -3826,14 +3680,14 @@ static int do_check(struct bpf_verifier_env *env)
                        } else if (dst_reg_type != *prev_dst_type &&
                                   (dst_reg_type == PTR_TO_CTX ||
                                    *prev_dst_type == PTR_TO_CTX)) {
-                               verbose("same insn cannot be used with different pointers\n");
+                               verbose(env, "same insn cannot be used with different pointers\n");
                                return -EINVAL;
                        }
 
                } else if (class == BPF_ST) {
                        if (BPF_MODE(insn->code) != BPF_MEM ||
                            insn->src_reg != BPF_REG_0) {
-                               verbose("BPF_ST uses reserved fields\n");
+                               verbose(env, "BPF_ST uses reserved fields\n");
                                return -EINVAL;
                        }
                        /* check src operand */
@@ -3856,7 +3710,7 @@ static int do_check(struct bpf_verifier_env *env)
                                    insn->off != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0) {
-                                       verbose("BPF_CALL uses reserved fields\n");
+                                       verbose(env, "BPF_CALL uses reserved fields\n");
                                        return -EINVAL;
                                }
 
@@ -3869,7 +3723,7 @@ static int do_check(struct bpf_verifier_env *env)
                                    insn->imm != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0) {
-                                       verbose("BPF_JA uses reserved fields\n");
+                                       verbose(env, "BPF_JA uses reserved fields\n");
                                        return -EINVAL;
                                }
 
@@ -3881,7 +3735,7 @@ static int do_check(struct bpf_verifier_env *env)
                                    insn->imm != 0 ||
                                    insn->src_reg != BPF_REG_0 ||
                                    insn->dst_reg != BPF_REG_0) {
-                                       verbose("BPF_EXIT uses reserved fields\n");
+                                       verbose(env, "BPF_EXIT uses reserved fields\n");
                                        return -EINVAL;
                                }
 
@@ -3896,7 +3750,7 @@ static int do_check(struct bpf_verifier_env *env)
                                        return err;
 
                                if (is_pointer_value(env, BPF_REG_0)) {
-                                       verbose("R0 leaks addr as return value\n");
+                                       verbose(env, "R0 leaks addr as return value\n");
                                        return -EACCES;
                                }
 
@@ -3931,19 +3785,19 @@ process_bpf_exit:
 
                                insn_idx++;
                        } else {
-                               verbose("invalid BPF_LD mode\n");
+                               verbose(env, "invalid BPF_LD mode\n");
                                return -EINVAL;
                        }
                } else {
-                       verbose("unknown insn class %d\n", class);
+                       verbose(env, "unknown insn class %d\n", class);
                        return -EINVAL;
                }
 
                insn_idx++;
        }
 
-       verbose("processed %d insns, stack depth %d\n",
-               insn_processed, env->prog->aux->stack_depth);
+       verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+               env->prog->aux->stack_depth);
        return 0;
 }
 
@@ -3955,7 +3809,8 @@ static int check_map_prealloc(struct bpf_map *map)
                !(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+                                       struct bpf_map *map,
                                        struct bpf_prog *prog)
 
 {
@@ -3966,12 +3821,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
         */
        if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
                if (!check_map_prealloc(map)) {
-                       verbose("perf_event programs can only use preallocated hash map\n");
+                       verbose(env, "perf_event programs can only use preallocated hash map\n");
                        return -EINVAL;
                }
                if (map->inner_map_meta &&
                    !check_map_prealloc(map->inner_map_meta)) {
-                       verbose("perf_event programs can only use preallocated inner hash map\n");
+                       verbose(env, "perf_event programs can only use preallocated inner hash map\n");
                        return -EINVAL;
                }
        }
@@ -3994,14 +3849,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
        for (i = 0; i < insn_cnt; i++, insn++) {
                if (BPF_CLASS(insn->code) == BPF_LDX &&
                    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
-                       verbose("BPF_LDX uses reserved fields\n");
+                       verbose(env, "BPF_LDX uses reserved fields\n");
                        return -EINVAL;
                }
 
                if (BPF_CLASS(insn->code) == BPF_STX &&
                    ((BPF_MODE(insn->code) != BPF_MEM &&
                      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
-                       verbose("BPF_STX uses reserved fields\n");
+                       verbose(env, "BPF_STX uses reserved fields\n");
                        return -EINVAL;
                }
 
@@ -4012,7 +3867,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
                        if (i == insn_cnt - 1 || insn[1].code != 0 ||
                            insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
                            insn[1].off != 0) {
-                               verbose("invalid bpf_ld_imm64 insn\n");
+                               verbose(env, "invalid bpf_ld_imm64 insn\n");
                                return -EINVAL;
                        }
 
@@ -4021,19 +3876,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
                                goto next_insn;
 
                        if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
-                               verbose("unrecognized bpf_ld_imm64 insn\n");
+                               verbose(env,
+                                       "unrecognized bpf_ld_imm64 insn\n");
                                return -EINVAL;
                        }
 
                        f = fdget(insn->imm);
                        map = __bpf_map_get(f);
                        if (IS_ERR(map)) {
-                               verbose("fd %d is not pointing to valid bpf_map\n",
+                               verbose(env, "fd %d is not pointing to valid bpf_map\n",
                                        insn->imm);
                                return PTR_ERR(map);
                        }
 
-                       err = check_map_prog_compatibility(map, env->prog);
+                       err = check_map_prog_compatibility(env, map, env->prog);
                        if (err) {
                                fdput(f);
                                return err;
@@ -4155,7 +4011,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
                                        env->prog);
                if (cnt >= ARRAY_SIZE(insn_buf)) {
-                       verbose("bpf verifier is misconfigured\n");
+                       verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                } else if (cnt) {
                        new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4203,7 +4059,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                        u8 size_code;
 
                        if (type == BPF_WRITE) {
-                               verbose("bpf verifier narrow ctx access misconfigured\n");
+                               verbose(env, "bpf verifier narrow ctx access misconfigured\n");
                                return -EINVAL;
                        }
 
@@ -4222,7 +4078,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
                                              &target_size);
                if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
                    (ctx_field_size && !target_size)) {
-                       verbose("bpf verifier is misconfigured\n");
+                       verbose(env, "bpf verifier is misconfigured\n");
                        return -EINVAL;
                }
 
@@ -4304,7 +4160,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 
                        cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
                        if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-                               verbose("bpf verifier is misconfigured\n");
+                               verbose(env, "bpf verifier is misconfigured\n");
                                return -EINVAL;
                        }
 
@@ -4348,7 +4204,8 @@ patch_call_imm:
                 * programs to call them, must be real in-kernel functions
                 */
                if (!fn->func) {
-                       verbose("kernel subsystem misconfigured func %s#%d\n",
+                       verbose(env,
+                               "kernel subsystem misconfigured func %s#%d\n",
                                func_id_name(insn->imm), insn->imm);
                        return -EFAULT;
                }
@@ -4382,8 +4239,8 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-       char __user *log_ubuf = NULL;
        struct bpf_verifier_env *env;
+       struct bpf_verifer_log *log;
        int ret = -EINVAL;
 
        /* 'struct bpf_verifier_env' can be global, but since it's not small,
@@ -4392,6 +4249,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
        env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;
+       log = &env->log;
 
        env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
                                     (*prog)->len);
@@ -4407,23 +4265,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
                /* user requested verbose verifier output
                 * and supplied buffer to store the verification trace
                 */
-               log_level = attr->log_level;
-               log_ubuf = (char __user *) (unsigned long) attr->log_buf;
-               log_size = attr->log_size;
-               log_len = 0;
+               log->level = attr->log_level;
+               log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+               log->len_total = attr->log_size;
 
                ret = -EINVAL;
-               /* log_* values have to be sane */
-               if (log_size < 128 || log_size > UINT_MAX >> 8 ||
-                   log_level == 0 || log_ubuf == NULL)
-                       goto err_unlock;
-
-               ret = -ENOMEM;
-               log_buf = vmalloc(log_size);
-               if (!log_buf)
+               /* log attributes have to be sane */
+               if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+                   !log->level || !log->ubuf)
                        goto err_unlock;
-       } else {
-               log_level = 0;
        }
 
        env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4460,17 +4310,11 @@ skip_full_check:
        if (ret == 0)
                ret = fixup_bpf_calls(env);
 
-       if (log_level && log_len >= log_size - 1) {
-               BUG_ON(log_len >= log_size);
-               /* verifier log exceeded user supplied buffer */
+       if (log->level && bpf_verifier_log_full(log))
                ret = -ENOSPC;
-               /* fall through to return what was recorded */
-       }
-
-       /* copy verifier log back to user space including trailing zero */
-       if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+       if (log->level && !log->ubuf) {
                ret = -EFAULT;
-               goto free_log_buf;
+               goto err_release_maps;
        }
 
        if (ret == 0 && env->used_map_cnt) {
@@ -4481,7 +4325,7 @@ skip_full_check:
 
                if (!env->prog->aux->used_maps) {
                        ret = -ENOMEM;
-                       goto free_log_buf;
+                       goto err_release_maps;
                }
 
                memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4494,9 +4338,7 @@ skip_full_check:
                convert_pseudo_ld_imm64(env);
        }
 
-free_log_buf:
-       if (log_level)
-               vfree(log_buf);
+err_release_maps:
        if (!env->prog->aux->used_maps)
                /* if we didn't copy map pointers into bpf_prog_info, release
                 * them now. Otherwise free_bpf_prog_info() will release them.
@@ -4533,8 +4375,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
        /* grab the mutex to protect few globals used by verifier */
        mutex_lock(&bpf_verifier_lock);
 
-       log_level = 0;
-
        env->strict_alignment = false;
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                env->strict_alignment = true;
index 8de11a2..d851df2 100644
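
The verifier hunks above thread the verifier environment through every verbose() call, so the log state (level, user buffer, total length) lives in env->log instead of file-scope globals, and bpf_check() no longer vmallocs a staging buffer for the whole trace. A rough, hedged sketch of what such an env-scoped writer can look like; field names beyond the level/ubuf/len_total visible in this diff (e.g. kbuf, len_used) are assumptions, not necessarily the kernel's actual layout:

struct bpf_verifer_log {
	u32 level;
	char kbuf[1024];	/* assumed small staging buffer */
	char __user *ubuf;	/* user buffer supplied via bpf_attr */
	u32 len_used;		/* assumed running length counter */
	u32 len_total;
};

static bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
{
	/* leave room for the trailing NUL copied below */
	return log->len_used >= log->len_total - 1;
}

__printf(2, 3) static void verbose(struct bpf_verifier_env *env,
				   const char *fmt, ...)
{
	struct bpf_verifer_log *log = &env->log;
	unsigned int n;
	va_list args;

	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
		return;

	va_start(args, fmt);
	n = vscnprintf(log->kbuf, sizeof(log->kbuf), fmt, args);
	va_end(args);

	/* Stream each chunk (plus NUL) straight to the user buffer; on a
	 * failed copy, clear ubuf so bpf_check() can return -EFAULT, which
	 * is what the "log->level && !log->ubuf" check near the end relies on.
	 */
	if (copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
		log->ubuf = NULL;
	else
		log->len_used += n;
}
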
@@ -24,6 +24,7 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
@@ -897,6 +898,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 out:
        cpus_write_unlock();
+       /*
+        * Do post unplug cleanup. This is still protected against
+        * concurrent CPU hotplug via cpu_add_remove_lock.
+        */
+       lockup_detector_cleanup();
        return ret;
 }
 
index 6bc21e2..902149f 100644
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+                         u64 *enabled, u64 *running)
 {
        unsigned long flags;
        int ret = 0;
+       u64 now;
 
        /*
         * Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
                goto out;
        }
 
+       now = event->shadow_ctx_time + perf_clock();
+       if (enabled)
+               *enabled = now - event->tstamp_enabled;
        /*
         * If the event is currently on this CPU, its either a per-task event,
         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
         * oncpu == -1).
         */
-       if (event->oncpu == smp_processor_id())
+       if (event->oncpu == smp_processor_id()) {
                event->pmu->read(event);
+               if (running)
+                       *running = now - event->tstamp_running;
+       } else if (running) {
+               *running = event->total_time_running;
+       }
 
        *value = local64_read(&event->count);
 out:
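
perf_event_read_local() now takes optional enabled/running output pointers in addition to the counter value; either may be NULL when the caller only wants the raw count, which is what bpf_perf_event_read() does further down in this diff. A minimal illustrative caller (the function name and error handling here are assumptions, shown only to make the new calling convention concrete):

static int example_read_local(struct perf_event *event, u64 *scaled)
{
	u64 value, enabled, running;
	int err;

	/* Ask for the counter plus its enabled/running times. */
	err = perf_event_read_local(event, &value, &enabled, &running);
	if (err)
		return err;

	/* Scale for time the event was not actually on the PMU
	 * (multiplexing); running is 0 only if the event never ran.
	 */
	*scaled = running ? div64_u64(value * enabled, running) : value;
	return 0;
}
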
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .regs = regs,
+               .event = event,
        };
        int ret = 0;
 
index 1d71c05..5043e74 100644
@@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
  * by the client, but only by calling this function.
  * This function can only be called on a registered smp_hotplug_thread.
  */
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-                                        const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+                                         const struct cpumask *new)
 {
        struct cpumask *old = plug_thread->cpumask;
-       cpumask_var_t tmp;
+       static struct cpumask tmp;
        unsigned int cpu;
 
-       if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
-               return -ENOMEM;
-
-       get_online_cpus();
+       lockdep_assert_cpus_held();
        mutex_lock(&smpboot_threads_lock);
 
        /* Park threads that were exclusively enabled on the old mask. */
-       cpumask_andnot(tmp, old, new);
-       for_each_cpu_and(cpu, tmp, cpu_online_mask)
+       cpumask_andnot(&tmp, old, new);
+       for_each_cpu_and(cpu, &tmp, cpu_online_mask)
                smpboot_park_thread(plug_thread, cpu);
 
        /* Unpark threads that are exclusively enabled on the new mask. */
-       cpumask_andnot(tmp, new, old);
-       for_each_cpu_and(cpu, tmp, cpu_online_mask)
+       cpumask_andnot(&tmp, new, old);
+       for_each_cpu_and(cpu, &tmp, cpu_online_mask)
                smpboot_unpark_thread(plug_thread, cpu);
 
        cpumask_copy(old, new);
 
        mutex_unlock(&smpboot_threads_lock);
-       put_online_cpus();
-
-       free_cpumask_var(tmp);
-
-       return 0;
 }
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
 
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
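
smpboot_update_cpumask_percpu_thread() no longer takes the hotplug lock or allocates a temporary mask itself; it now asserts lockdep_assert_cpus_held() and returns void, so callers must hold the CPU hotplug lock across the update. A hedged sketch of the expected caller pattern (the wrapper name is illustrative only):

/* Illustrative wrapper; the locking requirement is the point here. */
static void example_set_thread_mask(struct smp_hotplug_thread *thread,
				    const struct cpumask *new_mask)
{
	cpus_read_lock();	/* satisfies lockdep_assert_cpus_held() */
	smpboot_update_cpumask_percpu_thread(thread, new_mask);
	cpus_read_unlock();
}
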
 
index 4da9e62..d9c31bc 100644
@@ -872,9 +872,9 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_LOCKUP_DETECTOR)
        {
                .procname       = "watchdog",
-               .data           = &watchdog_user_enabled,
-               .maxlen         = sizeof (int),
-               .mode           = 0644,
+               .data           = &watchdog_user_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
                .proc_handler   = proc_watchdog,
                .extra1         = &zero,
                .extra2         = &one,
@@ -890,16 +890,12 @@ static struct ctl_table kern_table[] = {
        },
        {
                .procname       = "nmi_watchdog",
-               .data           = &nmi_watchdog_enabled,
-               .maxlen         = sizeof (int),
-               .mode           = 0644,
+               .data           = &nmi_watchdog_user_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = NMI_WATCHDOG_SYSCTL_PERM,
                .proc_handler   = proc_nmi_watchdog,
                .extra1         = &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
                .extra2         = &one,
-#else
-               .extra2         = &zero,
-#endif
        },
        {
                .procname       = "watchdog_cpumask",
@@ -911,9 +907,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
        {
                .procname       = "soft_watchdog",
-               .data           = &soft_watchdog_enabled,
-               .maxlen         = sizeof (int),
-               .mode           = 0644,
+               .data           = &soft_watchdog_user_enabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
                .proc_handler   = proc_soft_watchdog,
                .extra1         = &zero,
                .extra2         = &one,
index dc498b6..04ea531 100644
@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
        return &bpf_trace_printk_proto;
 }
 
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+                    u64 *value, u64 *enabled, u64 *running)
 {
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        unsigned int cpu = smp_processor_id();
        u64 index = flags & BPF_F_INDEX_MASK;
        struct bpf_event_entry *ee;
-       u64 value = 0;
-       int err;
 
        if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
                return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
        if (!ee)
                return -ENOENT;
 
-       err = perf_event_read_local(ee->event, &value);
+       return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+       u64 value = 0;
+       int err;
+
+       err = get_map_perf_counter(map, flags, &value, NULL, NULL);
        /*
         * this api is ugly since we miss [-22..-2] range of valid
         * counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+          struct bpf_perf_event_value *, buf, u32, size)
+{
+       int err = -EINVAL;
+
+       if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+               goto clear;
+       err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+                                  &buf->running);
+       if (unlikely(err))
+               goto clear;
+       return 0;
+clear:
+       memset(buf, 0, size);
+       return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+       .func           = bpf_perf_event_read_value,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
+       .arg4_type      = ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
 
 static __always_inline u64
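
bpf_perf_event_read_value() is the map-based counterpart of bpf_perf_event_read(): instead of returning only the raw count, it fills a struct bpf_perf_event_value (counter, enabled, running) and zeroes the buffer on error. A hedged sample of how a kprobe program might use it; the map layout, section names and the helper wrapper declarations (samples/bpf bpf_helpers.h style) are assumptions:

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include "bpf_helpers.h"	/* assumed: SEC() and helper wrappers */

struct bpf_map_def SEC("maps") counters = {
	.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size	= sizeof(int),
	.value_size	= sizeof(__u32),
	.max_entries	= 64,	/* one slot per possible CPU; size assumed */
};

SEC("kprobe/sys_write")
int read_counter(struct pt_regs *ctx)
{
	struct bpf_perf_event_value buf = {};

	/* Read the counter in the current CPU's slot, together with its
	 * enabled/running times for multiplexing-aware scaling.
	 */
	if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
				      &buf, sizeof(buf)))
		return 0;

	/* buf.counter, buf.enabled and buf.running are now valid. */
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the helper is gpl_only */
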
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
                return &bpf_perf_event_output_proto;
        case BPF_FUNC_get_stackid:
                return &bpf_get_stackid_proto;
+       case BPF_FUNC_perf_event_read_value:
+               return &bpf_perf_event_read_value_proto;
        default:
                return tracing_func_proto(func_id);
        }
@@ -576,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
        .arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+          struct bpf_perf_event_value *, buf, u32, size)
+{
+       int err = -EINVAL;
+
+       if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+               goto clear;
+       err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+                                   &buf->running);
+       if (unlikely(err))
+               goto clear;
+       return 0;
+clear:
+       memset(buf, 0, size);
+       return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+         .func           = bpf_perf_prog_read_value_tp,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
        switch (func_id) {
@@ -583,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
                return &bpf_perf_event_output_proto_tp;
        case BPF_FUNC_get_stackid:
                return &bpf_get_stackid_proto_tp;
+       case BPF_FUNC_perf_prog_read_value:
+               return &bpf_perf_prog_read_value_proto_tp;
        default:
                return tracing_func_proto(func_id);
        }
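
bpf_perf_prog_read_value() (wired up via bpf_perf_prog_read_value_proto_tp above) does the same for the event that triggered the program itself, using the event pointer now stashed in bpf_perf_event_data_kern by bpf_overflow_handler(). A brief hedged sketch for a perf-event-attached program, which shares this helper set; section name and wrappers are again assumptions:

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include "bpf_helpers.h"	/* assumed wrappers, as above */

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
	struct bpf_perf_event_value buf = {};
	__u64 scaled;

	/* Read the triggering event's counter plus enabled/running times. */
	if (bpf_perf_prog_read_value(ctx, &buf, sizeof(buf)))
		return 0;

	/* Typical use: scale the count for time the event was multiplexed out. */
	scaled = buf.running ? buf.counter * buf.enabled / buf.running
			     : buf.counter;

	/* Placeholder use; a real program would stash 'scaled' in a map.
	 * Returning non-zero lets the event's original overflow handler run.
	 */
	return scaled != 0;
}

char _license[] SEC("license") = "GPL";
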
index f5d5202..6bcb854 100644
 #include <linux/kvm_para.h>
 #include <linux/kthread.h>
 
-/* Watchdog configuration */
-static DEFINE_MUTEX(watchdog_proc_mutex);
-
-int __read_mostly nmi_watchdog_enabled;
+static DEFINE_MUTEX(watchdog_mutex);
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
-                                               NMI_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT      (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT  1
 #else
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+# define WATCHDOG_DEFAULT      (SOFT_WATCHDOG_ENABLED)
+# define NMI_WATCHDOG_DEFAULT  0
 #endif
 
+unsigned long __read_mostly watchdog_enabled;
+int __read_mostly watchdog_user_enabled = 1;
+int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
+int __read_mostly soft_watchdog_user_enabled = 1;
+int __read_mostly watchdog_thresh = 10;
+int __read_mostly nmi_watchdog_available;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-/* boot commands */
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
  */
@@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic =
  * kernel command line parameters are parsed, because otherwise it is not
  * possible to override this in hardlockup_panic_setup().
  */
-void hardlockup_detector_disable(void)
+void __init hardlockup_detector_disable(void)
 {
-       watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+       nmi_watchdog_user_enabled = 0;
 }
 
 static int __init hardlockup_panic_setup(char *str)
@@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str)
        else if (!strncmp(str, "nopanic", 7))
                hardlockup_panic = 0;
        else if (!strncmp(str, "0", 1))
-               watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+               nmi_watchdog_user_enabled = 0;
        else if (!strncmp(str, "1", 1))
-               watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+               nmi_watchdog_user_enabled = 1;
        return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
 
-#endif
-
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-int __read_mostly soft_watchdog_enabled;
-#endif
-
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-#ifdef CONFIG_SMP
-int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+# ifdef CONFIG_SMP
 int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#endif
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
-/*
- * The 'watchdog_running' variable is set to 1 when the watchdog threads
- * are registered/started and is set to 0 when the watchdog threads are
- * unregistered/stopped, so it is an indicator whether the threads exist.
- */
-static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+       sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
+       return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
 /*
  * These functions can be overridden if an architecture implements its
@@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended;
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
+       hardlockup_detector_perf_enable();
        return 0;
 }
+
 void __weak watchdog_nmi_disable(unsigned int cpu)
 {
+       hardlockup_detector_perf_disable();
 }
 
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
- * - nmi_watchdog_enabled
+/* Return 0 if an NMI watchdog is available. Error code otherwise */
+int __weak __init watchdog_nmi_probe(void)
+{
+       return hardlockup_detector_perf_init();
+}
+
+/**
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
+ *
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
+ * update_variables();
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
+ *
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
+ * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
- * - watchdog_suspended
  */
-void __weak watchdog_nmi_reconfigure(void)
+void __weak watchdog_nmi_start(void) { }
+
+/**
+ * lockup_detector_update_enable - Update the sysctl enable bit
+ *
+ * Caller needs to make sure that the NMI/perf watchdogs are off, so this
+ * can't race with watchdog_nmi_disable().
+ */
+static void lockup_detector_update_enable(void)
 {
+       watchdog_enabled = 0;
+       if (!watchdog_user_enabled)
+               return;
+       if (nmi_watchdog_available && nmi_watchdog_user_enabled)
+               watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+       if (soft_watchdog_user_enabled)
+               watchdog_enabled |= SOFT_WATCHDOG_ENABLED;
 }
 
-
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
-       for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+/* Global variables, exported for sysctl */
+unsigned int __read_mostly softlockup_panic =
+                       CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
 
+static bool softlockup_threads_initialized __read_mostly;
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static unsigned long soft_lockup_nmi_warn;
 
-unsigned int __read_mostly softlockup_panic =
-                       CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-
 static int __init softlockup_panic_setup(char *str)
 {
        softlockup_panic = simple_strtoul(str, NULL, 0);
-
        return 1;
 }
 __setup("softlockup_panic=", softlockup_panic_setup);
 
 static int __init nowatchdog_setup(char *str)
 {
-       watchdog_enabled = 0;
+       watchdog_user_enabled = 0;
        return 1;
 }
 __setup("nowatchdog", nowatchdog_setup);
 
 static int __init nosoftlockup_setup(char *str)
 {
-       watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
+       soft_watchdog_user_enabled = 0;
        return 1;
 }
 __setup("nosoftlockup", nosoftlockup_setup);
 
 #ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+
 static int __init softlockup_all_cpu_backtrace_setup(char *str)
 {
-       sysctl_softlockup_all_cpu_backtrace =
-               !!simple_strtol(str, NULL, 0);
+       sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0);
        return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int __init hardlockup_all_cpu_backtrace_setup(char *str)
-{
-       sysctl_hardlockup_all_cpu_backtrace =
-               !!simple_strtol(str, NULL, 0);
-       return 1;
-}
-__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
-#endif
 #endif
 
+static void __lockup_detector_cleanup(void);
+
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
  * lockups can have false positives under extreme conditions. So we generally
@@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void)
        int cpu;
 
        /*
-        * this is done lockless
-        * do we care if a 0 races with a timestamp?
-        * all it means is the softlock check starts one cycle later
+        * watchdog_mutex cannot be taken here, as this might be called
+        * from (soft)interrupt context, so the access to
+        * watchdog_allowed_mask might race with a concurrent update.
+        *
+        * The watchdog time stamp can race against a concurrent real
+        * update as well, the only side effect might be a cycle delay for
+        * the softlockup check.
         */
-       for_each_watchdog_cpu(cpu)
+       for_each_cpu(cpu, &watchdog_allowed_mask)
                per_cpu(watchdog_touch_ts, cpu) = 0;
        wq_watchdog_touch(-1);
 }
@@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void)
        __this_cpu_inc(hrtimer_interrupts);
 }
 
-static int watchdog_enable_all_cpus(void);
-static void watchdog_disable_all_cpus(void);
-
 /* watchdog kicker functions */
 static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 {
@@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
        int duration;
        int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-       if (atomic_read(&watchdog_park_in_progress) != 0)
+       if (!watchdog_enabled)
                return HRTIMER_NORESTART;
 
        /* kick the hardlockup detector */
@@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
 
 static void watchdog_enable(unsigned int cpu)
 {
-       struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+       struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
-       /* kick off the timer for the hardlockup detector */
+       /*
+        * Start the timer first to prevent the NMI watchdog triggering
+        * before the timer has a chance to fire.
+        */
        hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hrtimer->function = watchdog_timer_fn;
-
-       /* Enable the perf event */
-       watchdog_nmi_enable(cpu);
-
-       /* done here because hrtimer_start can only pin to smp_processor_id() */
        hrtimer_start(hrtimer, ns_to_ktime(sample_period),
                      HRTIMER_MODE_REL_PINNED);
 
-       /* initialize timestamp */
-       watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+       /* Initialize timestamp */
        __touch_watchdog();
+       /* Enable the perf event */
+       if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
+               watchdog_nmi_enable(cpu);
+
+       watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
 }
 
 static void watchdog_disable(unsigned int cpu)
 {
-       struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+       struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
        watchdog_set_prio(SCHED_NORMAL, 0);
-       hrtimer_cancel(hrtimer);
-       /* disable the perf event */
+       /*
+        * Disable the perf event first, so that a large delay between
+        * disabling the timer and disabling the perf event cannot cause
+        * the perf NMI to detect a false positive.
+        */
        watchdog_nmi_disable(cpu);
+       hrtimer_cancel(hrtimer);
 }
 
 static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu)
        __this_cpu_write(soft_lockup_hrtimer_cnt,
                         __this_cpu_read(hrtimer_interrupts));
        __touch_watchdog();
-
-       /*
-        * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the
-        * failure path. Check for failures that can occur asynchronously -
-        * for example, when CPUs are on-lined - and shut down the hardware
-        * perf event on each CPU accordingly.
-        *
-        * The only non-obvious place this bit can be cleared is through
-        * watchdog_nmi_enable(), so a pr_info() is placed there.  Placing a
-        * pr_info here would be too noisy as it would result in a message
-        * every few seconds if the hardlockup was disabled but the softlockup
-        * enabled.
-        */
-       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-               watchdog_nmi_disable(cpu);
 }
 
 static struct smp_hotplug_thread watchdog_threads = {
@@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = {
        .unpark                 = watchdog_enable,
 };
 
-/*
- * park all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function returns an error if kthread_park() of a watchdog thread
- * fails. In this situation, the watchdog threads of some CPUs can already
- * be parked and the watchdog threads of other CPUs can still be runnable.
- * Callers are expected to handle this special condition as appropriate in
- * their context.
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static int watchdog_park_threads(void)
+static void softlockup_update_smpboot_threads(void)
 {
-       int cpu, ret = 0;
+       lockdep_assert_held(&watchdog_mutex);
 
-       atomic_set(&watchdog_park_in_progress, 1);
+       if (!softlockup_threads_initialized)
+               return;
 
-       for_each_watchdog_cpu(cpu) {
-               ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
-               if (ret)
-                       break;
-       }
-
-       atomic_set(&watchdog_park_in_progress, 0);
-
-       return ret;
+       smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+                                            &watchdog_allowed_mask);
 }
 
-/*
- * unpark all watchdog threads that are specified in 'watchdog_cpumask'
- *
- * This function may only be called in a context that is protected against
- * races with CPU hotplug - for example, via get_online_cpus().
- */
-static void watchdog_unpark_threads(void)
+/* Temporarily park all watchdog threads */
+static void softlockup_park_all_threads(void)
 {
-       int cpu;
-
-       for_each_watchdog_cpu(cpu)
-               kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+       cpumask_clear(&watchdog_allowed_mask);
+       softlockup_update_smpboot_threads();
 }
 
-static int update_watchdog_all_cpus(void)
+/* Unpark enabled threads */
+static void softlockup_unpark_threads(void)
 {
-       int ret;
-
-       ret = watchdog_park_threads();
-       if (ret)
-               return ret;
-
-       watchdog_unpark_threads();
-
-       return 0;
+       cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+       softlockup_update_smpboot_threads();
 }
 
-static int watchdog_enable_all_cpus(void)
+static void lockup_detector_reconfigure(void)
 {
-       int err = 0;
-
-       if (!watchdog_running) {
-               err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
-                                                            &watchdog_cpumask);
-               if (err)
-                       pr_err("Failed to create watchdog threads, disabled\n");
-               else
-                       watchdog_running = 1;
-       } else {
-               /*
-                * Enable/disable the lockup detectors or
-                * change the sample period 'on the fly'.
-                */
-               err = update_watchdog_all_cpus();
-
-               if (err) {
-                       watchdog_disable_all_cpus();
-                       pr_err("Failed to update lockup detectors, disabled\n");
-               }
-       }
-
-       if (err)
-               watchdog_enabled = 0;
-
-       return err;
+       cpus_read_lock();
+       watchdog_nmi_stop();
+       softlockup_park_all_threads();
+       set_sample_period();
+       lockup_detector_update_enable();
+       if (watchdog_enabled && watchdog_thresh)
+               softlockup_unpark_threads();
+       watchdog_nmi_start();
+       cpus_read_unlock();
+       /*
+        * Must be called outside the cpus locked section to prevent
+        * recursive locking in the perf code.
+        */
+       __lockup_detector_cleanup();
 }
 
-static void watchdog_disable_all_cpus(void)
+/*
+ * Create the watchdog thread infrastructure and configure the detector(s).
+ *
+ * The threads are not unparked as watchdog_allowed_mask is empty. When
+ * the threads are successfully initialized, take the proper locks and
+ * unpark the threads in the watchdog_cpumask if the watchdog is enabled.
+ */
+static __init void lockup_detector_setup(void)
 {
-       if (watchdog_running) {
-               watchdog_running = 0;
-               smpboot_unregister_percpu_thread(&watchdog_threads);
-       }
-}
+       int ret;
 
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
-{
-       return smpboot_update_cpumask_percpu_thread(
-                   &watchdog_threads, &watchdog_cpumask);
-}
-#endif
+       /*
+        * If sysctl is off and watchdog got disabled on the command line,
+        * nothing to do here.
+        */
+       lockup_detector_update_enable();
 
-#else /* SOFTLOCKUP */
-static int watchdog_park_threads(void)
-{
-       return 0;
-}
+       if (!IS_ENABLED(CONFIG_SYSCTL) &&
+           !(watchdog_enabled && watchdog_thresh))
+               return;
 
-static void watchdog_unpark_threads(void)
-{
-}
+       ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+                                                    &watchdog_allowed_mask);
+       if (ret) {
+               pr_err("Failed to initialize soft lockup detector threads\n");
+               return;
+       }
 
-static int watchdog_enable_all_cpus(void)
-{
-       return 0;
+       mutex_lock(&watchdog_mutex);
+       softlockup_threads_initialized = true;
+       lockup_detector_reconfigure();
+       mutex_unlock(&watchdog_mutex);
 }
 
-static void watchdog_disable_all_cpus(void)
+#else /* CONFIG_SOFTLOCKUP_DETECTOR */
+static inline int watchdog_park_threads(void) { return 0; }
+static inline void watchdog_unpark_threads(void) { }
+static inline int watchdog_enable_all_cpus(void) { return 0; }
+static inline void watchdog_disable_all_cpus(void) { }
+static void lockup_detector_reconfigure(void)
 {
+       cpus_read_lock();
+       watchdog_nmi_stop();
+       lockup_detector_update_enable();
+       watchdog_nmi_start();
+       cpus_read_unlock();
 }
-
-#ifdef CONFIG_SYSCTL
-static int watchdog_update_cpus(void)
+static inline void lockup_detector_setup(void)
 {
-       return 0;
+       lockup_detector_reconfigure();
 }
-#endif
+#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-static void set_sample_period(void)
+static void __lockup_detector_cleanup(void)
 {
+       lockdep_assert_held(&watchdog_mutex);
+       hardlockup_detector_perf_cleanup();
 }
-#endif /* SOFTLOCKUP */
 
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
  */
-int lockup_detector_suspend(void)
+void lockup_detector_cleanup(void)
 {
-       int ret = 0;
-
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
-       /*
-        * Multiple suspend requests can be active in parallel (counted by
-        * the 'watchdog_suspended' variable). If the watchdog threads are
-        * running, the first caller takes care that they will be parked.
-        * The state of 'watchdog_running' cannot change while a suspend
-        * request is active (see related code in 'proc' handlers).
-        */
-       if (watchdog_running && !watchdog_suspended)
-               ret = watchdog_park_threads();
-
-       if (ret == 0)
-               watchdog_suspended++;
-       else {
-               watchdog_disable_all_cpus();
-               pr_err("Failed to suspend lockup detectors, disabled\n");
-               watchdog_enabled = 0;
-       }
-
-       watchdog_nmi_reconfigure();
-
-       mutex_unlock(&watchdog_proc_mutex);
-
-       return ret;
+       mutex_lock(&watchdog_mutex);
+       __lockup_detector_cleanup();
+       mutex_unlock(&watchdog_mutex);
 }
 
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_power_off() function, which busy-loops forever.
  */
-void lockup_detector_resume(void)
+void lockup_detector_soft_poweroff(void)
 {
-       mutex_lock(&watchdog_proc_mutex);
-
-       watchdog_suspended--;
-       /*
-        * The watchdog threads are unparked if they were previously running
-        * and if there is no more active suspend request.
-        */
-       if (watchdog_running && !watchdog_suspended)
-               watchdog_unpark_threads();
-
-       watchdog_nmi_reconfigure();
-
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       watchdog_enabled = 0;
 }
 
 #ifdef CONFIG_SYSCTL
 
-/*
- * Update the run state of the lockup detectors.
- */
-static int proc_watchdog_update(void)
+/* Propagate any changes to the watchdog threads */
+static void proc_watchdog_update(void)
 {
-       int err = 0;
-
-       /*
-        * Watchdog threads won't be started if they are already active.
-        * The 'watchdog_running' variable in watchdog_*_all_cpus() takes
-        * care of this. If those threads are already active, the sample
-        * period will be updated and the lockup detectors will be enabled
-        * or disabled 'on the fly'.
-        */
-       if (watchdog_enabled && watchdog_thresh)
-               err = watchdog_enable_all_cpus();
-       else
-               watchdog_disable_all_cpus();
-
-       watchdog_nmi_reconfigure();
-
-       return err;
-
+       /* Remove impossible cpus to keep sysctl output clean. */
+       cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
+       lockup_detector_reconfigure();
 }
 
 /*
  * common function for watchdog, nmi_watchdog and soft_watchdog parameter
  *
- * caller             | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- *                    |                       | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller             | table->data points to      | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog      | watchdog_user_enabled      | NMI_WATCHDOG_ENABLED |
+ *                    |                            | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog  | nmi_watchdog_user_enabled  | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
  */
 static int proc_watchdog_common(int which, struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int err, old, new;
-       int *watchdog_param = (int *)table->data;
+       int err, old, *param = table->data;
 
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
+       mutex_lock(&watchdog_mutex);
 
-       if (watchdog_suspended) {
-               /* no parameter changes allowed while watchdog is suspended */
-               err = -EAGAIN;
-               goto out;
-       }
-
-       /*
-        * If the parameter is being read return the state of the corresponding
-        * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
-        * run state of the lockup detectors.
-        */
        if (!write) {
-               *watchdog_param = (watchdog_enabled & which) != 0;
+               /*
+                * On read synchronize the userspace interface. This is a
+                * racy snapshot.
+                */
+               *param = (watchdog_enabled & which) != 0;
                err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        } else {
+               old = READ_ONCE(*param);
                err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-               if (err)
-                       goto out;
-
-               /*
-                * There is a race window between fetching the current value
-                * from 'watchdog_enabled' and storing the new value. During
-                * this race window, watchdog_nmi_enable() can sneak in and
-                * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'.
-                * The 'cmpxchg' detects this race and the loop retries.
-                */
-               do {
-                       old = watchdog_enabled;
-                       /*
-                        * If the parameter value is not zero set the
-                        * corresponding bit(s), else clear it(them).
-                        */
-                       if (*watchdog_param)
-                               new = old | which;
-                       else
-                               new = old & ~which;
-               } while (cmpxchg(&watchdog_enabled, old, new) != old);
-
-               /*
-                * Update the run state of the lockup detectors. There is _no_
-                * need to check the value returned by proc_watchdog_update()
-                * and to restore the previous value of 'watchdog_enabled' as
-                * both lockup detectors are disabled if proc_watchdog_update()
-                * returns an error.
-                */
-               if (old == new)
-                       goto out;
-
-               err = proc_watchdog_update();
+               if (!err && old != READ_ONCE(*param))
+                       proc_watchdog_update();
        }
-out:
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       mutex_unlock(&watchdog_mutex);
        return err;
 }
 
@@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write,
 int proc_nmi_watchdog(struct ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
+       if (!nmi_watchdog_available && write)
+               return -ENOTSUPP;
        return proc_watchdog_common(NMI_WATCHDOG_ENABLED,
                                    table, write, buffer, lenp, ppos);
 }
@@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
 int proc_watchdog_thresh(struct ctl_table *table, int write,
                         void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-       int err, old, new;
-
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
+       int err, old;
 
-       if (watchdog_suspended) {
-               /* no parameter changes allowed while watchdog is suspended */
-               err = -EAGAIN;
-               goto out;
-       }
+       mutex_lock(&watchdog_mutex);
 
-       old = ACCESS_ONCE(watchdog_thresh);
+       old = READ_ONCE(watchdog_thresh);
        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
-       if (err || !write)
-               goto out;
-
-       /*
-        * Update the sample period. Restore on failure.
-        */
-       new = ACCESS_ONCE(watchdog_thresh);
-       if (old == new)
-               goto out;
+       if (!err && write && old != READ_ONCE(watchdog_thresh))
+               proc_watchdog_update();
 
-       set_sample_period();
-       err = proc_watchdog_update();
-       if (err) {
-               watchdog_thresh = old;
-               set_sample_period();
-       }
-out:
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       mutex_unlock(&watchdog_mutex);
        return err;
 }
 
@@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 {
        int err;
 
-       get_online_cpus();
-       mutex_lock(&watchdog_proc_mutex);
-
-       if (watchdog_suspended) {
-               /* no parameter changes allowed while watchdog is suspended */
-               err = -EAGAIN;
-               goto out;
-       }
+       mutex_lock(&watchdog_mutex);
 
        err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
-       if (!err && write) {
-               /* Remove impossible cpus to keep sysctl output cleaner. */
-               cpumask_and(&watchdog_cpumask, &watchdog_cpumask,
-                           cpu_possible_mask);
-
-               if (watchdog_running) {
-                       /*
-                        * Failure would be due to being unable to allocate
-                        * a temporary cpumask, so we are likely not in a
-                        * position to do much else to make things better.
-                        */
-                       if (watchdog_update_cpus() != 0)
-                               pr_err("cpumask update failed\n");
-               }
+       if (!err && write)
+               proc_watchdog_update();
 
-               watchdog_nmi_reconfigure();
-       }
-out:
-       mutex_unlock(&watchdog_proc_mutex);
-       put_online_cpus();
+       mutex_unlock(&watchdog_mutex);
        return err;
 }
-
 #endif /* CONFIG_SYSCTL */
 
 void __init lockup_detector_init(void)
 {
-       set_sample_period();
-
 #ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_enabled()) {
                pr_info("Disabling watchdog on nohz_full cores by default\n");
@@ -951,6 +783,7 @@ void __init lockup_detector_init(void)
        cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
 #endif
 
-       if (watchdog_enabled)
-               watchdog_enable_all_cpus();
+       if (!watchdog_nmi_probe())
+               nmi_watchdog_available = true;
+       lockup_detector_setup();
 }
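
The weak watchdog_nmi_probe()/watchdog_nmi_stop()/watchdog_nmi_start() hooks above define the reconfiguration contract for architectures that provide their own NMI watchdog. A rough, hypothetical arch-side sketch follows; the my_platform_*() functions are made-up placeholders, not an existing API.

/* Hypothetical arch-side sketch of the weak hooks documented above. */
#include <linux/nmi.h>

int __init watchdog_nmi_probe(void)
{
        /* Return 0 only if the platform NMI timer is actually usable. */
        return my_platform_nmi_init() ? -ENODEV : 0;
}

void watchdog_nmi_stop(void)
{
        /* Quiesce the NMI watchdog while the variables are updated. */
        my_platform_nmi_quiesce();
}

void watchdog_nmi_start(void)
{
        /* Re-arm from the now-consistent configuration. */
        if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
                my_platform_nmi_arm(watchdog_thresh, &watchdog_cpumask);
        else
                my_platform_nmi_quiesce();
}
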
index 3a09ea1..71a62ce 100644 (file)
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
+static unsigned int watchdog_cpus;
 
 void arch_touch_nmi_watchdog(void)
 {
@@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = {
 
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
-                struct perf_sample_data *data,
-                struct pt_regs *regs)
+                                      struct perf_sample_data *data,
+                                      struct pt_regs *regs)
 {
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;
 
-       if (atomic_read(&watchdog_park_in_progress) != 0)
-               return;
-
        if (__this_cpu_read(watchdog_nmi_touch) == true) {
                __this_cpu_write(watchdog_nmi_touch, false);
                return;
@@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event,
        return;
 }
 
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long firstcpu_err;
-static atomic_t watchdog_cpus;
-
-int watchdog_nmi_enable(unsigned int cpu)
+static int hardlockup_detector_event_create(void)
 {
+       unsigned int cpu = smp_processor_id();
        struct perf_event_attr *wd_attr;
-       struct perf_event *event = per_cpu(watchdog_ev, cpu);
-       int firstcpu = 0;
-
-       /* nothing to do if the hard lockup detector is disabled */
-       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
-               goto out;
-
-       /* is it already setup and enabled? */
-       if (event && event->state > PERF_EVENT_STATE_OFF)
-               goto out;
-
-       /* it is setup but not enabled */
-       if (event != NULL)
-               goto out_enable;
-
-       if (atomic_inc_return(&watchdog_cpus) == 1)
-               firstcpu = 1;
+       struct perf_event *evt;
 
        wd_attr = &wd_hw_attr;
        wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
 
        /* Try to register using hardware perf events */
-       event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+       evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+                                              watchdog_overflow_callback, NULL);
+       if (IS_ERR(evt)) {
+               pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+                       PTR_ERR(evt));
+               return PTR_ERR(evt);
+       }
+       this_cpu_write(watchdog_ev, evt);
+       return 0;
+}
 
-       /* save the first cpu's error for future comparision */
-       if (firstcpu && IS_ERR(event))
-               firstcpu_err = PTR_ERR(event);
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+       if (hardlockup_detector_event_create())
+               return;
 
-       if (!IS_ERR(event)) {
-               /* only print for the first cpu initialized */
-               if (firstcpu || firstcpu_err)
-                       pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
-               goto out_save;
-       }
+       if (!watchdog_cpus++)
+               pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
-       /*
-        * Disable the hard lockup detector if _any_ CPU fails to set up
-        * set up the hardware perf event. The watchdog() function checks
-        * the NMI_WATCHDOG_ENABLED bit periodically.
-        *
-        * The barriers are for syncing up watchdog_enabled across all the
-        * cpus, as clear_bit() does not use barriers.
-        */
-       smp_mb__before_atomic();
-       clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
-       smp_mb__after_atomic();
-
-       /* skip displaying the same error again */
-       if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
-               return PTR_ERR(event);
-
-       /* vary the KERN level based on the returned errno */
-       if (PTR_ERR(event) == -EOPNOTSUPP)
-               pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
-       else if (PTR_ERR(event) == -ENOENT)
-               pr_warn("disabled (cpu%i): hardware events not enabled\n",
-                        cpu);
-       else
-               pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
-                       cpu, PTR_ERR(event));
-
-       pr_info("Shutting down hard lockup detector on all cpus\n");
-
-       return PTR_ERR(event);
-
-       /* success path */
-out_save:
-       per_cpu(watchdog_ev, cpu) = event;
-out_enable:
-       perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
-       return 0;
+       perf_event_enable(this_cpu_read(watchdog_ev));
 }
 
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
 {
-       struct perf_event *event = per_cpu(watchdog_ev, cpu);
+       struct perf_event *event = this_cpu_read(watchdog_ev);
 
        if (event) {
                perf_event_disable(event);
+               cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+               watchdog_cpus--;
+       }
+}
+
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+       int cpu;
+
+       for_each_cpu(cpu, &dead_events_mask) {
+               struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+               /*
+                * Required because for_each_cpu() unconditionally reports
+                * CPU0 as set on UP kernels. Sigh.
+                */
+               if (event)
+                       perf_event_release_kernel(event);
                per_cpu(watchdog_ev, cpu) = NULL;
+       }
+       cpumask_clear(&dead_events_mask);
+}
+
+/**
+ * hardlockup_detector_perf_stop - Globally stop watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_stop(void)
+{
+       int cpu;
+
+       lockdep_assert_cpus_held();
+
+       for_each_online_cpu(cpu) {
+               struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+               if (event)
+                       perf_event_disable(event);
+       }
+}
 
-               /* should be in cleanup, but blocks oprofile */
-               perf_event_release_kernel(event);
+/**
+ * hardlockup_detector_perf_restart - Globally restart watchdog events
+ *
+ * Special interface for x86 to handle the perf HT bug.
+ */
+void __init hardlockup_detector_perf_restart(void)
+{
+       int cpu;
+
+       lockdep_assert_cpus_held();
+
+       if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+               return;
+
+       for_each_online_cpu(cpu) {
+               struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+               if (event)
+                       perf_event_enable(event);
+       }
+}
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+       int ret = hardlockup_detector_event_create();
 
-               /* watchdog_nmi_enable() expects this to be zero initially. */
-               if (atomic_dec_and_test(&watchdog_cpus))
-                       firstcpu_err = 0;
+       if (ret) {
+               pr_info("Perf NMI watchdog permanently disabled\n");
+       } else {
+               perf_event_release_kernel(this_cpu_read(watchdog_ev));
+               this_cpu_write(watchdog_ev, NULL);
        }
+       return ret;
 }
index 05c8604..831c5a6 100644 (file)
@@ -5,7 +5,7 @@
 
 struct once_work {
        struct work_struct work;
-       struct static_key *key;
+       struct static_key_true *key;
 };
 
 static void once_deferred(struct work_struct *w)
@@ -14,11 +14,11 @@ static void once_deferred(struct work_struct *w)
 
        work = container_of(w, struct once_work, work);
        BUG_ON(!static_key_enabled(work->key));
-       static_key_slow_dec(work->key);
+       static_branch_disable(work->key);
        kfree(work);
 }
 
-static void once_disable_jump(struct static_key *key)
+static void once_disable_jump(struct static_key_true *key)
 {
        struct once_work *w;
 
@@ -51,7 +51,7 @@ bool __do_once_start(bool *done, unsigned long *flags)
 }
 EXPORT_SYMBOL(__do_once_start);
 
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
                    unsigned long *flags)
        __releases(once_lock)
 {
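
The switch to static_key_true above tracks the DO_ONCE() machinery in <linux/once.h>: the first caller runs the function, then the deferred work flips the static branch so later callers skip it entirely. A small, hypothetical caller-side sketch; the identifiers are illustrative only.

/* Hypothetical sketch of the caller-side pattern backed by these helpers. */
#include <linux/jhash.h>
#include <linux/once.h>
#include <linux/random.h>
#include <linux/types.h>

static u32 hash_seed;

static u32 flow_hash(u32 key)
{
        /* get_random_once() expands to DO_ONCE(get_random_bytes, ...):
         * only the first call seeds hash_seed; once_disable_jump()
         * then patches the static branch so later calls fall through. */
        get_random_once(&hash_seed, sizeof(hash_seed));
        return jhash_1word(key, hash_seed);
}
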
index 83ba548..1b659ab 100644 (file)
@@ -916,8 +916,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
        u16 tvlv_len = 0;
        unsigned long send_time;
 
-       if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
-           (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+       if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+           hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
                return;
 
        /* the interface gets activated here to avoid race conditions between
@@ -1264,7 +1264,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
         * drops as they can't send and receive at the same time.
         */
        tq_iface_penalty = BATADV_TQ_MAX_VALUE;
-       if (if_outgoing && (if_incoming == if_outgoing) &&
+       if (if_outgoing && if_incoming == if_outgoing &&
            batadv_is_wifi_hardif(if_outgoing))
                tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
                                                      bat_priv);
@@ -1369,7 +1369,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
                                ret = BATADV_NEIGH_DUP;
                } else {
                        set_mark = 0;
-                       if (is_dup && (ret != BATADV_NEIGH_DUP))
+                       if (is_dup && ret != BATADV_NEIGH_DUP)
                                ret = BATADV_ORIG_DUP;
                }
 
@@ -1515,7 +1515,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
        /* drop packet if sender is not a direct neighbor and if we
         * don't route towards it
         */
-       if (!is_single_hop_neigh && (!orig_neigh_router)) {
+       if (!is_single_hop_neigh && !orig_neigh_router) {
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                           "Drop packet: OGM via unknown neighbor!\n");
                goto out_neigh;
@@ -1535,7 +1535,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
        sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno);
        similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl;
 
-       if (is_bidirect && ((dup_status == BATADV_NO_DUP) ||
+       if (is_bidirect && (dup_status == BATADV_NO_DUP ||
                            (sameseq && similar_ttl))) {
                batadv_iv_ogm_orig_update(bat_priv, orig_node,
                                          orig_ifinfo, ethhdr,
@@ -1553,8 +1553,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
                /* OGMs from secondary interfaces should only be scheduled once
                 * per interface where it has been received, not multiple times
                 */
-               if ((ogm_packet->ttl <= 2) &&
-                   (if_incoming != if_outgoing)) {
+               if (ogm_packet->ttl <= 2 &&
+                   if_incoming != if_outgoing) {
                        batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                                   "Drop packet: OGM from secondary interface and wrong outgoing interface\n");
                        goto out_neigh;
@@ -1590,7 +1590,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
                              if_incoming, if_outgoing);
 
 out_neigh:
-       if ((orig_neigh_node) && (!is_single_hop_neigh))
+       if (orig_neigh_node && !is_single_hop_neigh)
                batadv_orig_node_put(orig_neigh_node);
 out:
        if (router_ifinfo)
@@ -2523,9 +2523,9 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
                        tmp_gw_factor *= 100 * 100;
                        tmp_gw_factor >>= 18;
 
-                       if ((tmp_gw_factor > max_gw_factor) ||
-                           ((tmp_gw_factor == max_gw_factor) &&
-                            (tq_avg > max_tq))) {
+                       if (tmp_gw_factor > max_gw_factor ||
+                           (tmp_gw_factor == max_gw_factor &&
+                            tq_avg > max_tq)) {
                                if (curr_gw)
                                        batadv_gw_node_put(curr_gw);
                                curr_gw = gw_node;
index 4e2724c..93ef1c0 100644 (file)
@@ -767,7 +767,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
                if (batadv_v_gw_throughput_get(gw_node, &bw) < 0)
                        goto next;
 
-               if (curr_gw && (bw <= max_bw))
+               if (curr_gw && bw <= max_bw)
                        goto next;
 
                if (curr_gw)
index bd1064d..1de992c 100644 (file)
@@ -134,7 +134,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
                        hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
 
                throughput = link_settings.base.speed;
-               if (throughput && (throughput != SPEED_UNKNOWN))
+               if (throughput && throughput != SPEED_UNKNOWN)
                        return throughput * 10;
        }
 
@@ -263,8 +263,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
                goto out;
 
        /* we are in the process of shutting this interface down */
-       if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
-           (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+       if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+           hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
                goto out;
 
        /* the interface was enabled but may not be ready yet */
index 8be6173..c251445 100644 (file)
@@ -304,8 +304,8 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
         * due to the store & forward characteristics of WIFI.
         * Very low throughput values are the exception.
         */
-       if ((throughput > 10) &&
-           (if_incoming == if_outgoing) &&
+       if (throughput > 10 &&
+           if_incoming == if_outgoing &&
            !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX))
                return throughput / 2;
 
@@ -455,7 +455,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
        /* drop packets with old seqnos, however accept the first packet after
         * a host has been rebooted.
         */
-       if ((seq_diff < 0) && !protection_started)
+       if (seq_diff < 0 && !protection_started)
                goto out;
 
        neigh_node->last_seen = jiffies;
@@ -568,8 +568,8 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
                router_throughput = router_ifinfo->bat_v.throughput;
                neigh_throughput = neigh_ifinfo->bat_v.throughput;
 
-               if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) &&
-                   (router_throughput >= neigh_throughput))
+               if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF &&
+                   router_throughput >= neigh_throughput)
                        goto out;
        }
 
@@ -621,7 +621,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
                return;
 
        /* only unknown & newer OGMs contain TVLVs we are interested in */
-       if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT))
+       if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT)
                batadv_tvlv_containers_process(bat_priv, true, orig_node,
                                               NULL, NULL,
                                               (unsigned char *)(ogm2 + 1),
index b6cfa78..760c0de 100644 (file)
@@ -492,8 +492,8 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
        /* this is a hash collision with the temporarily selected node. Choose
         * the one with the lowest address
         */
-       if ((tmp_max == max) && max_orig_node &&
-           (batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0))
+       if (tmp_max == max && max_orig_node &&
+           batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0)
                goto out;
 
        ret = true;
index de9955d..10d521f 100644 (file)
@@ -248,12 +248,12 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
                }
        }
 
-       if ((curr_gw) && (!next_gw)) {
+       if (curr_gw && !next_gw) {
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                           "Removing selected gateway - no gateway in range\n");
                batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL,
                                    NULL);
-       } else if ((!curr_gw) && (next_gw)) {
+       } else if (!curr_gw && next_gw) {
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                           "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n",
                           next_gw->orig_node->orig,
@@ -411,8 +411,8 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
                goto out;
        }
 
-       if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) &&
-           (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)))
+       if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) &&
+           gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))
                goto out;
 
        batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
index 33940c5..2c26039 100644 (file)
@@ -56,8 +56,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
                if (strncasecmp(tmp_ptr, "mbit", 4) == 0)
                        bw_unit_type = BATADV_BW_UNIT_MBIT;
 
-               if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) ||
-                   (bw_unit_type == BATADV_BW_UNIT_MBIT))
+               if (strncasecmp(tmp_ptr, "kbit", 4) == 0 ||
+                   bw_unit_type == BATADV_BW_UNIT_MBIT)
                        *tmp_ptr = '\0';
        }
 
@@ -190,7 +190,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
        if (!up_new)
                up_new = 1;
 
-       if ((down_curr == down_new) && (up_curr == up_new))
+       if (down_curr == down_new && up_curr == up_new)
                return count;
 
        batadv_gw_reselect(bat_priv);
@@ -224,16 +224,16 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
        /* only fetch the tvlv value if the handler wasn't called via the
         * CIFNOTFND flag and if there is data to fetch
         */
-       if ((flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) ||
-           (tvlv_value_len < sizeof(gateway))) {
+       if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND ||
+           tvlv_value_len < sizeof(gateway)) {
                gateway.bandwidth_down = 0;
                gateway.bandwidth_up = 0;
        } else {
                gateway_ptr = tvlv_value;
                gateway.bandwidth_down = gateway_ptr->bandwidth_down;
                gateway.bandwidth_up = gateway_ptr->bandwidth_up;
-               if ((gateway.bandwidth_down == 0) ||
-                   (gateway.bandwidth_up == 0)) {
+               if (gateway.bandwidth_down == 0 ||
+                   gateway.bandwidth_up == 0) {
                        gateway.bandwidth_down = 0;
                        gateway.bandwidth_up = 0;
                }
@@ -242,8 +242,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
        batadv_gw_node_update(bat_priv, orig, &gateway);
 
        /* restart gateway selection */
-       if ((gateway.bandwidth_down != 0) &&
-           (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT))
+       if (gateway.bandwidth_down != 0 &&
+           atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)
                batadv_gw_check_election(bat_priv, orig);
 }
 
index f7b413b..4e3d534 100644 (file)
@@ -504,8 +504,8 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev)
 
        rcu_read_lock();
        list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
-               if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
-                   (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+               if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+                   hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
                        continue;
 
                if (hard_iface->net_dev == net_dev)
@@ -568,8 +568,8 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface)
 
        rcu_read_lock();
        list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
-               if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
-                   (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+               if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+                   hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
                        continue;
 
                if (hard_iface->soft_iface != soft_iface)
@@ -654,8 +654,8 @@ out:
 static void
 batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
 {
-       if ((hard_iface->if_status != BATADV_IF_ACTIVE) &&
-           (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED))
+       if (hard_iface->if_status != BATADV_IF_ACTIVE &&
+           hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
                return;
 
        hard_iface->if_status = BATADV_IF_INACTIVE;
index 8ead292..bded311 100644 (file)
@@ -132,10 +132,10 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf,
        size_t packet_len;
        int error;
 
-       if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0))
+       if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0)
                return -EAGAIN;
 
-       if ((!buf) || (count < sizeof(struct batadv_icmp_packet)))
+       if (!buf || count < sizeof(struct batadv_icmp_packet))
                return -EINVAL;
 
        if (!access_ok(VERIFY_WRITE, buf, count))
index fb381fb..4daed7a 100644 (file)
@@ -73,8 +73,8 @@
  * list traversals just rcu-locked
  */
 struct list_head batadv_hardif_list;
-static int (*batadv_rx_handler[256])(struct sk_buff *,
-                                    struct batadv_hard_iface *);
+static int (*batadv_rx_handler[256])(struct sk_buff *skb,
+                                    struct batadv_hard_iface *recv_if);
 
 unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 
@@ -540,12 +540,12 @@ batadv_recv_handler_register(u8 packet_type,
                             int (*recv_handler)(struct sk_buff *,
                                                 struct batadv_hard_iface *))
 {
-       int (*curr)(struct sk_buff *,
-                   struct batadv_hard_iface *);
+       int (*curr)(struct sk_buff *skb,
+                   struct batadv_hard_iface *recv_if);
        curr = batadv_rx_handler[packet_type];
 
-       if ((curr != batadv_recv_unhandled_packet) &&
-           (curr != batadv_recv_unhandled_unicast_packet))
+       if (curr != batadv_recv_unhandled_packet &&
+           curr != batadv_recv_unhandled_unicast_packet)
                return -EBUSY;
 
        batadv_rx_handler[packet_type] = recv_handler;
index 05cc763..edb2f23 100644 (file)
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2017.3"
+#define BATADV_SOURCE_VERSION "2017.4"
 #endif
 
 /* B.A.T.M.A.N. parameters */
index d327670..e553a87 100644 (file)
@@ -1126,7 +1126,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
        bool orig_initialized;
 
        if (orig_mcast_enabled && tvlv_value &&
-           (tvlv_value_len >= sizeof(mcast_flags)))
+           tvlv_value_len >= sizeof(mcast_flags))
                mcast_flags = *(u8 *)tvlv_value;
 
        spin_lock_bh(&orig->mcast_handler_lock);
index 8e2a4b2..2967b86 100644 (file)
@@ -1062,9 +1062,9 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
                        continue;
 
                /* don't purge if the interface is not (going) down */
-               if ((if_outgoing->if_status != BATADV_IF_INACTIVE) &&
-                   (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) &&
-                   (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED))
+               if (if_outgoing->if_status != BATADV_IF_INACTIVE &&
+                   if_outgoing->if_status != BATADV_IF_NOT_IN_USE &&
+                   if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)
                        continue;
 
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -1106,9 +1106,9 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
                        continue;
 
                /* don't purge if the interface is not (going) down */
-               if ((if_outgoing->if_status != BATADV_IF_INACTIVE) &&
-                   (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) &&
-                   (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED))
+               if (if_outgoing->if_status != BATADV_IF_INACTIVE &&
+                   if_outgoing->if_status != BATADV_IF_NOT_IN_USE &&
+                   if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)
                        continue;
 
                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
@@ -1155,13 +1155,13 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
                last_seen = neigh_node->last_seen;
                if_incoming = neigh_node->if_incoming;
 
-               if ((batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT)) ||
-                   (if_incoming->if_status == BATADV_IF_INACTIVE) ||
-                   (if_incoming->if_status == BATADV_IF_NOT_IN_USE) ||
-                   (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) {
-                       if ((if_incoming->if_status == BATADV_IF_INACTIVE) ||
-                           (if_incoming->if_status == BATADV_IF_NOT_IN_USE) ||
-                           (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED))
+               if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) ||
+                   if_incoming->if_status == BATADV_IF_INACTIVE ||
+                   if_incoming->if_status == BATADV_IF_NOT_IN_USE ||
+                   if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) {
+                       if (if_incoming->if_status == BATADV_IF_INACTIVE ||
+                           if_incoming->if_status == BATADV_IF_NOT_IN_USE ||
+                           if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)
                                batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
                                           "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n",
                                           orig_node->orig, neigh_node->addr,
index f10e3ff..40d9bf3 100644 (file)
@@ -93,14 +93,14 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
        batadv_orig_ifinfo_put(orig_ifinfo);
 
        /* route deleted */
-       if ((curr_router) && (!neigh_node)) {
+       if (curr_router && !neigh_node) {
                batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
                           "Deleting route towards: %pM\n", orig_node->orig);
                batadv_tt_global_del_orig(bat_priv, orig_node, -1,
                                          "Deleted route towards originator");
 
        /* route added */
-       } else if ((!curr_router) && (neigh_node)) {
+       } else if (!curr_router && neigh_node) {
                batadv_dbg(BATADV_DBG_ROUTES, bat_priv,
                           "Adding route towards: %pM (via %pM)\n",
                           orig_node->orig, neigh_node->addr);
@@ -381,7 +381,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
        /* add record route information if not full */
        if ((icmph->msg_type == BATADV_ECHO_REPLY ||
             icmph->msg_type == BATADV_ECHO_REQUEST) &&
-           (skb->len >= sizeof(struct batadv_icmp_packet_rr))) {
+           skb->len >= sizeof(struct batadv_icmp_packet_rr)) {
                if (skb_linearize(skb) < 0)
                        goto free_skb;
 
index 054a65e..7895323 100644
@@ -142,7 +142,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
 #ifdef CONFIG_BATMAN_ADV_BATMAN_V
        hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr);
 
-       if ((hardif_neigh) && (ret != NET_XMIT_DROP))
+       if (hardif_neigh && ret != NET_XMIT_DROP)
                hardif_neigh->bat_v.last_unicast_tx = jiffies;
 
        if (hardif_neigh)
@@ -615,8 +615,8 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list,
                 * we delete only packets belonging to the given interface
                 */
                if (hard_iface &&
-                   (forw_packet->if_incoming != hard_iface) &&
-                   (forw_packet->if_outgoing != hard_iface))
+                   forw_packet->if_incoming != hard_iface &&
+                   forw_packet->if_outgoing != hard_iface)
                        continue;
 
                hlist_del(&forw_packet->list);
index e7d5fbb..543d2c3 100644
@@ -69,8 +69,8 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
        int result;
 
        /* TODO: We must check if we can release all references to non-payload
-        * data using __skb_header_release in our skbs to allow skb_cow_header to
-        * work optimally. This means that those skbs are not allowed to read
+        * data using __skb_header_release in our skbs to allow skb_cow_header
+        * to work optimally. This means that those skbs are not allowed to read
         * or write any data which is before the current position of skb->data
         * after that call and thus allow other skbs with the same data buffer
         * to write freely in that area.
@@ -160,7 +160,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
 static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
 {
        /* check ranges */
-       if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev)))
+       if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev))
                return -EINVAL;
 
        dev->mtu = new_mtu;
index 0ae8b30..aa187fd 100644
@@ -925,8 +925,8 @@ static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
        if (hard_iface->if_status == status_tmp)
                goto out;
 
-       if ((hard_iface->soft_iface) &&
-           (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0))
+       if (hard_iface->soft_iface &&
+           strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)
                goto out;
 
        if (status_tmp == BATADV_IF_NOT_IN_USE) {
index bfe8eff..4b90033 100644
@@ -1206,7 +1206,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
 
        /* send the ack */
        r = batadv_send_skb_to_orig(skb, orig_node, NULL);
-       if (unlikely(r < 0) || (r == NET_XMIT_DROP)) {
+       if (unlikely(r < 0) || r == NET_XMIT_DROP) {
                ret = BATADV_TP_REASON_DST_UNREACHABLE;
                goto out;
        }
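
Note: the batman-adv hunks above are a parenthesis cleanup only; comparison operators already bind tighter than && and ||, so dropping the extra parentheses cannot change evaluation. A minimal userspace C sketch (illustrative only, not part of the series) showing that both forms parse identically:

#include <assert.h>

int main(void)
{
        /* mirrors the if_status checks above with made-up values */
        for (int status = 0; status < 4; status++)
                assert(((status != 1) && (status != 3)) ==
                       (status != 1 && status != 3));
        return 0;
}
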
index 40b1ede..4aee55f 100644
@@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o
 bridge-y       := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \
                        br_ioctl.o br_stp.o br_stp_bpdu.o \
                        br_stp_if.o br_stp_timer.o br_netlink.o \
-                       br_netlink_tunnel.o
+                       br_netlink_tunnel.o br_arp_nd_proxy.o
 
 bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o
 
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
new file mode 100644
index 0000000..2cf7716
--- /dev/null
@@ -0,0 +1,469 @@
+/*
+ *  Handle bridge arp/nd proxy/suppress
+ *
+ *  Copyright (C) 2017 Cumulus Networks
+ *  Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *  Authors:
+ *     Roopa Prabhu <roopa@cumulusnetworks.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/neighbour.h>
+#include <net/arp.h>
+#include <linux/if_vlan.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_checksum.h>
+#endif
+
+#include "br_private.h"
+
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
+{
+       struct net_bridge_port *p;
+       bool neigh_suppress = false;
+
+       list_for_each_entry(p, &br->port_list, list) {
+               if (p->flags & BR_NEIGH_SUPPRESS) {
+                       neigh_suppress = true;
+                       break;
+               }
+       }
+
+       br->neigh_suppress_enabled = neigh_suppress;
+}
+
+#if IS_ENABLED(CONFIG_INET)
+static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p,
+                       struct net_device *dev, __be32 dest_ip, __be32 src_ip,
+                       const unsigned char *dest_hw,
+                       const unsigned char *src_hw,
+                       const unsigned char *target_hw,
+                       __be16 vlan_proto, u16 vlan_tci)
+{
+       struct net_bridge_vlan_group *vg;
+       struct sk_buff *skb;
+       u16 pvid;
+
+       netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n",
+                  dev->name, &dest_ip, dest_hw, &src_ip, src_hw);
+
+       if (!vlan_tci) {
+               arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip,
+                        dest_hw, src_hw, target_hw);
+               return;
+       }
+
+       skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip,
+                        dest_hw, src_hw, target_hw);
+       if (!skb)
+               return;
+
+       if (p)
+               vg = nbp_vlan_group_rcu(p);
+       else
+               vg = br_vlan_group_rcu(br);
+       pvid = br_get_pvid(vg);
+       if (pvid == (vlan_tci & VLAN_VID_MASK))
+               vlan_tci = 0;
+
+       if (vlan_tci)
+               __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
+
+       if (p) {
+               arp_xmit(skb);
+       } else {
+               skb_reset_mac_header(skb);
+               __skb_pull(skb, skb_network_offset(skb));
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+               skb->pkt_type = PACKET_HOST;
+
+               netif_rx_ni(skb);
+       }
+}
+
+static int br_chk_addr_ip(struct net_device *dev, void *data)
+{
+       __be32 ip = *(__be32 *)data;
+       struct in_device *in_dev;
+       __be32 addr = 0;
+
+       in_dev = __in_dev_get_rcu(dev);
+       if (in_dev)
+               addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip,
+                                        RT_SCOPE_HOST);
+
+       if (addr == ip)
+               return 1;
+
+       return 0;
+}
+
+static bool br_is_local_ip(struct net_device *dev, __be32 ip)
+{
+       if (br_chk_addr_ip(dev, &ip))
+               return true;
+
+       /* check if ip is configured on upper dev */
+       if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip))
+               return true;
+
+       return false;
+}
+
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+                             u16 vid, struct net_bridge_port *p)
+{
+       struct net_device *dev = br->dev;
+       struct net_device *vlandev = dev;
+       struct neighbour *n;
+       struct arphdr *parp;
+       u8 *arpptr, *sha;
+       __be32 sip, tip;
+
+       BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+       if ((dev->flags & IFF_NOARP) ||
+           !pskb_may_pull(skb, arp_hdr_len(dev)))
+               return;
+
+       parp = arp_hdr(skb);
+
+       if (parp->ar_pro != htons(ETH_P_IP) ||
+           parp->ar_hln != dev->addr_len ||
+           parp->ar_pln != 4)
+               return;
+
+       arpptr = (u8 *)parp + sizeof(struct arphdr);
+       sha = arpptr;
+       arpptr += dev->addr_len;        /* sha */
+       memcpy(&sip, arpptr, sizeof(sip));
+       arpptr += sizeof(sip);
+       arpptr += dev->addr_len;        /* tha */
+       memcpy(&tip, arpptr, sizeof(tip));
+
+       if (ipv4_is_loopback(tip) ||
+           ipv4_is_multicast(tip))
+               return;
+
+       if (br->neigh_suppress_enabled) {
+               if (p && (p->flags & BR_NEIGH_SUPPRESS))
+                       return;
+               if (ipv4_is_zeronet(sip) || sip == tip) {
+                       /* prevent flooding to neigh suppress ports */
+                       BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+                       return;
+               }
+       }
+
+       if (parp->ar_op != htons(ARPOP_REQUEST))
+               return;
+
+       if (vid != 0) {
+               vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+                                                  vid);
+               if (!vlandev)
+                       return;
+       }
+
+       if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) {
+               /* it's our local ip, so don't proxy reply
+                * and don't forward to neigh suppress ports
+                */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       n = neigh_lookup(&arp_tbl, &tip, vlandev);
+       if (n) {
+               struct net_bridge_fdb_entry *f;
+
+               if (!(n->nud_state & NUD_VALID)) {
+                       neigh_release(n);
+                       return;
+               }
+
+               f = br_fdb_find_rcu(br, n->ha, vid);
+               if (f) {
+                       bool replied = false;
+
+                       if ((p && (p->flags & BR_PROXYARP)) ||
+                           (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI |
+                                                        BR_NEIGH_SUPPRESS)))) {
+                               if (!vid)
+                                       br_arp_send(br, p, skb->dev, sip, tip,
+                                                   sha, n->ha, sha, 0, 0);
+                               else
+                                       br_arp_send(br, p, skb->dev, sip, tip,
+                                                   sha, n->ha, sha,
+                                                   skb->vlan_proto,
+                                                   skb_vlan_tag_get(skb));
+                               replied = true;
+                       }
+
+                       /* If we have replied or as long as we know the
+                        * mac, indicate that arp has been replied to
+                        */
+                       if (replied || br->neigh_suppress_enabled)
+                               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               }
+
+               neigh_release(n);
+       }
+}
+#endif
+
+#if IS_ENABLED(CONFIG_IPV6)
+struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg)
+{
+       struct nd_msg *m;
+
+       m = skb_header_pointer(skb, skb_network_offset(skb) +
+                              sizeof(struct ipv6hdr), sizeof(*msg), msg);
+       if (!m)
+               return NULL;
+
+       if (m->icmph.icmp6_code != 0 ||
+           (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
+            m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
+               return NULL;
+
+       return m;
+}
+
+static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
+                      struct sk_buff *request, struct neighbour *n,
+                      __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns)
+{
+       struct net_device *dev = request->dev;
+       struct net_bridge_vlan_group *vg;
+       struct sk_buff *reply;
+       struct nd_msg *na;
+       struct ipv6hdr *pip6;
+       int na_olen = 8; /* opt hdr + ETH_ALEN for target */
+       int ns_olen;
+       int i, len;
+       u8 *daddr;
+       u16 pvid;
+
+       if (!dev)
+               return;
+
+       len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
+               sizeof(*na) + na_olen + dev->needed_tailroom;
+
+       reply = alloc_skb(len, GFP_ATOMIC);
+       if (!reply)
+               return;
+
+       reply->protocol = htons(ETH_P_IPV6);
+       reply->dev = dev;
+       skb_reserve(reply, LL_RESERVED_SPACE(dev));
+       skb_push(reply, sizeof(struct ethhdr));
+       skb_set_mac_header(reply, 0);
+
+       daddr = eth_hdr(request)->h_source;
+
+       /* Do we need option processing ? */
+       ns_olen = request->len - (skb_network_offset(request) +
+                                 sizeof(struct ipv6hdr)) - sizeof(*ns);
+       for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
+               if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
+                       daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
+                       break;
+               }
+       }
+
+       /* Ethernet header */
+       ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
+       ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
+       eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
+       reply->protocol = htons(ETH_P_IPV6);
+
+       skb_pull(reply, sizeof(struct ethhdr));
+       skb_set_network_header(reply, 0);
+       skb_put(reply, sizeof(struct ipv6hdr));
+
+       /* IPv6 header */
+       pip6 = ipv6_hdr(reply);
+       memset(pip6, 0, sizeof(struct ipv6hdr));
+       pip6->version = 6;
+       pip6->priority = ipv6_hdr(request)->priority;
+       pip6->nexthdr = IPPROTO_ICMPV6;
+       pip6->hop_limit = 255;
+       pip6->daddr = ipv6_hdr(request)->saddr;
+       pip6->saddr = *(struct in6_addr *)n->primary_key;
+
+       skb_pull(reply, sizeof(struct ipv6hdr));
+       skb_set_transport_header(reply, 0);
+
+       na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);
+
+       /* Neighbor Advertisement */
+       memset(na, 0, sizeof(*na) + na_olen);
+       na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
+       na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */
+       na->icmph.icmp6_override = 1;
+       na->icmph.icmp6_solicited = 1;
+       na->target = ns->target;
+       ether_addr_copy(&na->opt[2], n->ha);
+       na->opt[0] = ND_OPT_TARGET_LL_ADDR;
+       na->opt[1] = na_olen >> 3;
+
+       na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
+                                               &pip6->daddr,
+                                               sizeof(*na) + na_olen,
+                                               IPPROTO_ICMPV6,
+                                               csum_partial(na, sizeof(*na) + na_olen, 0));
+
+       pip6->payload_len = htons(sizeof(*na) + na_olen);
+
+       skb_push(reply, sizeof(struct ipv6hdr));
+       skb_push(reply, sizeof(struct ethhdr));
+
+       reply->ip_summed = CHECKSUM_UNNECESSARY;
+
+       if (p)
+               vg = nbp_vlan_group_rcu(p);
+       else
+               vg = br_vlan_group_rcu(br);
+       pvid = br_get_pvid(vg);
+       if (pvid == (vlan_tci & VLAN_VID_MASK))
+               vlan_tci = 0;
+
+       if (vlan_tci)
+               __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci);
+
+       netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n",
+                  dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha);
+
+       if (p) {
+               dev_queue_xmit(reply);
+       } else {
+               skb_reset_mac_header(reply);
+               __skb_pull(reply, skb_network_offset(reply));
+               reply->ip_summed = CHECKSUM_UNNECESSARY;
+               reply->pkt_type = PACKET_HOST;
+
+               netif_rx_ni(reply);
+       }
+}
+
+static int br_chk_addr_ip6(struct net_device *dev, void *data)
+{
+       struct in6_addr *addr = (struct in6_addr *)data;
+
+       if (ipv6_chk_addr(dev_net(dev), addr, dev, 0))
+               return 1;
+
+       return 0;
+}
+
+static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr)
+
+{
+       if (br_chk_addr_ip6(dev, addr))
+               return true;
+
+       /* check if ip is configured on upper dev */
+       if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr))
+               return true;
+
+       return false;
+}
+
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+                      u16 vid, struct net_bridge_port *p, struct nd_msg *msg)
+{
+       struct net_device *dev = br->dev;
+       struct net_device *vlandev = NULL;
+       struct in6_addr *saddr, *daddr;
+       struct ipv6hdr *iphdr;
+       struct neighbour *n;
+
+       BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+       if (p && (p->flags & BR_NEIGH_SUPPRESS))
+               return;
+
+       if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
+           !msg->icmph.icmp6_solicited) {
+               /* prevent flooding to neigh suppress ports */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
+               return;
+
+       iphdr = ipv6_hdr(skb);
+       saddr = &iphdr->saddr;
+       daddr = &iphdr->daddr;
+
+       if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) {
+               /* prevent flooding to neigh suppress ports */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       if (vid != 0) {
+               /* build neigh table lookup on the vlan device */
+               vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto,
+                                                  vid);
+               if (!vlandev)
+                       return;
+       } else {
+               vlandev = dev;
+       }
+
+       if (br_is_local_ip6(vlandev, &msg->target)) {
+               /* it's our own ip, so don't proxy reply
+                * and don't forward to arp suppress ports
+                */
+               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               return;
+       }
+
+       n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev);
+       if (n) {
+               struct net_bridge_fdb_entry *f;
+
+               if (!(n->nud_state & NUD_VALID)) {
+                       neigh_release(n);
+                       return;
+               }
+
+               f = br_fdb_find_rcu(br, n->ha, vid);
+               if (f) {
+                       bool replied = false;
+
+                       if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) {
+                               if (vid != 0)
+                                       br_nd_send(br, p, skb, n,
+                                                  skb->vlan_proto,
+                                                  skb_vlan_tag_get(skb), msg);
+                               else
+                                       br_nd_send(br, p, skb, n, 0, 0, msg);
+                               replied = true;
+                       }
+
+                       /* If we have replied or as long as we know the
+                        * mac, indicate to NEIGH_SUPPRESS ports that we
+                        * have replied
+                        */
+                       if (replied || br->neigh_suppress_enabled)
+                               BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+               }
+               neigh_release(n);
+       }
+}
+#endif
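
br_recalculate_neigh_suppress_enabled() above turns on bridge-wide neighbour suppression as soon as any port carries BR_NEIGH_SUPPRESS and turns it off when none does. A minimal userspace sketch of the same any-port aggregation (the ex_* names are invented for the example; this is not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define EX_NEIGH_SUPPRESS 0x1   /* stand-in for BR_NEIGH_SUPPRESS */

struct ex_port { unsigned long flags; };

/* true as soon as one port requests suppression */
static bool ex_recalc_suppress(const struct ex_port *ports, int n)
{
        for (int i = 0; i < n; i++)
                if (ports[i].flags & EX_NEIGH_SUPPRESS)
                        return true;
        return false;
}

int main(void)
{
        struct ex_port ports[] = { { 0 }, { EX_NEIGH_SUPPRESS }, { 0 } };

        printf("bridge suppress enabled: %d\n",
               ex_recalc_suppress(ports, 3));
        return 0;
}
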
index 7acb77c..28bb221 100644
@@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
        struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
        const struct nf_br_ops *nf_ops;
        const unsigned char *dest;
+       struct ethhdr *eth;
        u16 vid = 0;
 
        rcu_read_lock();
@@ -57,11 +58,30 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
        BR_INPUT_SKB_CB(skb)->brdev = dev;
 
        skb_reset_mac_header(skb);
+       eth = eth_hdr(skb);
        skb_pull(skb, ETH_HLEN);
 
        if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
                goto out;
 
+       if (IS_ENABLED(CONFIG_INET) &&
+           (eth->h_proto == htons(ETH_P_ARP) ||
+            eth->h_proto == htons(ETH_P_RARP)) &&
+           br->neigh_suppress_enabled) {
+               br_do_proxy_suppress_arp(skb, br, vid, NULL);
+       } else if (IS_ENABLED(CONFIG_IPV6) &&
+                  skb->protocol == htons(ETH_P_IPV6) &&
+                  br->neigh_suppress_enabled &&
+                  pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+                                sizeof(struct nd_msg)) &&
+                  ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+                       struct nd_msg *msg, _msg;
+
+                       msg = br_is_nd_neigh_msg(skb, &_msg);
+                       if (msg)
+                               br_do_suppress_nd(skb, br, vid, NULL, msg);
+       }
+
        dest = eth_hdr(skb)->h_dest;
        if (is_broadcast_ether_addr(dest)) {
                br_flood(br, skb, BR_PKT_BROADCAST, false, true);
index 48fb174..b4eed11 100644
@@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
                /* Do not flood to ports that enable proxy ARP */
                if (p->flags & BR_PROXYARP)
                        continue;
-               if ((p->flags & BR_PROXYARP_WIFI) &&
+               if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) &&
                    BR_INPUT_SKB_CB(skb)->proxyarp_replied)
                        continue;
 
index 59a74a4..ae38547 100644
@@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
                del_nbp(p);
        }
 
+       br_recalculate_neigh_suppress_enabled(br);
+
        br_fdb_delete_by_port(br, NULL, 0, 1);
 
        cancel_delayed_work_sync(&br->gc_work);
@@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
 
        if (mask & BR_AUTO_MASK)
                nbp_update_port_count(br);
+
+       if (mask & BR_NEIGH_SUPPRESS)
+               br_recalculate_neigh_suppress_enabled(br);
 }
index 7cb6137..a096d3e 100644
@@ -71,62 +71,6 @@ static int br_pass_frame_up(struct sk_buff *skb)
                       br_netif_receive_skb);
 }
 
-static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
-                           u16 vid, struct net_bridge_port *p)
-{
-       struct net_device *dev = br->dev;
-       struct neighbour *n;
-       struct arphdr *parp;
-       u8 *arpptr, *sha;
-       __be32 sip, tip;
-
-       BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
-
-       if ((dev->flags & IFF_NOARP) ||
-           !pskb_may_pull(skb, arp_hdr_len(dev)))
-               return;
-
-       parp = arp_hdr(skb);
-
-       if (parp->ar_pro != htons(ETH_P_IP) ||
-           parp->ar_op != htons(ARPOP_REQUEST) ||
-           parp->ar_hln != dev->addr_len ||
-           parp->ar_pln != 4)
-               return;
-
-       arpptr = (u8 *)parp + sizeof(struct arphdr);
-       sha = arpptr;
-       arpptr += dev->addr_len;        /* sha */
-       memcpy(&sip, arpptr, sizeof(sip));
-       arpptr += sizeof(sip);
-       arpptr += dev->addr_len;        /* tha */
-       memcpy(&tip, arpptr, sizeof(tip));
-
-       if (ipv4_is_loopback(tip) ||
-           ipv4_is_multicast(tip))
-               return;
-
-       n = neigh_lookup(&arp_tbl, &tip, dev);
-       if (n) {
-               struct net_bridge_fdb_entry *f;
-
-               if (!(n->nud_state & NUD_VALID)) {
-                       neigh_release(n);
-                       return;
-               }
-
-               f = br_fdb_find_rcu(br, n->ha, vid);
-               if (f && ((p->flags & BR_PROXYARP) ||
-                         (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) {
-                       arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip,
-                                sha, n->ha, sha);
-                       BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
-               }
-
-               neigh_release(n);
-       }
-}
-
 /* note: already called with rcu_read_lock */
 int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
@@ -171,8 +115,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 
        BR_INPUT_SKB_CB(skb)->brdev = br->dev;
 
-       if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
-               br_do_proxy_arp(skb, br, vid, p);
+       if (IS_ENABLED(CONFIG_INET) &&
+           (skb->protocol == htons(ETH_P_ARP) ||
+            skb->protocol == htons(ETH_P_RARP))) {
+               br_do_proxy_suppress_arp(skb, br, vid, p);
+       } else if (IS_ENABLED(CONFIG_IPV6) &&
+                  skb->protocol == htons(ETH_P_IPV6) &&
+                  br->neigh_suppress_enabled &&
+                  pskb_may_pull(skb, sizeof(struct ipv6hdr) +
+                                sizeof(struct nd_msg)) &&
+                  ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
+                       struct nd_msg *msg, _msg;
+
+                       msg = br_is_nd_neigh_msg(skb, &_msg);
+                       if (msg)
+                               br_do_suppress_nd(skb, br, vid, p, msg);
+       }
 
        switch (pkt_type) {
        case BR_PKT_MULTICAST:
index 8dc5c8d..7947e04 100644
@@ -859,8 +859,32 @@ out:
        spin_unlock(&br->multicast_lock);
 }
 
+static void br_mc_router_state_change(struct net_bridge *p,
+                                     bool is_mc_router)
+{
+       struct switchdev_attr attr = {
+               .orig_dev = p->dev,
+               .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER,
+               .flags = SWITCHDEV_F_DEFER,
+               .u.mrouter = is_mc_router,
+       };
+
+       switchdev_port_attr_set(p->dev, &attr);
+}
+
 static void br_multicast_local_router_expired(unsigned long data)
 {
+       struct net_bridge *br = (struct net_bridge *)data;
+
+       spin_lock(&br->multicast_lock);
+       if (br->multicast_router == MDB_RTR_TYPE_DISABLED ||
+           br->multicast_router == MDB_RTR_TYPE_PERM ||
+           timer_pending(&br->multicast_router_timer))
+               goto out;
+
+       br_mc_router_state_change(br, false);
+out:
+       spin_unlock(&br->multicast_lock);
 }
 
 static void br_multicast_querier_expired(struct net_bridge *br,
@@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br,
        unsigned long now = jiffies;
 
        if (!port) {
-               if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
+               if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) {
+                       if (!timer_pending(&br->multicast_router_timer))
+                               br_mc_router_state_change(br, true);
                        mod_timer(&br->multicast_router_timer,
                                  now + br->multicast_querier_interval);
+               }
                return;
        }
 
@@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br)
 
        spin_lock_init(&br->multicast_lock);
        setup_timer(&br->multicast_router_timer,
-                   br_multicast_local_router_expired, 0);
+                   br_multicast_local_router_expired, (unsigned long)br);
        setup_timer(&br->ip4_other_query.timer,
                    br_ip4_multicast_querier_expired, (unsigned long)br);
        setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired,
@@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
        switch (val) {
        case MDB_RTR_TYPE_DISABLED:
        case MDB_RTR_TYPE_PERM:
+               br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM);
                del_timer(&br->multicast_router_timer);
-               /* fall through */
+               br->multicast_router = val;
+               err = 0;
+               break;
        case MDB_RTR_TYPE_TEMP_QUERY:
+               if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+                       br_mc_router_state_change(br, false);
                br->multicast_router = val;
                err = 0;
                break;
@@ -2184,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(br_multicast_enabled);
 
+bool br_multicast_router(const struct net_device *dev)
+{
+       struct net_bridge *br = netdev_priv(dev);
+       bool is_router;
+
+       spin_lock_bh(&br->multicast_lock);
+       is_router = br_multicast_is_router(br);
+       spin_unlock_bh(&br->multicast_lock);
+       return is_router;
+}
+EXPORT_SYMBOL_GPL(br_multicast_router);
+
 int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
 {
        unsigned long max_delay;
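
The br_multicast.c changes above report the bridge's multicast-router state to switchdev (SWITCHDEV_ATTR_ID_BRIDGE_MROUTER) only when it actually changes; for instance, the timer path signals "router" only when the timer was not already pending. A rough userspace sketch of that edge-triggered notification (invented ex_* names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

struct ex_bridge { bool timer_pending; };

static void ex_mc_router_state_change(bool is_router)
{
        printf("notify offload: mrouter=%d\n", is_router);
}

static void ex_mark_router(struct ex_bridge *br)
{
        if (!br->timer_pending)         /* notify on the 0 -> 1 edge only */
                ex_mc_router_state_change(true);
        br->timer_pending = true;       /* (re)arm the timer */
}

int main(void)
{
        struct ex_bridge br = { false };

        ex_mark_router(&br);    /* notifies */
        ex_mark_router(&br);    /* refresh only, no second notification */
        return 0;
}
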
index dea88a2..f0e8268 100644
@@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void)
                + nla_total_size(1)     /* IFLA_BRPORT_PROXYARP */
                + nla_total_size(1)     /* IFLA_BRPORT_PROXYARP_WIFI */
                + nla_total_size(1)     /* IFLA_BRPORT_VLAN_TUNNEL */
+               + nla_total_size(1)     /* IFLA_BRPORT_NEIGH_SUPPRESS */
                + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
                + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
                + nla_total_size(sizeof(u16))   /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
            nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) ||
            nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags &
                                                        BR_VLAN_TUNNEL)) ||
-           nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask))
+           nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
+           nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
+                      !!(p->flags & BR_NEIGH_SUPPRESS)))
                return -EMSGSIZE;
 
        timerval = br_timer_value(&p->message_age_timer);
@@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
                p->group_fwd_mask = fwd_mask;
        }
 
+       err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS,
+                              BR_NEIGH_SUPPRESS);
+       if (err)
+               return err;
+
        br_port_flags_change(p, old_flags ^ p->flags);
        return 0;
 }
index ab4df24..fa0039f 100644
@@ -404,6 +404,7 @@ struct net_bridge {
 #ifdef CONFIG_NET_SWITCHDEV
        int offload_fwd_mark;
 #endif
+       bool                            neigh_suppress_enabled;
 };
 
 struct br_input_skb_cb {
@@ -1139,4 +1140,11 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
 }
 #endif /* CONFIG_NET_SWITCHDEV */
 
+/* br_arp_nd_proxy.c */
+void br_recalculate_neigh_suppress_enabled(struct net_bridge *br);
+void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
+                             u16 vid, struct net_bridge_port *p);
+void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
+                      u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
+struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
 #endif
index 9110d5e..0a1fa9c 100644
@@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
 BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
 BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD);
 BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD);
+BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS);
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
@@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = {
        &brport_attr_multicast_flood,
        &brport_attr_broadcast_flood,
        &brport_attr_group_fwd_mask,
+       &brport_attr_neigh_suppress,
        NULL
 };
 
index 2585b10..276b602 100644
@@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb)
 
 static int __net_init broute_net_init(struct net *net)
 {
-       net->xt.broute_table = ebt_register_table(net, &broute_table, NULL);
-       return PTR_ERR_OR_ZERO(net->xt.broute_table);
+       return ebt_register_table(net, &broute_table, NULL,
+                                 &net->xt.broute_table);
 }
 
 static void __net_exit broute_net_exit(struct net *net)
index 45a00db..c41da5f 100644
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = {
 
 static int __net_init frame_filter_net_init(struct net *net)
 {
-       net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter);
-       return PTR_ERR_OR_ZERO(net->xt.frame_filter);
+       return ebt_register_table(net, &frame_filter, ebt_ops_filter,
+                                 &net->xt.frame_filter);
 }
 
 static void __net_exit frame_filter_net_exit(struct net *net)
index 57cd5bb..08df740 100644
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = {
 
 static int __net_init frame_nat_net_init(struct net *net)
 {
-       net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat);
-       return PTR_ERR_OR_ZERO(net->xt.frame_nat);
+       return ebt_register_table(net, &frame_nat, ebt_ops_nat,
+                                 &net->xt.frame_nat);
 }
 
 static void __net_exit frame_nat_net_exit(struct net *net)
index 83951f9..3b3dcf7 100644
@@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
        kfree(table);
 }
 
-struct ebt_table *
-ebt_register_table(struct net *net, const struct ebt_table *input_table,
-                  const struct nf_hook_ops *ops)
+int ebt_register_table(struct net *net, const struct ebt_table *input_table,
+                      const struct nf_hook_ops *ops, struct ebt_table **res)
 {
        struct ebt_table_info *newinfo;
        struct ebt_table *t, *table;
@@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
            repl->entries == NULL || repl->entries_size == 0 ||
            repl->counters != NULL || input_table->private != NULL) {
                BUGPRINT("Bad table data for ebt_register_table!!!\n");
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
        /* Don't add one table to multiple lists. */
@@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
        list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
        mutex_unlock(&ebt_mutex);
 
+       WRITE_ONCE(*res, table);
+
        if (!ops)
-               return table;
+               return 0;
 
        ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
        if (ret) {
                __ebt_unregister_table(net, table);
-               return ERR_PTR(ret);
+               *res = NULL;
        }
 
-       return table;
+       return ret;
 free_unlock:
        mutex_unlock(&ebt_mutex);
 free_chainstack:
@@ -1276,7 +1277,7 @@ free_newinfo:
 free_table:
        kfree(table);
 out:
-       return ERR_PTR(ret);
+       return ret;
 }
 
 void ebt_unregister_table(struct net *net, struct ebt_table *table,
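
ebt_register_table() above stops returning an ERR_PTR() and instead returns an errno while publishing the table through a **res out-parameter, clearing it again when hook registration fails. A simplified userspace sketch of that API shape (ex_* names invented, failure reduced to a single point):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ex_table { int valid_hooks; };

static int ex_register_table(int valid_hooks, struct ex_table **res)
{
        struct ex_table *t = malloc(sizeof(*t));

        if (!t)
                return -ENOMEM;
        t->valid_hooks = valid_hooks;
        *res = t;                       /* publish before the fallible step */

        if (valid_hooks == 0) {         /* stand-in for hook registration failing */
                free(t);
                *res = NULL;            /* never leave a dangling pointer behind */
                return -EINVAL;
        }
        return 0;
}

int main(void)
{
        struct ex_table *tbl = NULL;
        int err = ex_register_table(3, &tbl);

        printf("err=%d table=%p\n", err, (void *)tbl);
        free(tbl);
        return 0;
}
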
index a6c47da..662a2d4 100644
@@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
        return md_dst;
 }
 EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
+
+void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
+
+               if (one_md_dst->type == METADATA_IP_TUNNEL)
+                       dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
+       }
+#endif
+       free_percpu(md_dst);
+}
+EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
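
metadata_dst_free_percpu() above releases the dst_cache held by every per-CPU slot before freeing the per-CPU allocation itself. A rough userspace analogue with a plain array standing in for per-CPU data (illustrative only):

#include <stdlib.h>

struct ex_dst { void *cache; };

static void ex_dst_free_all(struct ex_dst *slots, int nslots)
{
        for (int i = 0; i < nslots; i++)
                free(slots[i].cache);   /* per-slot state first */
        free(slots);                    /* then the backing allocation */
}

int main(void)
{
        int n = 4;
        struct ex_dst *slots = calloc(n, sizeof(*slots));

        if (!slots)
                return 1;
        for (int i = 0; i < n; i++)
                slots[i].cache = malloc(32);
        ex_dst_free_all(slots, n);
        return 0;
}
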
index b7e8caa..140fa9f 100644
@@ -43,6 +43,7 @@
 #include <linux/timer.h>
 #include <linux/uaccess.h>
 #include <asm/unaligned.h>
+#include <asm/cmpxchg.h>
 #include <linux/filter.h>
 #include <linux/ratelimit.h>
 #include <linux/seccomp.h>
@@ -2987,14 +2988,15 @@ static const struct bpf_func_proto *
 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
        if (!md_dst) {
-               /* Race is not possible, since it's called from verifier
-                * that is holding verifier mutex.
-                */
-               md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
-                                                  METADATA_IP_TUNNEL,
-                                                  GFP_KERNEL);
-               if (!md_dst)
+               struct metadata_dst __percpu *tmp;
+
+               tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+                                               METADATA_IP_TUNNEL,
+                                               GFP_KERNEL);
+               if (!tmp)
                        return NULL;
+               if (cmpxchg(&md_dst, NULL, tmp))
+                       metadata_dst_free_percpu(tmp);
        }
 
        switch (which) {
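
The filter.c hunk above drops the "verifier mutex protects us" assumption and makes md_dst initialisation race-free: allocate a candidate, install it with cmpxchg() against NULL, and free the candidate if another caller won the race. A minimal userspace sketch of the same lazy-init pattern using C11 atomics (not kernel code):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(void *) shared;

static void *get_shared(size_t size)
{
        void *expected = NULL;
        void *cur = atomic_load(&shared);
        void *tmp;

        if (cur)
                return cur;

        tmp = calloc(1, size);
        if (!tmp)
                return NULL;

        /* try to install our candidate; on failure 'expected' holds the winner */
        if (!atomic_compare_exchange_strong(&shared, &expected, tmp)) {
                free(tmp);
                return expected;
        }
        return tmp;
}

int main(void)
{
        printf("%p\n", get_shared(64));
        printf("%p\n", get_shared(64));   /* same pointer the second time */
        return 0;
}
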
index e84d108..6a09f3d 100644
@@ -3066,21 +3066,21 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
 }
 EXPORT_SYMBOL(ndo_dflt_fdb_add);
 
-static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid)
+static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
+                        struct netlink_ext_ack *extack)
 {
        u16 vid = 0;
 
        if (vlan_attr) {
                if (nla_len(vlan_attr) != sizeof(u16)) {
-                       pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n");
+                       NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
                        return -EINVAL;
                }
 
                vid = nla_get_u16(vlan_attr);
 
                if (!vid || vid >= VLAN_VID_MASK) {
-                       pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n",
-                               vid);
+                       NL_SET_ERR_MSG(extack, "invalid vlan id");
                        return -EINVAL;
                }
        }
@@ -3105,24 +3105,24 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
-               pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n");
+               NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }
 
        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
-               pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
 
        if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
-               pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n");
+               NL_SET_ERR_MSG(extack, "invalid address");
                return -EINVAL;
        }
 
        addr = nla_data(tb[NDA_LLADDR]);
 
-       err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+       err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
        if (err)
                return err;
 
@@ -3209,24 +3209,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
-               pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n");
+               NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }
 
        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
-               pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
 
        if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
-               pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n");
+               NL_SET_ERR_MSG(extack, "invalid address");
                return -EINVAL;
        }
 
        addr = nla_data(tb[NDA_LLADDR]);
 
-       err = fdb_vid_parse(tb[NDA_VLAN], &vid);
+       err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
        if (err)
                return err;
 
@@ -3666,7 +3666,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
-               pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
 
@@ -3741,7 +3741,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
-               pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n");
+               NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }
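
The rtnetlink.c changes above report parse failures through the netlink extended ack (NL_SET_ERR_MSG) instead of pr_info(), so the message travels back to the requesting socket rather than only the kernel log. A toy userspace sketch of the idea, with an invented ex_ext_ack structure standing in for struct netlink_ext_ack:

#include <errno.h>
#include <stdio.h>

struct ex_ext_ack { const char *msg; };

#define EX_SET_ERR_MSG(ack, text) ((ack)->msg = (text))

static int ex_fdb_vid_parse(int vid_attr_len, struct ex_ext_ack *extack)
{
        if (vid_attr_len != (int)sizeof(unsigned short)) {
                EX_SET_ERR_MSG(extack, "invalid vlan attribute size");
                return -EINVAL;
        }
        return 0;
}

int main(void)
{
        struct ex_ext_ack ack = { NULL };

        if (ex_fdb_vid_parse(1, &ack))
                printf("error: %s\n", ack.msg);
        return 0;
}
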
 
index 822a90e..4071750 100644
@@ -1350,8 +1350,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
        /* Set the tail pointer and length */
        skb_put(n, skb->len);
 
-       if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len))
-               BUG();
+       BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
 
        copy_skb_header(n, skb);
        return n;
@@ -1449,8 +1448,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
        BUG_ON(nhead < 0);
 
-       if (skb_shared(skb))
-               BUG();
+       BUG_ON(skb_shared(skb));
 
        size = SKB_DATA_ALIGN(size);
 
@@ -1595,9 +1593,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                head_copy_off = newheadroom - head_copy_len;
 
        /* Copy the linear header and data. */
-       if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
-                         skb->len + head_copy_len))
-               BUG();
+       BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off,
+                            skb->len + head_copy_len));
 
        copy_skb_header(n, skb);
 
@@ -1878,8 +1875,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
                        return NULL;
        }
 
-       if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
-               BUG();
+       BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
+                            skb_tail_pointer(skb), delta));
 
        /* Optimization: no fragments, no reasons to preestimate
         * size of pulled pages. Superb.
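
The skbuff.c hunks fold "if (expr) BUG();" into BUG_ON(expr). Unlike assert() under NDEBUG, BUG_ON() always evaluates its argument, so wrapping calls with side effects such as skb_copy_bits() is behaviour-neutral here; only the source gets shorter. A small userspace sketch with a stand-in macro (illustrative only):

#include <stdio.h>
#include <stdlib.h>

#define EX_BUG_ON(cond)                                         \
        do {                                                    \
                if (cond) {                                     \
                        fprintf(stderr, "BUG at %s:%d\n",       \
                                __FILE__, __LINE__);            \
                        abort();                                \
                }                                               \
        } while (0)

static int copy_bits(int *copied)   /* stand-in for skb_copy_bits(), 0 on success */
{
        *copied = 1;
        return 0;
}

int main(void)
{
        int copied = 0;

        EX_BUG_ON(copy_bits(&copied));  /* the call still happens */
        printf("copied=%d\n", copied);
        return 0;
}
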
index 416bb30..1859c47 100644
@@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
                greh = (struct gre_base_hdr *)skb_transport_header(skb);
                pcsum = (__sum16 *)(greh + 1);
 
-               if (gso_partial) {
+               if (gso_partial && skb_is_gso(skb)) {
                        unsigned int partial_adj;
 
                        /* Adjust checksum to account for the fact that
index dc23177..c105a31 100644
@@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
        if (gre_handle_offloads(skb, false))
                goto err_free_rt;
 
-       if (skb->len > dev->mtu) {
-               pskb_trim(skb, dev->mtu);
+       if (skb->len > dev->mtu + dev->hard_header_len) {
+               pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }
 
@@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
        if (skb_cow_head(skb, dev->needed_headroom))
                goto free_skb;
 
-       if (skb->len - dev->hard_header_len > dev->mtu) {
-               pskb_trim(skb, dev->mtu);
+       if (skb->len > dev->mtu + dev->hard_header_len) {
+               pskb_trim(skb, dev->mtu + dev->hard_header_len);
                truncate = true;
        }
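
The ip_gre erspan hunks above compare skb->len, treated here as still including the hard (Ethernet) header, against dev->mtu + dev->hard_header_len instead of dev->mtu alone, and trim to the same bound. A tiny arithmetic illustration with example values (1500-byte MTU, 14-byte header; not kernel code):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int mtu = 1500, hard_header_len = 14;
        unsigned int skb_len = 1510;    /* 1496 bytes of payload + 14-byte header */

        bool old_truncate = skb_len > mtu;                      /* 1: spurious trim */
        bool new_truncate = skb_len > mtu + hard_header_len;    /* 0: fits */

        printf("old=%d new=%d\n", old_truncate, new_truncate);
        return 0;
}
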
 
index 811689e..f75fc6b 100644
@@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
        if (synproxy == NULL)
                return NF_ACCEPT;
 
-       if (nf_is_loopback_packet(skb))
+       if (nf_is_loopback_packet(skb) ||
+           ip_hdr(skb)->protocol != IPPROTO_TCP)
                return NF_ACCEPT;
 
        thoff = ip_hdrlen(skb);
index 1c7ed77..4306db8 100644
@@ -2513,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
        struct rtable *ort = (struct rtable *) dst_orig;
        struct rtable *rt;
 
-       rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
+       rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;
 
index 8cf742f..3b34850 100644
@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
        struct tcp_sock *tp = tcp_sk(sk);
 
        tp->out_of_order_queue = RB_ROOT;
+       sk->tcp_rtx_queue = RB_ROOT;
        tcp_init_xmit_timers(sk);
        INIT_LIST_HEAD(&tp->tsq_node);
        INIT_LIST_HEAD(&tp->tsorted_sent_queue);
@@ -469,8 +470,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
        tcp_init_buffer_space(sk);
 }
 
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 {
+       struct sk_buff *skb = tcp_write_queue_tail(sk);
+
        if (tsflags && skb) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -699,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
 
-       if (!tcp_send_head(sk))
-               return;
-
        skb = tcp_write_queue_tail(sk);
+       if (!skb)
+               return;
        if (!(flags & MSG_MORE) || forced_push(tp))
                tcp_mark_push(tp, skb);
 
@@ -962,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                int copy, i;
                bool can_coalesce;
 
-               if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+               if (!skb || (copy = size_goal - skb->len) <= 0 ||
                    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_sndbuf;
 
                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-                                                 skb_queue_empty(&sk->sk_write_queue));
+                                       tcp_rtx_and_write_queues_empty(sk));
                        if (!skb)
                                goto wait_for_memory;
 
@@ -1041,7 +1043,7 @@ wait_for_memory:
 
 out:
        if (copied) {
-               tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk));
+               tcp_tx_timestamp(sk, sk->sk_tsflags);
                if (!(flags & MSG_SENDPAGE_NOTLAST))
                        tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
        }
@@ -1197,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
                        goto out_err;
                }
 
-               skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+               skb = tcp_write_queue_tail(sk);
                uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
                if (!uarg) {
                        err = -ENOBUFS;
@@ -1273,7 +1275,7 @@ restart:
                int max = size_goal;
 
                skb = tcp_write_queue_tail(sk);
-               if (tcp_send_head(sk)) {
+               if (skb) {
                        if (skb->ip_summed == CHECKSUM_NONE)
                                max = mss_now;
                        copy = max - skb->len;
@@ -1293,7 +1295,7 @@ new_segment:
                                process_backlog = false;
                                goto restart;
                        }
-                       first_skb = skb_queue_empty(&sk->sk_write_queue);
+                       first_skb = tcp_rtx_and_write_queues_empty(sk);
                        skb = sk_stream_alloc_skb(sk,
                                                  select_size(sk, sg, first_skb),
                                                  sk->sk_allocation,
@@ -1418,7 +1420,7 @@ wait_for_memory:
 
 out:
        if (copied) {
-               tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk));
+               tcp_tx_timestamp(sk, sockc.tsflags);
                tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
        }
 out_nopush:
@@ -1519,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 
        /* XXX -- need to support SO_PEEK_OFF */
 
+       skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+               err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+               if (err)
+                       return err;
+               copied += skb->len;
+       }
+
        skb_queue_walk(&sk->sk_write_queue, skb) {
                err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
                if (err)
@@ -2318,6 +2327,37 @@ static inline bool tcp_need_reset(int state)
                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+       struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+       while (p) {
+               struct sk_buff *skb = rb_to_skb(p);
+
+               p = rb_next(p);
+               /* Since we are deleting whole queue, no need to
+                * list_del(&skb->tcp_tsorted_anchor)
+                */
+               tcp_rtx_queue_unlink(skb, sk);
+               sk_wmem_free_skb(sk, skb);
+       }
+}
+
+void tcp_write_queue_purge(struct sock *sk)
+{
+       struct sk_buff *skb;
+
+       tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+       while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+               tcp_skb_tsorted_anchor_cleanup(skb);
+               sk_wmem_free_skb(sk, skb);
+       }
+       tcp_rtx_queue_purge(sk);
+       INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
+       sk_mem_reclaim(sk);
+       tcp_clear_all_retrans_hints(tcp_sk(sk));
+}
+
 int tcp_disconnect(struct sock *sk, int flags)
 {
        struct inet_sock *inet = inet_sk(sk);
@@ -2376,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
         * issue in __tcp_select_window()
         */
        icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-       tcp_init_send_head(sk);
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        __sk_dst_reset(sk);
        dst_release(sk->sk_rx_dst);
index 29fff14..7ee4aad 100644
@@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct rb_node *p;
-       struct sk_buff *skb;
        struct dst_entry *dst;
+       struct sk_buff *skb;
 
        if (!tp->syn_fastopen)
                return;
 
        if (!tp->data_segs_in) {
-               p = rb_first(&tp->out_of_order_queue);
-               if (p && !rb_next(p)) {
-                       skb = rb_entry(p, struct sk_buff, rbnode);
+               skb = skb_rb_first(&tp->out_of_order_queue);
+               if (skb && !skb_rb_next(skb)) {
                        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
                                tcp_fastopen_active_disable(sk);
                                return;
index fb0d7ed..d0682ce 100644
@@ -1142,6 +1142,7 @@ struct tcp_sacktag_state {
        u64     last_sackt;
        struct rate_sample *rate;
        int     flag;
+       unsigned int mss_now;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
@@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                if (pkt_len >= skb->len && !in_sack)
                        return 0;
 
-               err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+               err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                  pkt_len, mss, GFP_ATOMIC);
                if (err < 0)
                        return err;
        }
@@ -1288,13 +1290,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
 /* Shift newly-SACKed bytes from this skb to the immediately previous
  * already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
  */
-static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
+static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+                           struct sk_buff *skb,
                            struct tcp_sacktag_state *state,
                            unsigned int pcount, int shifted, int mss,
                            bool dup_sack)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
        u32 start_seq = TCP_SKB_CB(skb)->seq;   /* start of newly-SACKed */
        u32 end_seq = start_seq + shifted;      /* end of newly-SACKed */
 
@@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
        if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
                TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
 
-       tcp_unlink_write_queue(skb, sk);
-       sk_wmem_free_skb(sk, skb);
+       tcp_rtx_queue_unlink_and_free(skb, sk);
 
        NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
 
@@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
                goto fallback;
 
        /* Can only happen with delayed DSACK + discard craziness */
-       if (unlikely(skb == tcp_write_queue_head(sk)))
+       prev = skb_rb_prev(skb);
+       if (!prev)
                goto fallback;
-       prev = tcp_write_queue_prev(sk, skb);
 
        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
                goto fallback;
@@ -1495,18 +1496,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 
        if (!skb_shift(prev, skb, len))
                goto fallback;
-       if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
+       if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
                goto out;
 
        /* Hole filled allows collapsing with the next as well, this is very
         * useful when hole on every nth skb pattern happens
         */
-       if (prev == tcp_write_queue_tail(sk))
+       skb = skb_rb_next(prev);
+       if (!skb)
                goto out;
-       skb = tcp_write_queue_next(sk, prev);
 
        if (!skb_can_shift(skb) ||
-           (skb == tcp_send_head(sk)) ||
            ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
            (mss != tcp_skb_seglen(skb)))
                goto out;
@@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
        len = skb->len;
        if (skb_shift(prev, skb, len)) {
                pcount += tcp_skb_pcount(skb);
-               tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
+               tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+                               len, mss, 0);
        }
 
 out:
@@ -1538,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *tmp;
 
-       tcp_for_write_queue_from(skb, sk) {
+       skb_rbtree_walk_from(skb) {
                int in_sack = 0;
                bool dup_sack = dup_sack_in;
 
-               if (skb == tcp_send_head(sk))
-                       break;
-
                /* queue is in-order => we can short-circuit the walk early */
                if (!before(TCP_SKB_CB(skb)->seq, end_seq))
                        break;
@@ -1606,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
        return skb;
 }
 
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+                                          struct tcp_sacktag_state *state,
+                                          u32 seq)
+{
+       struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+       struct sk_buff *skb;
+       int unack_bytes;
+
+       while (*p) {
+               parent = *p;
+               skb = rb_to_skb(parent);
+               if (before(seq, TCP_SKB_CB(skb)->seq)) {
+                       p = &parent->rb_left;
+                       continue;
+               }
+               if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+                       p = &parent->rb_right;
+                       continue;
+               }
+
+               state->fack_count = 0;
+               unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
+               if (state->mss_now && unack_bytes > 0)
+                       state->fack_count = unack_bytes / state->mss_now;
+
+               return skb;
+       }
+       return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
                                        struct tcp_sacktag_state *state,
                                        u32 skip_to_seq)
 {
-       tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-
-               if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
-                       break;
+       if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+               return skb;
 
-               state->fack_count += tcp_skb_pcount(skb);
-       }
-       return skb;
+       return tcp_sacktag_bsearch(sk, state, skip_to_seq);
 }
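tcp_sacktag_skip() above now falls back to tcp_sacktag_bsearch(), which descends the retransmit tree comparing the target sequence against each skb's [seq, end_seq) range and recomputes fack_count from the unacked byte count divided by mss_now. The sketch below reproduces only the search, over a sorted array instead of the kernel's rbtree; segment values and names are illustrative.

/* "Find the segment covering seq" as in tcp_sacktag_bsearch(),
 * done over a sorted array rather than an rbtree. */
#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };	/* [seq, end_seq) */

static int seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static int find_segment(const struct seg *s, int n, uint32_t seq)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (seq_before(seq, s[mid].seq))
			hi = mid - 1;			/* go left */
		else if (!seq_before(seq, s[mid].end_seq))
			lo = mid + 1;			/* go right */
		else
			return mid;			/* seq falls inside */
	}
	return -1;
}

int main(void)
{
	struct seg rtx[] = { { 1000, 2000 }, { 2000, 3448 }, { 3448, 4896 } };

	printf("%d\n", find_segment(rtx, 3, 2500));	/* 1 */
	printf("%d\n", find_segment(rtx, 3, 5000));	/* -1 */
	return 0;
}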
 
 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1744,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
                }
        }
 
-       skb = tcp_write_queue_head(sk);
+       state->mss_now = tcp_current_mss(sk);
        state->fack_count = 0;
+       skb = NULL;
        i = 0;
 
        if (!tp->sacked_out) {
@@ -1969,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk)
        if (tcp_is_reno(tp))
                tcp_reset_reno_sack(tp);
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
        if (is_reneg) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
@@ -1978,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk)
        }
        tcp_clear_all_retrans_hints(tp);
 
-       tcp_for_write_queue(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-
+       skb_rbtree_walk_from(skb) {
                mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
                             is_reneg);
                if (mark_lost)
@@ -2207,20 +2224,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
        const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
 
        WARN_ON(packets > tp->packets_out);
-       if (tp->lost_skb_hint) {
-               skb = tp->lost_skb_hint;
-               cnt = tp->lost_cnt_hint;
+       skb = tp->lost_skb_hint;
+       if (skb) {
                /* Head already handled? */
-               if (mark_head && skb != tcp_write_queue_head(sk))
+               if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
                        return;
+               cnt = tp->lost_cnt_hint;
        } else {
-               skb = tcp_write_queue_head(sk);
+               skb = tcp_rtx_queue_head(sk);
                cnt = 0;
        }
 
-       tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
+       skb_rbtree_walk_from(skb) {
                /* TODO: do this better */
                /* this is not the most efficient way to do this... */
                tp->lost_skb_hint = skb;
@@ -2244,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
                        /* If needed, chop off the prefix to mark as lost. */
                        lost = (packets - oldcnt) * mss;
                        if (lost < skb->len &&
-                           tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+                           tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                        lost, mss, GFP_ATOMIC) < 0)
                                break;
                        cnt = packets;
                }
@@ -2328,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk)
        if (tp->retrans_out)
                return true;
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
                return true;
 
@@ -2369,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
        if (unmark_loss) {
                struct sk_buff *skb;
 
-               tcp_for_write_queue(skb, sk) {
-                       if (skb == tcp_send_head(sk))
-                               break;
+               skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
                }
                tp->lost_out = 0;
@@ -2616,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk)
        unsigned int mss = tcp_current_mss(sk);
        u32 prior_lost = tp->lost_out;
 
-       tcp_for_write_queue(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
+       skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                if (tcp_skb_seglen(skb) > mss &&
                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
@@ -2712,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
                         * is updated in tcp_ack()). Otherwise fall back to
                         * the conventional recovery.
                         */
-                       if (tcp_send_head(sk) &&
+                       if (!tcp_write_queue_empty(sk) &&
                            after(tcp_wnd_end(tp), tp->snd_nxt)) {
                                *rexmit = REXMIT_NEW;
                                return;
@@ -2804,9 +2816,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
        bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
                                    (tcp_fackets_out(tp) > tp->reordering));
 
-       if (WARN_ON(!tp->packets_out && tp->sacked_out))
+       if (!tp->packets_out && tp->sacked_out)
                tp->sacked_out = 0;
-       if (WARN_ON(!tp->sacked_out && tp->fackets_out))
+       if (!tp->sacked_out && tp->fackets_out)
                tp->fackets_out = 0;
 
        /* Now state machine starts.
@@ -3076,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prior_sacked = tp->sacked_out;
        u32 reord = tp->packets_out;
+       struct sk_buff *skb, *next;
        bool fully_acked = true;
        long sack_rtt_us = -1L;
        long seq_rtt_us = -1L;
        long ca_rtt_us = -1L;
-       struct sk_buff *skb;
        u32 pkts_acked = 0;
        u32 last_in_flight = 0;
        bool rtt_update;
@@ -3088,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
        first_ackt = 0;
 
-       while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+       for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                u8 sacked = scb->sacked;
                u32 acked_pcount;
@@ -3106,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                                break;
                        fully_acked = false;
                } else {
-                       /* Speedup tcp_unlink_write_queue() and next loop */
-                       prefetchw(skb->next);
                        acked_pcount = tcp_skb_pcount(skb);
                }
 
@@ -3159,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                if (!fully_acked)
                        break;
 
-               tcp_unlink_write_queue(skb, sk);
-               sk_wmem_free_skb(sk, skb);
+               next = skb_rb_next(skb);
                if (unlikely(skb == tp->retransmit_skb_hint))
                        tp->retransmit_skb_hint = NULL;
                if (unlikely(skb == tp->lost_skb_hint))
                        tp->lost_skb_hint = NULL;
+               tcp_rtx_queue_unlink_and_free(skb, sk);
        }
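In the tcp_clean_rtx_queue() hunk above the walk moves to skb_rb_first()/skb_rb_next(), so the successor has to be fetched before tcp_rtx_queue_unlink_and_free() releases the current skb. The standalone sketch below shows that "grab next, then free" pattern on a plain singly linked list; the list is only a stand-in for the rtx rbtree.

/* Free fully-acked entries while walking: read ->next before free(). */
#include <stdio.h>
#include <stdlib.h>

struct node {
	unsigned int end_seq;
	struct node *next;
};

static struct node *push(struct node *head, unsigned int end_seq)
{
	struct node *n = malloc(sizeof(*n));

	n->end_seq = end_seq;
	n->next = head;
	return n;
}

int main(void)
{
	struct node *head = NULL, *n, *next;
	unsigned int snd_una = 3000;	/* everything below this is acked */

	head = push(push(push(NULL, 4000), 3000), 1500);	/* 1500 -> 3000 -> 4000 */

	for (n = head; n; n = next) {
		next = n->next;			/* must be read before free(n) */
		if (n->end_seq > snd_una)
			break;			/* first not-fully-acked entry */
		printf("freeing entry ending at %u\n", n->end_seq);
		free(n);
		head = next;
	}

	for (n = head; n; n = n->next)
		printf("still queued up to %u\n", n->end_seq);

	while (head) {				/* cleanup */
		next = head->next;
		free(head);
		head = next;
	}
	return 0;
}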
 
        if (!skb)
@@ -3256,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 
 static void tcp_ack_probe(struct sock *sk)
 {
-       const struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *head = tcp_send_head(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
 
        /* Was it a usable window open? */
-
-       if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+       if (!head)
+               return;
+       if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
                icsk->icsk_backoff = 0;
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
                /* Socket must be waked up by subsequent tcp_data_snd_check().
@@ -3381,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
                        tp->pred_flags = 0;
                        tcp_fast_path_check(sk);
 
-                       if (tcp_send_head(sk))
+                       if (!tcp_write_queue_empty(sk))
                                tcp_slow_start_after_idle_check(sk);
 
                        if (nwin > tp->max_window) {
@@ -3566,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        sack_state.first_sackt = 0;
        sack_state.rate = &rs;
 
-       /* We very likely will need to access write queue head. */
-       prefetchw(sk->sk_write_queue.next);
+       /* We very likely will need to access rtx queue. */
+       prefetch(sk->tcp_rtx_queue.rb_node);
 
        /* If the ack is older than previous acks
         * then we can probably ignore it.
@@ -3681,8 +3693,7 @@ no_queue:
         * being used to time the probes, and is probably far higher than
         * it needs to be for normal retransmission.
         */
-       if (tcp_send_head(sk))
-               tcp_ack_probe(sk);
+       tcp_ack_probe(sk);
 
        if (tp->tlp_high_seq)
                tcp_process_tlp_ack(sk, ack, flag);
@@ -4335,7 +4346,7 @@ static void tcp_ofo_queue(struct sock *sk)
 
        p = rb_first(&tp->out_of_order_queue);
        while (p) {
-               skb = rb_entry(p, struct sk_buff, rbnode);
+               skb = rb_to_skb(p);
                if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
                        break;
 
@@ -4399,7 +4410,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct rb_node **p, *q, *parent;
+       struct rb_node **p, *parent;
        struct sk_buff *skb1;
        u32 seq, end_seq;
        bool fragstolen;
@@ -4458,7 +4469,7 @@ coalesce_done:
        parent = NULL;
        while (*p) {
                parent = *p;
-               skb1 = rb_entry(parent, struct sk_buff, rbnode);
+               skb1 = rb_to_skb(parent);
                if (before(seq, TCP_SKB_CB(skb1)->seq)) {
                        p = &parent->rb_left;
                        continue;
@@ -4503,9 +4514,7 @@ insert:
 
 merge_right:
        /* Remove other segments covered by skb. */
-       while ((q = rb_next(&skb->rbnode)) != NULL) {
-               skb1 = rb_entry(q, struct sk_buff, rbnode);
-
+       while ((skb1 = skb_rb_next(skb)) != NULL) {
                if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
                        break;
                if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
@@ -4520,7 +4529,7 @@ merge_right:
                tcp_drop(sk, skb1);
        }
        /* If there is no skb after us, we are the last_skb ! */
-       if (!q)
+       if (!skb1)
                tp->ooo_last_skb = skb;
 
 add_sack:
@@ -4706,7 +4715,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li
        if (list)
                return !skb_queue_is_last(list, skb) ? skb->next : NULL;
 
-       return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode);
+       return skb_rb_next(skb);
 }
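The hunks above fold the open-coded rb_entry(p, struct sk_buff, rbnode) lookups into rb_to_skb()/skb_rb_next() helpers; rb_entry() itself is container_of() over the rb_node embedded in the sk_buff. The toy program below re-derives that pointer arithmetic for a made-up struct so the mechanism is visible; the names here are illustrative, not the kernel's.

/* container_of()-style recovery of the enclosing struct from a
 * pointer to its embedded node, the mechanism behind rb_to_skb(). */
#include <stddef.h>
#include <stdio.h>

struct toy_node {			/* stand-in for struct rb_node */
	struct toy_node *left, *right;
};

struct toy_skb {			/* stand-in for struct sk_buff */
	unsigned int seq;
	struct toy_node rbnode;		/* embedded, like skb->rbnode */
};

#define toy_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define toy_to_skb(node) toy_container_of(node, struct toy_skb, rbnode)

int main(void)
{
	struct toy_skb skb = { .seq = 12345 };
	struct toy_node *node = &skb.rbnode;	/* what a tree walk hands back */

	printf("recovered seq = %u\n", toy_to_skb(node)->seq);	/* 12345 */
	return 0;
}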
 
 static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
@@ -4727,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
 }
 
 /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
@@ -4735,7 +4744,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 
        while (*p) {
                parent = *p;
-               skb1 = rb_entry(parent, struct sk_buff, rbnode);
+               skb1 = rb_to_skb(parent);
                if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
                        p = &parent->rb_left;
                else
@@ -4854,26 +4863,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb, *head;
-       struct rb_node *p;
        u32 start, end;
 
-       p = rb_first(&tp->out_of_order_queue);
-       skb = rb_entry_safe(p, struct sk_buff, rbnode);
+       skb = skb_rb_first(&tp->out_of_order_queue);
 new_range:
        if (!skb) {
-               p = rb_last(&tp->out_of_order_queue);
-               /* Note: This is possible p is NULL here. We do not
-                * use rb_entry_safe(), as ooo_last_skb is valid only
-                * if rbtree is not empty.
-                */
-               tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode);
+               tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
                return;
        }
        start = TCP_SKB_CB(skb)->seq;
        end = TCP_SKB_CB(skb)->end_seq;
 
        for (head = skb;;) {
-               skb = tcp_skb_next(skb, NULL);
+               skb = skb_rb_next(skb);
 
                /* Range is terminated when we see a gap or when
                 * we are at the queue end.
@@ -4916,14 +4918,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
        do {
                prev = rb_prev(node);
                rb_erase(node, &tp->out_of_order_queue);
-               tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+               tcp_drop(sk, rb_to_skb(node));
                sk_mem_reclaim(sk);
                if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
                    !tcp_under_memory_pressure(sk))
                        break;
                node = prev;
        } while (node);
-       tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
+       tp->ooo_last_skb = rb_to_skb(prev);
 
        /* Reset SACK state.  A conforming SACK implementation will
         * do the same at a timeout based retransmit.  When a connection
@@ -5538,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
                                    struct tcp_fastopen_cookie *cookie)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+       struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
        u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
        bool syn_drop = false;
 
@@ -5573,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
        tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
        if (data) { /* Retransmit unacked data in SYN */
-               tcp_for_write_queue_from(data, sk) {
-                       if (data == tcp_send_head(sk) ||
-                           __tcp_retransmit_skb(sk, data, 1))
+               skb_rbtree_walk_from(data) {
+                       if (__tcp_retransmit_skb(sk, data, 1))
                                break;
                }
                tcp_rearm_rto(sk);
index c7460fd..5418ecf 100644
@@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 
-               skb = tcp_write_queue_head(sk);
+               skb = tcp_rtx_queue_head(sk);
                BUG_ON(!skb);
 
                tcp_mstamp_refresh(tp);
index 8162e28..696b0a1 100644
@@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                           int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int prior_packets = tp->packets_out;
 
-       tcp_advance_send_head(sk, skb);
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
+       __skb_unlink(skb, &sk->sk_write_queue);
+       tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
        tp->packets_out += tcp_skb_pcount(skb);
        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
@@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
        TCP_SKB_CB(skb)->eor = 0;
 }
 
+/* Insert buff after skb on the write or rtx queue of sk.  */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+                                        struct sk_buff *buff,
+                                        struct sock *sk,
+                                        enum tcp_queue tcp_queue)
+{
+       if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+               __skb_queue_after(&sk->sk_write_queue, skb, buff);
+       else
+               tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                struct sk_buff *skb, u32 len,
                 unsigned int mss_now, gfp_t gfp)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
-       tcp_insert_write_queue_after(skb, buff, sk);
+       tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
        list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
        return 0;
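With tcp_fragment() and tso_fragment() now taking an enum tcp_queue, the second half of a split skb is linked onto whichever structure the original lives on: behind it on the FIFO write queue, or inserted by sequence into the rtx tree. The standalone sketch below mirrors only that dispatch; an append-only array and an insertion-sorted array stand in for sk_write_queue and the rbtree.

/* Queue-selection sketch mirroring TCP_FRAG_IN_WRITE_QUEUE vs
 * TCP_FRAG_IN_RTX_QUEUE; containers are deliberately simplistic. */
#include <stdio.h>

enum frag_queue { FRAG_IN_WRITE_QUEUE, FRAG_IN_RTX_QUEUE };

#define MAX_SEGS 16

static unsigned int write_q[MAX_SEGS], rtx_q[MAX_SEGS];
static int write_n, rtx_n;

static void link_split_segment(enum frag_queue q, unsigned int seq)
{
	int i;

	if (q == FRAG_IN_WRITE_QUEUE) {
		write_q[write_n++] = seq;	/* plain tail append */
		return;
	}
	i = rtx_n++;				/* rtx side stays seq-ordered */
	while (i > 0 && rtx_q[i - 1] > seq) {
		rtx_q[i] = rtx_q[i - 1];
		i--;
	}
	rtx_q[i] = seq;
}

int main(void)
{
	int i;

	link_split_segment(FRAG_IN_RTX_QUEUE, 3000);
	link_split_segment(FRAG_IN_RTX_QUEUE, 1000);	/* lands before 3000 */
	link_split_segment(FRAG_IN_WRITE_QUEUE, 5000);

	for (i = 0; i < rtx_n; i++)
		printf("rtx[%d] = %u\n", i, rtx_q[i]);
	printf("write tail = %u\n", write_q[write_n - 1]);
	return 0;
}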
@@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
                 * is caused by insufficient sender buffer:
                 * 1) just sent some data (see tcp_write_xmit)
                 * 2) not cwnd limited (this else condition)
-                * 3) no more data to send (null tcp_send_head )
+                * 3) no more data to send (tcp_write_queue_empty())
                 * 4) application is hitting buffer limit (SOCK_NOSPACE)
                 */
-               if (!tcp_send_head(sk) && sk->sk_socket &&
+               if (tcp_write_queue_empty(sk) && sk->sk_socket &&
                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
                    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                        tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                       struct sk_buff *skb, unsigned int len,
                        unsigned int mss_now, gfp_t gfp)
 {
        struct sk_buff *buff;
@@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* All of a TSO frame must be composed of paged data.  */
        if (skb->len != skb->data_len)
-               return tcp_fragment(sk, skb, len, mss_now, gfp);
+               return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
 
        buff = sk_stream_alloc_skb(sk, 0, gfp, true);
        if (unlikely(!buff))
@@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
-       tcp_insert_write_queue_after(skb, buff, sk);
+       tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 
        return 0;
 }
@@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                        goto send_now;
        }
 
-       head = tcp_write_queue_head(sk);
-
+       /* TODO : use tsorted_sent_queue ? */
+       head = tcp_rtx_queue_head(sk);
+       if (!head)
+               goto send_now;
        age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
        /* If next ACK is likely to come too late (half srtt), do not defer */
        if (age < (tp->srtt_us >> 4))
@@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
        limit <<= factor;
 
        if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-               /* Always send the 1st or 2nd skb in write queue.
+               /* Always send skb if rtx queue is empty.
                 * No need to wait for TX completion to call us back,
                 * after softirq/tasklet schedule.
                 * This helps when TX completions are delayed too much.
                 */
-               if (skb == sk->sk_write_queue.next ||
-                   skb->prev == sk->sk_write_queue.next)
+               if (tcp_rtx_queue_empty(sk))
                        return false;
 
                set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
         * it's the "most interesting" or current chrono we are
         * tracking and starts busy chrono if we have pending data.
         */
-       if (tcp_write_queue_empty(sk))
+       if (tcp_rtx_and_write_queues_empty(sk))
                tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
        else if (type == tp->chrono_type)
                tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                                    nonagle);
 
                if (skb->len > limit &&
-                   unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                   unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                         skb, limit, mss_now, gfp)))
                        break;
 
                if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2350,7 +2368,7 @@ repair:
                tcp_cwnd_validate(sk, is_cwnd_limited);
                return false;
        }
-       return !tp->packets_out && tcp_send_head(sk);
+       return !tp->packets_out && !tcp_write_queue_empty(sk);
 }
 
 bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
                return false;
 
        if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-            tcp_send_head(sk))
+            !tcp_write_queue_empty(sk))
                return false;
 
        /* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk)
        int mss = tcp_current_mss(sk);
 
        skb = tcp_send_head(sk);
-       if (skb) {
-               if (tcp_snd_wnd_test(tp, skb, mss)) {
-                       pcount = tp->packets_out;
-                       tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-                       if (tp->packets_out > pcount)
-                               goto probe_sent;
-                       goto rearm_timer;
-               }
-               skb = tcp_write_queue_prev(sk, skb);
-       } else {
-               skb = tcp_write_queue_tail(sk);
+       if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+               pcount = tp->packets_out;
+               tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+               if (tp->packets_out > pcount)
+                       goto probe_sent;
+               goto rearm_timer;
        }
+       skb = skb_rb_last(&sk->tcp_rtx_queue);
 
        /* At most one outstanding TLP retransmission. */
        if (tp->tlp_high_seq)
@@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk)
                goto rearm_timer;
 
        if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-               if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+               if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                         (pcount - 1) * mss, mss,
                                          GFP_ATOMIC)))
                        goto rearm_timer;
-               skb = tcp_write_queue_next(sk, skb);
+               skb = skb_rb_next(skb);
        }
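In the tcp_send_loss_probe() hunk above, a multi-segment tail skb is cut at (pcount - 1) * mss so that the probe retransmits only its final MSS-sized piece, and the walk then steps to it with skb_rb_next(). The numbers-only sketch below works through that split point; pcount is simply derived as a ceiling here, whereas the kernel reads it from the skb.

/* Split-point arithmetic for the loss probe: keep (pcount - 1) full
 * segments in place, probe with whatever remains. */
#include <stdio.h>

int main(void)
{
	unsigned int mss = 1448;
	unsigned int skb_len = 4000;			/* spans 3 segments */
	unsigned int pcount = (skb_len + mss - 1) / mss;

	if (pcount > 1 && skb_len > (pcount - 1) * mss) {
		unsigned int split = (pcount - 1) * mss;

		printf("fragment at %u bytes, probe carries %u bytes\n",
		       split, skb_len - split);		/* 2896 and 1104 */
	}
	return 0;
}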
 
        if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+       struct sk_buff *next_skb = skb_rb_next(skb);
        int skb_size, next_skb_size;
 
        skb_size = skb->len;
@@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
        }
        tcp_highest_sack_combine(sk, next_skb, skb);
 
-       tcp_unlink_write_queue(next_skb, sk);
-
        if (next_skb->ip_summed == CHECKSUM_PARTIAL)
                skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
        tcp_skb_collapse_tstamp(skb, next_skb);
 
-       sk_wmem_free_skb(sk, next_skb);
+       tcp_rtx_queue_unlink_and_free(next_skb, sk);
        return true;
 }
 
@@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
                return false;
        if (skb_cloned(skb))
                return false;
-       if (skb == tcp_send_head(sk))
-               return false;
        /* Some heuristics for collapsing over SACK'd could be invented */
        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                return false;
@@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                return;
 
-       tcp_for_write_queue_from_safe(skb, tmp, sk) {
+       skb_rbtree_walk_from_safe(skb, tmp) {
                if (!tcp_can_collapse(sk, skb))
                        break;
 
@@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
        len = cur_mss * segs;
        if (skb->len > len) {
-               if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+               if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+                                cur_mss, GFP_ATOMIC))
                        return -ENOMEM; /* We'll try again later. */
        } else {
                if (skb_unclone(skb, GFP_ATOMIC))
@@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb, *rtx_head = NULL, *hole = NULL;
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb;
-       struct sk_buff *hole = NULL;
        u32 max_segs;
        int mib_idx;
 
        if (!tp->packets_out)
                return;
 
-       if (tp->retransmit_skb_hint) {
-               skb = tp->retransmit_skb_hint;
-       } else {
-               skb = tcp_write_queue_head(sk);
+       skb = tp->retransmit_skb_hint;
+       if (!skb) {
+               rtx_head = tcp_rtx_queue_head(sk);
+               skb = rtx_head;
        }
-
        max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-       tcp_for_write_queue_from(skb, sk) {
+       skb_rbtree_walk_from(skb) {
                __u8 sacked;
                int segs;
 
-               if (skb == tcp_send_head(sk))
-                       break;
-
                if (tcp_pacing_check(sk))
                        break;
 
@@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                if (tcp_in_cwnd_reduction(sk))
                        tp->prr_out += tcp_skb_pcount(skb);
 
-               if (skb == tcp_write_queue_head(sk) &&
+               if (skb == rtx_head &&
                    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  inet_csk(sk)->icsk_rto,
@@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
         * Note: in the latter case, FIN packet will be sent after a timeout,
         * as TCP stack thinks it has already been transmitted.
         */
-       if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+       if (!tskb && tcp_under_memory_pressure(sk))
+               tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+       if (tskb) {
 coalesce:
                TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
                TCP_SKB_CB(tskb)->end_seq++;
                tp->write_seq++;
-               if (!tcp_send_head(sk)) {
+               if (tcp_write_queue_empty(sk)) {
                        /* This means tskb was already sent.
                         * Pretend we included the FIN on previous transmit.
                         * We need to set tp->snd_nxt to the value it would have
@@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk)
 {
        struct sk_buff *skb;
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-               pr_debug("%s: wrong queue state\n", __func__);
+               pr_err("%s: wrong queue state\n", __func__);
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk)
                        if (!nskb)
                                return -ENOMEM;
                        INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
-                       tcp_unlink_write_queue(skb, sk);
+                       tcp_rtx_queue_unlink_and_free(skb, sk);
                        __skb_header_release(nskb);
-                       __tcp_add_write_queue_head(sk, nskb);
-                       sk_wmem_free_skb(sk, skb);
+                       tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
                        sk->sk_wmem_queued += nskb->truesize;
                        sk_mem_charge(sk, nskb->truesize);
                        skb = nskb;
@@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
 
        tcb->end_seq += skb->len;
        __skb_header_release(skb);
-       __tcp_add_write_queue_tail(sk, skb);
        sk->sk_wmem_queued += skb->truesize;
        sk_mem_charge(sk, skb->truesize);
        tp->write_seq = tcb->end_seq;
@@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
        TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
        if (!err) {
                tp->syn_data = (fo->copied > 0);
+               tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                goto done;
        }
 
-       /* data was not sent, this is our new send_head */
-       sk->sk_send_head = syn_data;
+       /* data was not sent, put it in write_queue */
+       __skb_queue_tail(&sk->sk_write_queue, syn_data);
        tp->packets_out -= tcp_skb_pcount(syn_data);
 
 fallback:
@@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk)
        tp->retrans_stamp = tcp_time_stamp(tp);
        tcp_connect_queue_skb(sk, buff);
        tcp_ecn_send_syn(sk, buff);
+       tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
        /* Send off SYN; include data in Fast Open. */
        err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
                    skb->len > mss) {
                        seg_size = min(seg_size, mss);
                        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                       if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+                       if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                        skb, seg_size, mss, GFP_ATOMIC))
                                return -1;
                } else if (!tcp_skb_pcount(skb))
                        tcp_set_skb_tso_segs(skb, mss);
@@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
 
        err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
-       if (tp->packets_out || !tcp_send_head(sk)) {
+       if (tp->packets_out || tcp_write_queue_empty(sk)) {
                /* Cancel probe timer, if it is not required. */
                icsk->icsk_probes_out = 0;
                icsk->icsk_backoff = 0;
index 655dd8d..7014cc0 100644
@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
                return false;
 
        start_ts = tcp_sk(sk)->retrans_stamp;
-       if (unlikely(!start_ts))
-               start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+       if (unlikely(!start_ts)) {
+               struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+               if (!head)
+                       return false;
+               start_ts = tcp_skb_timestamp(head);
+       }
 
        if (likely(timeout == 0)) {
                linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
 static void tcp_probe_timer(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb = tcp_send_head(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int max_probes;
        u32 start_ts;
 
-       if (tp->packets_out || !tcp_send_head(sk)) {
+       if (tp->packets_out || !skb) {
                icsk->icsk_probes_out = 0;
                return;
        }
@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
         * corresponding system limit. We also implement similar policy when
         * we use RTO to probe window in tcp_retransmit_timer().
         */
-       start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+       start_ts = tcp_skb_timestamp(skb);
        if (!start_ts)
-               tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+               skb->skb_mstamp = tp->tcp_mstamp;
        else if (icsk->icsk_user_timeout &&
                 (s32)(tcp_time_stamp(tp) - start_ts) >
                 jiffies_to_msecs(icsk->icsk_user_timeout))
@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
        if (!tp->packets_out)
                goto out;
 
-       WARN_ON(tcp_write_queue_empty(sk));
+       WARN_ON(tcp_rtx_queue_empty(sk));
 
        tp->tlp_high_seq = 0;
 
@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
                        goto out;
                }
                tcp_enter_loss(sk);
-               tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+               tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
                __sk_dst_reset(sk);
                goto out_reset_timer;
        }
@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
        tcp_enter_loss(sk);
 
-       if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+       if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
                /* Retransmission failed because of local congestion,
                 * do not backoff.
                 */
@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
        elapsed = keepalive_time_when(tp);
 
        /* It is alive without keepalive 8) */
-       if (tp->packets_out || tcp_send_head(sk))
+       if (tp->packets_out || !tcp_write_queue_empty(sk))
                goto resched;
 
        elapsed = keepalive_time_elapsed(tp);
index eb0359b..7c9a6e4 100644
@@ -2239,20 +2239,16 @@ int udp_v4_early_demux(struct sk_buff *skb)
        iph = ip_hdr(skb);
        uh = udp_hdr(skb);
 
-       if (skb->pkt_type == PACKET_BROADCAST ||
-           skb->pkt_type == PACKET_MULTICAST) {
+       if (skb->pkt_type == PACKET_MULTICAST) {
                in_dev = __in_dev_get_rcu(skb->dev);
 
                if (!in_dev)
                        return 0;
 
-               /* we are supposed to accept bcast packets */
-               if (skb->pkt_type == PACKET_MULTICAST) {
-                       ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
-                                              iph->protocol);
-                       if (!ours)
-                               return 0;
-               }
+               ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+                                      iph->protocol);
+               if (!ours)
+                       return 0;
 
                sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
                                                   uh->source, iph->saddr,
index 97658bf..e360d55 100644
@@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
                 * will be using a length value equal to only one MSS sized
                 * segment instead of the entire frame.
                 */
-               if (gso_partial) {
+               if (gso_partial && skb_is_gso(skb)) {
                        uh->len = htons(skb_shinfo(skb)->gso_size +
                                        SKB_GSO_CB(skb)->data_offset +
                                        skb->head - (unsigned char *)uh);
index 837418f..d9f6226 100644
@@ -152,7 +152,7 @@ static void ipv6_regen_rndid(struct inet6_dev *idev);
 static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
 
 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
-static int ipv6_count_addresses(struct inet6_dev *idev);
+static int ipv6_count_addresses(const struct inet6_dev *idev);
 static int ipv6_generate_stable_address(struct in6_addr *addr,
                                        u8 dad_count,
                                        const struct inet6_dev *idev);
@@ -945,7 +945,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
                        break;
        }
 
-       list_add_tail(&ifp->if_list, p);
+       list_add_tail_rcu(&ifp->if_list, p);
 }
 
 static u32 inet6_addr_hash(const struct in6_addr *addr)
@@ -1204,7 +1204,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
        if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE))
                action = check_cleanup_prefix_route(ifp, &expires);
 
-       list_del_init(&ifp->if_list);
+       list_del_rcu(&ifp->if_list);
        __in6_ifa_put(ifp);
 
        write_unlock_bh(&ifp->idev->lock);
@@ -1558,8 +1558,7 @@ static int __ipv6_dev_get_saddr(struct net *net,
 {
        struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];
 
-       read_lock_bh(&idev->lock);
-       list_for_each_entry(score->ifa, &idev->addr_list, if_list) {
+       list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) {
                int i;
 
                /*
@@ -1609,11 +1608,6 @@ static int __ipv6_dev_get_saddr(struct net *net,
                                }
                                break;
                        } else if (minihiscore < miniscore) {
-                               if (hiscore->ifa)
-                                       in6_ifa_put(hiscore->ifa);
-
-                               in6_ifa_hold(score->ifa);
-
                                swap(hiscore, score);
                                hiscore_idx = 1 - hiscore_idx;
 
@@ -1625,7 +1619,6 @@ static int __ipv6_dev_get_saddr(struct net *net,
                }
        }
 out:
-       read_unlock_bh(&idev->lock);
        return hiscore_idx;
 }
 
@@ -1662,6 +1655,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
        int dst_type;
        bool use_oif_addr = false;
        int hiscore_idx = 0;
+       int ret = 0;
 
        dst_type = __ipv6_addr_type(daddr);
        dst.addr = daddr;
@@ -1737,15 +1731,14 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
        }
 
 out:
-       rcu_read_unlock();
-
        hiscore = &scores[hiscore_idx];
        if (!hiscore->ifa)
-               return -EADDRNOTAVAIL;
+               ret = -EADDRNOTAVAIL;
+       else
+               *saddr = hiscore->ifa->addr;
 
-       *saddr = hiscore->ifa->addr;
-       in6_ifa_put(hiscore->ifa);
-       return 0;
+       rcu_read_unlock();
+       return ret;
 }
 EXPORT_SYMBOL(ipv6_dev_get_saddr);
 
@@ -1785,15 +1778,15 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
        return err;
 }
 
-static int ipv6_count_addresses(struct inet6_dev *idev)
+static int ipv6_count_addresses(const struct inet6_dev *idev)
 {
+       const struct inet6_ifaddr *ifp;
        int cnt = 0;
-       struct inet6_ifaddr *ifp;
 
-       read_lock_bh(&idev->lock);
-       list_for_each_entry(ifp, &idev->addr_list, if_list)
+       rcu_read_lock();
+       list_for_each_entry_rcu(ifp, &idev->addr_list, if_list)
                cnt++;
-       read_unlock_bh(&idev->lock);
+       rcu_read_unlock();
        return cnt;
 }
 
@@ -1859,20 +1852,18 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
 bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
        const unsigned int prefix_len, struct net_device *dev)
 {
-       struct inet6_dev *idev;
-       struct inet6_ifaddr *ifa;
+       const struct inet6_ifaddr *ifa;
+       const struct inet6_dev *idev;
        bool ret = false;
 
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
-               read_lock_bh(&idev->lock);
-               list_for_each_entry(ifa, &idev->addr_list, if_list) {
+               list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
                        if (ret)
                                break;
                }
-               read_unlock_bh(&idev->lock);
        }
        rcu_read_unlock();
 
@@ -1882,22 +1873,20 @@ EXPORT_SYMBOL(ipv6_chk_custom_prefix);
 
 int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
 {
-       struct inet6_dev *idev;
-       struct inet6_ifaddr *ifa;
+       const struct inet6_ifaddr *ifa;
+       const struct inet6_dev *idev;
        int     onlink;
 
        onlink = 0;
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
-               read_lock_bh(&idev->lock);
-               list_for_each_entry(ifa, &idev->addr_list, if_list) {
+               list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        onlink = ipv6_prefix_equal(addr, &ifa->addr,
                                                   ifa->prefix_len);
                        if (onlink)
                                break;
                }
-               read_unlock_bh(&idev->lock);
        }
        rcu_read_unlock();
        return onlink;
@@ -2321,24 +2310,24 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
+       rcu_read_lock();
+       fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
        if (!fn)
                goto out;
 
-       noflags |= RTF_CACHE;
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt->dst.dev->ifindex != dev->ifindex)
                        continue;
                if ((rt->rt6i_flags & flags) != flags)
                        continue;
                if ((rt->rt6i_flags & noflags) != 0)
                        continue;
-               dst_hold(&rt->dst);
+               if (!dst_hold_safe(&rt->dst))
+                       rt = NULL;
                break;
        }
 out:
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -3562,7 +3551,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
        struct net *net = dev_net(dev);
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifa, *tmp;
-       struct list_head del_list;
        int _keep_addr;
        bool keep_addr;
        int state, i;
@@ -3654,7 +3642,6 @@ restart:
         */
        keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
 
-       INIT_LIST_HEAD(&del_list);
        list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
                struct rt6_info *rt = NULL;
                bool keep;
@@ -3663,8 +3650,6 @@ restart:
 
                keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
                        !addr_is_local(&ifa->addr);
-               if (!keep)
-                       list_move(&ifa->if_list, &del_list);
 
                write_unlock_bh(&idev->lock);
                spin_lock_bh(&ifa->lock);
@@ -3698,19 +3683,14 @@ restart:
                }
 
                write_lock_bh(&idev->lock);
+               if (!keep) {
+                       list_del_rcu(&ifa->if_list);
+                       in6_ifa_put(ifa);
+               }
        }
 
        write_unlock_bh(&idev->lock);
 
-       /* now clean up addresses to be removed */
-       while (!list_empty(&del_list)) {
-               ifa = list_first_entry(&del_list,
-                                      struct inet6_ifaddr, if_list);
-               list_del(&ifa->if_list);
-
-               in6_ifa_put(ifa);
-       }
-
        /* Step 5: Discard anycast and multicast list */
        if (how) {
                ipv6_ac_destroy_dev(idev);
@@ -3820,8 +3800,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
                goto out;
 
        if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
-           dev_net(dev)->ipv6.devconf_all->accept_dad < 1 ||
-           idev->cnf.accept_dad < 1 ||
+           (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 &&
+            idev->cnf.accept_dad < 1) ||
            !(ifp->flags&IFA_F_TENTATIVE) ||
            ifp->flags & IFA_F_NODAD) {
                bump_id = ifp->flags & IFA_F_TENTATIVE;
@@ -5898,10 +5878,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
                spin_lock(&ifa->lock);
                if (ifa->rt) {
                        struct rt6_info *rt = ifa->rt;
-                       struct fib6_table *table = rt->rt6i_table;
                        int cpu;
 
-                       read_lock(&table->tb6_lock);
+                       rcu_read_lock();
                        addrconf_set_nopolicy(ifa->rt, val);
                        if (rt->rt6i_pcpu) {
                                for_each_possible_cpu(cpu) {
@@ -5911,7 +5890,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
                                        addrconf_set_nopolicy(*rtp, val);
                                }
                        }
-                       read_unlock(&table->tb6_lock);
+                       rcu_read_unlock();
                }
                spin_unlock(&ifa->lock);
        }
index c6311d7..2606d2f 100644
@@ -18,7 +18,6 @@
 #include <linux/if_addrlabel.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
-#include <linux/refcount.h>
 
 #if 0
 #define ADDRLABEL(x...) printk(x)
@@ -36,7 +35,6 @@ struct ip6addrlbl_entry {
        int addrtype;
        u32 label;
        struct hlist_node list;
-       refcount_t refcnt;
        struct rcu_head rcu;
 };
 
@@ -111,28 +109,6 @@ static const __net_initconst struct ip6addrlbl_init_table
        }
 };
 
-/* Object management */
-static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p)
-{
-       kfree(p);
-}
-
-static void ip6addrlbl_free_rcu(struct rcu_head *h)
-{
-       ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu));
-}
-
-static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p)
-{
-       return refcount_inc_not_zero(&p->refcnt);
-}
-
-static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p)
-{
-       if (refcount_dec_and_test(&p->refcnt))
-               call_rcu(&p->rcu, ip6addrlbl_free_rcu);
-}
-
 /* Find label */
 static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p,
                               const struct in6_addr *addr,
@@ -219,7 +195,6 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
        newp->addrtype = addrtype;
        newp->label = label;
        INIT_HLIST_NODE(&newp->list);
-       refcount_set(&newp->refcnt, 1);
        return newp;
 }
 
@@ -243,7 +218,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
                                goto out;
                        }
                        hlist_replace_rcu(&p->list, &newp->list);
-                       ip6addrlbl_put(p);
+                       kfree_rcu(p, rcu);
                        goto out;
                } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
                           (p->prefixlen < newp->prefixlen)) {
@@ -281,7 +256,7 @@ static int ip6addrlbl_add(struct net *net,
        ret = __ip6addrlbl_add(net, newp, replace);
        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
        if (ret)
-               ip6addrlbl_free(newp);
+               kfree(newp);
        return ret;
 }
 
@@ -302,7 +277,7 @@ static int __ip6addrlbl_del(struct net *net,
                    p->ifindex == ifindex &&
                    ipv6_addr_equal(&p->prefix, prefix)) {
                        hlist_del_rcu(&p->list);
-                       ip6addrlbl_put(p);
+                       kfree_rcu(p, rcu);
                        ret = 0;
                        break;
                }
@@ -360,7 +335,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net)
        spin_lock(&net->ipv6.ip6addrlbl_table.lock);
        hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
                hlist_del_rcu(&p->list);
-               ip6addrlbl_put(p);
+               kfree_rcu(p, rcu);
        }
        spin_unlock(&net->ipv6.ip6addrlbl_table.lock);
 }
@@ -546,38 +521,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                return -EINVAL;
        addr = nla_data(tb[IFAL_ADDRESS]);
 
-       rcu_read_lock();
-       p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
-       if (p && !ip6addrlbl_hold(p))
-               p = NULL;
-       lseq = net->ipv6.ip6addrlbl_table.seq;
-       rcu_read_unlock();
-
-       if (!p) {
-               err = -ESRCH;
-               goto out;
-       }
-
        skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL);
-       if (!skb) {
-               ip6addrlbl_put(p);
+       if (!skb)
                return -ENOBUFS;
-       }
 
-       err = ip6addrlbl_fill(skb, p, lseq,
-                             NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
-                             RTM_NEWADDRLABEL, 0);
+       err = -ESRCH;
 
-       ip6addrlbl_put(p);
+       rcu_read_lock();
+       p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
+       lseq = net->ipv6.ip6addrlbl_table.seq;
+       if (p)
+               err = ip6addrlbl_fill(skb, p, lseq,
+                                     NETLINK_CB(in_skb).portid,
+                                     nlh->nlmsg_seq,
+                                     RTM_NEWADDRLABEL, 0);
+       rcu_read_unlock();
 
        if (err < 0) {
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
-               goto out;
+       } else {
+               err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
        }
-
-       err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-out:
        return err;
 }
 
index aeb49b4..4e52d52 100644
@@ -250,15 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset)
        return (*op & 0xC0) == 0x80;
 }
 
-int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
-                              struct icmp6hdr *thdr, int len)
+void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
+                               struct icmp6hdr *thdr, int len)
 {
        struct sk_buff *skb;
        struct icmp6hdr *icmp6h;
 
        skb = skb_peek(&sk->sk_write_queue);
        if (!skb)
-               goto out;
+               return;
 
        icmp6h = icmp6_hdr(skb);
        memcpy(icmp6h, thdr, sizeof(struct icmp6hdr));
@@ -286,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
                                                      tmp_csum);
        }
        ip6_push_pending_frames(sk);
-out:
-       return 0;
 }
 
 struct icmpv6_msg {
@@ -437,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
        int iif = 0;
        int addr_type = 0;
        int len;
-       int err = 0;
        u32 mark = IP6_REPLY_MARK(net, skb->mark);
 
        if ((u8 *)hdr < skb->head ||
@@ -574,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
        rcu_read_lock();
        idev = __in6_dev_get(skb->dev);
 
-       err = ip6_append_data(sk, icmpv6_getfrag, &msg,
-                             len + sizeof(struct icmp6hdr),
-                             sizeof(struct icmp6hdr),
-                             &ipc6, &fl6, (struct rt6_info *)dst,
-                             MSG_DONTWAIT, &sockc_unused);
-       if (err) {
+       if (ip6_append_data(sk, icmpv6_getfrag, &msg,
+                           len + sizeof(struct icmp6hdr),
+                           sizeof(struct icmp6hdr),
+                           &ipc6, &fl6, (struct rt6_info *)dst,
+                           MSG_DONTWAIT, &sockc_unused)) {
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
                ip6_flush_pending_frames(sk);
        } else {
-               err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
-                                                len + sizeof(struct icmp6hdr));
+               icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+                                          len + sizeof(struct icmp6hdr));
        }
        rcu_read_unlock();
 out_dst_release:
@@ -681,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
        struct icmpv6_msg msg;
        struct dst_entry *dst;
        struct ipcm6_cookie ipc6;
-       int err = 0;
        u32 mark = IP6_REPLY_MARK(net, skb->mark);
        struct sockcm_cookie sockc_unused = {0};
 
@@ -718,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
        else if (!fl6.flowi6_oif)
                fl6.flowi6_oif = np->ucast_oif;
 
-       err = ip6_dst_lookup(net, sk, &dst, &fl6);
-       if (err)
+       if (ip6_dst_lookup(net, sk, &dst, &fl6))
                goto out;
        dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
        if (IS_ERR(dst))
@@ -736,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
        ipc6.dontfrag = np->dontfrag;
        ipc6.opt = NULL;
 
-       err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr),
-                               sizeof(struct icmp6hdr), &ipc6, &fl6,
-                               (struct rt6_info *)dst, MSG_DONTWAIT,
-                               &sockc_unused);
-
-       if (err) {
+       if (ip6_append_data(sk, icmpv6_getfrag, &msg,
+                           skb->len + sizeof(struct icmp6hdr),
+                           sizeof(struct icmp6hdr), &ipc6, &fl6,
+                           (struct rt6_info *)dst, MSG_DONTWAIT,
+                           &sockc_unused)) {
                __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
                ip6_flush_pending_frames(sk);
        } else {
-               err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
-                                                skb->len + sizeof(struct icmp6hdr));
+               icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
+                                          skb->len + sizeof(struct icmp6hdr));
        }
        dst_release(dst);
 out:
index e5308d7..c2ecd5e 100644
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
 
-#define RT6_DEBUG 2
-
-#if RT6_DEBUG >= 3
-#define RT6_TRACE(x...) pr_debug(x)
-#else
-#define RT6_TRACE(x...) do { ; } while (0)
-#endif
-
 static struct kmem_cache *fib6_node_kmem __read_mostly;
 
 struct fib6_cleaner {
@@ -62,9 +54,12 @@ struct fib6_cleaner {
 #define FWS_INIT FWS_L
 #endif
 
-static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
-static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
-static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
+static struct rt6_info *fib6_find_prefix(struct net *net,
+                                        struct fib6_table *table,
+                                        struct fib6_node *fn);
+static struct fib6_node *fib6_repair_tree(struct net *net,
+                                         struct fib6_table *table,
+                                         struct fib6_node *fn);
 static int fib6_walk(struct net *net, struct fib6_walker *w);
 static int fib6_walk_continue(struct fib6_walker *w);
 
@@ -110,6 +105,20 @@ enum {
        FIB6_NO_SERNUM_CHANGE = 0,
 };
 
+void fib6_update_sernum(struct rt6_info *rt)
+{
+       struct fib6_table *table = rt->rt6i_table;
+       struct net *net = dev_net(rt->dst.dev);
+       struct fib6_node *fn;
+
+       spin_lock_bh(&table->tb6_lock);
+       fn = rcu_dereference_protected(rt->rt6i_node,
+                       lockdep_is_held(&table->tb6_lock));
+       if (fn)
+               fn->fn_sernum = fib6_new_sernum(net);
+       spin_unlock_bh(&table->tb6_lock);
+}
+
 /*
  *     Auxiliary address test functions for the radix tree.
  *
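fib6_update_sernum() above bumps the serial number on the node that owns a route, under tb6_lock, so that dst cookies taken against the old serial number fail validation. A hypothetical caller sketch (example_touch_route() and the use of RTF_MODIFIED are assumptions for illustration, not part of this change):

/* Hypothetical caller: after mutating an installed route outside of
 * fib6_add(), bump the node's serial number so cached dst entries are
 * re-validated on their next cookie check.
 */
static void example_touch_route(struct rt6_info *rt)
{
	rt->rt6i_flags |= RTF_MODIFIED;	/* assumed attribute change */
	fib6_update_sernum(rt);		/* takes tb6_lock internally, see above */
}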
@@ -140,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
               addr[fn_bit >> 5];
 }
 
-static struct fib6_node *node_alloc(void)
+static struct fib6_node *node_alloc(struct net *net)
 {
        struct fib6_node *fn;
 
        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
+       if (fn)
+               net->ipv6.rt6_stats->fib_nodes++;
 
        return fn;
 }
 
-static void node_free_immediate(struct fib6_node *fn)
+static void node_free_immediate(struct net *net, struct fib6_node *fn)
 {
        kmem_cache_free(fib6_node_kmem, fn);
+       net->ipv6.rt6_stats->fib_nodes--;
 }
 
 static void node_free_rcu(struct rcu_head *head)
@@ -161,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head)
        kmem_cache_free(fib6_node_kmem, fn);
 }
 
-static void node_free(struct fib6_node *fn)
+static void node_free(struct net *net, struct fib6_node *fn)
 {
        call_rcu(&fn->rcu, node_free_rcu);
+       net->ipv6.rt6_stats->fib_nodes--;
 }
 
 void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
@@ -185,9 +198,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)
                        *ppcpu_rt = NULL;
                }
        }
-
-       free_percpu(non_pcpu_rt->rt6i_pcpu);
-       non_pcpu_rt->rt6i_pcpu = NULL;
 }
 EXPORT_SYMBOL_GPL(rt6_free_pcpu);
 
@@ -205,8 +215,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
         * Initialize table lock at a single place to give lockdep a key,
         * tables aren't visible prior to being linked to the list.
         */
-       rwlock_init(&tb->tb6_lock);
-
+       spin_lock_init(&tb->tb6_lock);
        h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 
        /*
@@ -225,7 +234,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
        table = kzalloc(sizeof(*table), GFP_ATOMIC);
        if (table) {
                table->tb6_id = id;
-               table->tb6_root.leaf = net->ipv6.ip6_null_entry;
+               rcu_assign_pointer(table->tb6_root.leaf,
+                                  net->ipv6.ip6_null_entry);
                table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
                inet_peer_base_init(&table->tb6_peers);
        }
@@ -322,11 +332,8 @@ unsigned int fib6_tables_seq_read(struct net *net)
                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                struct fib6_table *tb;
 
-               hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
-                       read_lock_bh(&tb->tb6_lock);
+               hlist_for_each_entry_rcu(tb, head, tb6_hlist)
                        fib_seq += tb->fib_seq;
-                       read_unlock_bh(&tb->tb6_lock);
-               }
        }
        rcu_read_unlock();
 
@@ -372,7 +379,7 @@ static int fib6_node_dump(struct fib6_walker *w)
 {
        struct rt6_info *rt;
 
-       for (rt = w->leaf; rt; rt = rt->dst.rt6_next)
+       for_each_fib6_walker_rt(w)
                fib6_rt_dump(rt, w->args);
        w->leaf = NULL;
        return 0;
@@ -382,9 +389,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb,
                            struct fib6_walker *w)
 {
        w->root = &tb->tb6_root;
-       read_lock_bh(&tb->tb6_lock);
+       spin_lock_bh(&tb->tb6_lock);
        fib6_walk(net, w);
-       read_unlock_bh(&tb->tb6_lock);
+       spin_unlock_bh(&tb->tb6_lock);
 }
 
 /* Called with rcu_read_lock() */
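The conversion in this file replaces the tb6_lock rwlock with a spinlock and makes the tree pointers RCU-managed: readers walk the tree locklessly under rcu_read_lock(), while writers serialize on tb6_lock and publish with rcu_assign_pointer(). A minimal sketch of the two sides, assuming this file's context; the example_* helpers are illustrative only and reference counting is omitted:

/* Read side: lockless tree walk under RCU, as fib6_lookup() expects. */
static bool example_has_route(struct fib6_table *table,
			      const struct in6_addr *daddr)
{
	struct fib6_node *fn;
	bool found;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, daddr, NULL);
	found = fn && (fn->fn_flags & RTN_RTINFO);
	rcu_read_unlock();
	return found;
}

/* Write side: serialize on the (now spinlock) tb6_lock and publish new
 * pointers with rcu_assign_pointer(); existing pointers are read with
 * rcu_dereference_protected() as in the hunks below. rt6i_ref handling
 * is omitted for brevity.
 */
static void example_set_root_leaf(struct fib6_table *table,
				  struct rt6_info *new_leaf)
{
	spin_lock_bh(&table->tb6_lock);
	rcu_assign_pointer(table->tb6_root.leaf, new_leaf);
	spin_unlock_bh(&table->tb6_lock);
}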
@@ -421,7 +428,7 @@ static int fib6_dump_node(struct fib6_walker *w)
        int res;
        struct rt6_info *rt;
 
-       for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_walker_rt(w) {
                res = rt6_dump_route(rt, w->args);
                if (res < 0) {
                        /* Frame is full, suspend walking */
@@ -480,9 +487,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                w->count = 0;
                w->skip = 0;
 
-               read_lock_bh(&table->tb6_lock);
+               spin_lock_bh(&table->tb6_lock);
                res = fib6_walk(net, w);
-               read_unlock_bh(&table->tb6_lock);
+               spin_unlock_bh(&table->tb6_lock);
                if (res > 0) {
                        cb->args[4] = 1;
                        cb->args[5] = w->root->fn_sernum;
@@ -497,9 +504,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                } else
                        w->skip = 0;
 
-               read_lock_bh(&table->tb6_lock);
+               spin_lock_bh(&table->tb6_lock);
                res = fib6_walk_continue(w);
-               read_unlock_bh(&table->tb6_lock);
+               spin_unlock_bh(&table->tb6_lock);
                if (res <= 0) {
                        fib6_walker_unlink(net, w);
                        cb->args[4] = 0;
@@ -580,11 +587,13 @@ out:
  *     node.
  */
 
-static struct fib6_node *fib6_add_1(struct fib6_node *root,
-                                    struct in6_addr *addr, int plen,
-                                    int offset, int allow_create,
-                                    int replace_required, int sernum,
-                                    struct netlink_ext_ack *extack)
+static struct fib6_node *fib6_add_1(struct net *net,
+                                   struct fib6_table *table,
+                                   struct fib6_node *root,
+                                   struct in6_addr *addr, int plen,
+                                   int offset, int allow_create,
+                                   int replace_required,
+                                   struct netlink_ext_ack *extack)
 {
        struct fib6_node *fn, *in, *ln;
        struct fib6_node *pn = NULL;
@@ -599,7 +608,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
        fn = root;
 
        do {
-               key = (struct rt6key *)((u8 *)fn->leaf + offset);
+               struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               key = (struct rt6key *)((u8 *)leaf + offset);
 
                /*
                 *      Prefix match
@@ -625,12 +636,10 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
                if (plen == fn->fn_bit) {
                        /* clean up an intermediate node */
                        if (!(fn->fn_flags & RTN_RTINFO)) {
-                               rt6_release(fn->leaf);
-                               fn->leaf = NULL;
+                               RCU_INIT_POINTER(fn->leaf, NULL);
+                               rt6_release(leaf);
                        }
 
-                       fn->fn_sernum = sernum;
-
                        return fn;
                }
 
@@ -639,10 +648,13 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
                 */
 
                /* Try to walk down on tree. */
-               fn->fn_sernum = sernum;
                dir = addr_bit_set(addr, fn->fn_bit);
                pn = fn;
-               fn = dir ? fn->right : fn->left;
+               fn = dir ?
+                    rcu_dereference_protected(fn->right,
+                                       lockdep_is_held(&table->tb6_lock)) :
+                    rcu_dereference_protected(fn->left,
+                                       lockdep_is_held(&table->tb6_lock));
        } while (fn);
 
        if (!allow_create) {
@@ -668,19 +680,17 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
         *      Create new leaf node without children.
         */
 
-       ln = node_alloc();
+       ln = node_alloc(net);
 
        if (!ln)
                return ERR_PTR(-ENOMEM);
        ln->fn_bit = plen;
-
-       ln->parent = pn;
-       ln->fn_sernum = sernum;
+       RCU_INIT_POINTER(ln->parent, pn);
 
        if (dir)
-               pn->right = ln;
+               rcu_assign_pointer(pn->right, ln);
        else
-               pn->left  = ln;
+               rcu_assign_pointer(pn->left, ln);
 
        return ln;
 
@@ -694,7 +704,8 @@ insert_above:
         * and the current
         */
 
-       pn = fn->parent;
+       pn = rcu_dereference_protected(fn->parent,
+                                      lockdep_is_held(&table->tb6_lock));
 
        /* find 1st bit in difference between the 2 addrs.
 
@@ -710,14 +721,14 @@ insert_above:
         *      (new leaf node)[ln] (old node)[fn]
         */
        if (plen > bit) {
-               in = node_alloc();
-               ln = node_alloc();
+               in = node_alloc(net);
+               ln = node_alloc(net);
 
                if (!in || !ln) {
                        if (in)
-                               node_free_immediate(in);
+                               node_free_immediate(net, in);
                        if (ln)
-                               node_free_immediate(ln);
+                               node_free_immediate(net, ln);
                        return ERR_PTR(-ENOMEM);
                }
 
@@ -731,31 +742,28 @@ insert_above:
 
                in->fn_bit = bit;
 
-               in->parent = pn;
+               RCU_INIT_POINTER(in->parent, pn);
                in->leaf = fn->leaf;
-               atomic_inc(&in->leaf->rt6i_ref);
-
-               in->fn_sernum = sernum;
+               atomic_inc(&rcu_dereference_protected(in->leaf,
+                               lockdep_is_held(&table->tb6_lock))->rt6i_ref);
 
                /* update parent pointer */
                if (dir)
-                       pn->right = in;
+                       rcu_assign_pointer(pn->right, in);
                else
-                       pn->left  = in;
+                       rcu_assign_pointer(pn->left, in);
 
                ln->fn_bit = plen;
 
-               ln->parent = in;
-               fn->parent = in;
-
-               ln->fn_sernum = sernum;
+               RCU_INIT_POINTER(ln->parent, in);
+               rcu_assign_pointer(fn->parent, in);
 
                if (addr_bit_set(addr, bit)) {
-                       in->right = ln;
-                       in->left  = fn;
+                       rcu_assign_pointer(in->right, ln);
+                       rcu_assign_pointer(in->left, fn);
                } else {
-                       in->left  = ln;
-                       in->right = fn;
+                       rcu_assign_pointer(in->left, ln);
+                       rcu_assign_pointer(in->right, fn);
                }
        } else { /* plen <= bit */
 
@@ -765,28 +773,26 @@ insert_above:
                 *           (old node)[fn] NULL
                 */
 
-               ln = node_alloc();
+               ln = node_alloc(net);
 
                if (!ln)
                        return ERR_PTR(-ENOMEM);
 
                ln->fn_bit = plen;
 
-               ln->parent = pn;
-
-               ln->fn_sernum = sernum;
-
-               if (dir)
-                       pn->right = ln;
-               else
-                       pn->left  = ln;
+               RCU_INIT_POINTER(ln->parent, pn);
 
                if (addr_bit_set(&key->addr, plen))
-                       ln->right = fn;
+                       RCU_INIT_POINTER(ln->right, fn);
                else
-                       ln->left  = fn;
+                       RCU_INIT_POINTER(ln->left, fn);
+
+               rcu_assign_pointer(fn->parent, ln);
 
-               fn->parent = ln;
+               if (dir)
+                       rcu_assign_pointer(pn->right, ln);
+               else
+                       rcu_assign_pointer(pn->left, ln);
        }
        return ln;
 }
@@ -832,6 +838,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc)
 static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
                          struct net *net)
 {
+       struct fib6_table *table = rt->rt6i_table;
+
        if (atomic_read(&rt->rt6i_ref) != 1) {
                /* This route is used as dummy address holder in some split
                 * nodes. It is not leaked, but it still holds other resources,
@@ -840,12 +848,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
                 * to still alive ones.
                 */
                while (fn) {
-                       if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) {
-                               fn->leaf = fib6_find_prefix(net, fn);
-                               atomic_inc(&fn->leaf->rt6i_ref);
+                       struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+                       struct rt6_info *new_leaf;
+                       if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
+                               new_leaf = fib6_find_prefix(net, table, fn);
+                               atomic_inc(&new_leaf->rt6i_ref);
+                               rcu_assign_pointer(fn->leaf, new_leaf);
                                rt6_release(rt);
                        }
-                       fn = fn->parent;
+                       fn = rcu_dereference_protected(fn->parent,
+                                   lockdep_is_held(&table->tb6_lock));
                }
        }
 }
@@ -857,9 +870,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn,
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
                            struct nl_info *info, struct mx6_config *mxc)
 {
+       struct rt6_info *leaf = rcu_dereference_protected(fn->leaf,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
        struct rt6_info *iter = NULL;
-       struct rt6_info **ins;
-       struct rt6_info **fallback_ins = NULL;
+       struct rt6_info __rcu **ins;
+       struct rt6_info __rcu **fallback_ins = NULL;
        int replace = (info->nlh &&
                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
        int add = (!info->nlh ||
@@ -874,7 +889,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 
        ins = &fn->leaf;
 
-       for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) {
+       for (iter = leaf; iter;
+            iter = rcu_dereference_protected(iter->dst.rt6_next,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
                /*
                 *      Search for duplicates
                 */
@@ -936,7 +953,8 @@ next_iter:
        if (fallback_ins && !found) {
                /* No ECMP-able route found, replace first non-ECMP one */
                ins = fallback_ins;
-               iter = *ins;
+               iter = rcu_dereference_protected(*ins,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
                found++;
        }
 
@@ -950,7 +968,7 @@ next_iter:
                struct rt6_info *sibling, *temp_sibling;
 
                /* Find the first route that have the same metric */
-               sibling = fn->leaf;
+               sibling = leaf;
                while (sibling) {
                        if (sibling->rt6i_metric == rt->rt6i_metric &&
                            rt6_qualify_for_ecmp(sibling)) {
@@ -958,7 +976,8 @@ next_iter:
                                              &sibling->rt6i_siblings);
                                break;
                        }
-                       sibling = sibling->dst.rt6_next;
+                       sibling = rcu_dereference_protected(sibling->dst.rt6_next,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
                }
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
@@ -987,10 +1006,10 @@ add:
                if (err)
                        return err;
 
-               rt->dst.rt6_next = iter;
-               *ins = rt;
-               rcu_assign_pointer(rt->rt6i_node, fn);
+               rcu_assign_pointer(rt->dst.rt6_next, iter);
                atomic_inc(&rt->rt6i_ref);
+               rcu_assign_pointer(rt->rt6i_node, fn);
+               rcu_assign_pointer(*ins, rt);
                call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
                                          rt);
                if (!info->skip_notify)
@@ -1016,10 +1035,10 @@ add:
                if (err)
                        return err;
 
-               *ins = rt;
+               atomic_inc(&rt->rt6i_ref);
                rcu_assign_pointer(rt->rt6i_node, fn);
                rt->dst.rt6_next = iter->dst.rt6_next;
-               atomic_inc(&rt->rt6i_ref);
+               rcu_assign_pointer(*ins, rt);
                call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
                                          rt);
                if (!info->skip_notify)
@@ -1031,14 +1050,15 @@ add:
                nsiblings = iter->rt6i_nsiblings;
                iter->rt6i_node = NULL;
                fib6_purge_rt(iter, fn, info->nl_net);
-               if (fn->rr_ptr == iter)
+               if (rcu_access_pointer(fn->rr_ptr) == iter)
                        fn->rr_ptr = NULL;
                rt6_release(iter);
 
                if (nsiblings) {
                        /* Replacing an ECMP route, remove all siblings */
                        ins = &rt->dst.rt6_next;
-                       iter = *ins;
+                       iter = rcu_dereference_protected(*ins,
+                                   lockdep_is_held(&rt->rt6i_table->tb6_lock));
                        while (iter) {
                                if (iter->rt6i_metric > rt->rt6i_metric)
                                        break;
@@ -1046,14 +1066,16 @@ add:
                                        *ins = iter->dst.rt6_next;
                                        iter->rt6i_node = NULL;
                                        fib6_purge_rt(iter, fn, info->nl_net);
-                                       if (fn->rr_ptr == iter)
+                                       if (rcu_access_pointer(fn->rr_ptr) == iter)
                                                fn->rr_ptr = NULL;
                                        rt6_release(iter);
                                        nsiblings--;
+                                       info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
                                } else {
                                        ins = &iter->dst.rt6_next;
                                }
-                               iter = *ins;
+                               iter = rcu_dereference_protected(*ins,
+                                       lockdep_is_held(&rt->rt6i_table->tb6_lock));
                        }
                        WARN_ON(nsiblings != 0);
                }
@@ -1077,16 +1099,33 @@ void fib6_force_start_gc(struct net *net)
                          jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
 }
 
+static void fib6_update_sernum_upto_root(struct rt6_info *rt,
+                                        int sernum)
+{
+       struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock));
+
+       /* paired with smp_rmb() in rt6_get_cookie_safe() */
+       smp_wmb();
+       while (fn) {
+               fn->fn_sernum = sernum;
+               fn = rcu_dereference_protected(fn->parent,
+                               lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       }
+}
+
 /*
  *     Add routing information to the routing tree.
  *     <destination addr>/<source addr>
  *     with source addr info in sub-trees
+ *     Need to own table->tb6_lock
  */
 
 int fib6_add(struct fib6_node *root, struct rt6_info *rt,
             struct nl_info *info, struct mx6_config *mxc,
             struct netlink_ext_ack *extack)
 {
+       struct fib6_table *table = rt->rt6i_table;
        struct fib6_node *fn, *pn = NULL;
        int err = -ENOMEM;
        int allow_create = 1;
@@ -1095,6 +1134,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
        if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt)))
                return -EINVAL;
+       if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE))
+               return -EINVAL;
 
        if (info->nlh) {
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1105,9 +1146,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
        if (!allow_create && !replace_required)
                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
 
-       fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
+       fn = fib6_add_1(info->nl_net, table, root,
+                       &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
                        offsetof(struct rt6_info, rt6i_dst), allow_create,
-                       replace_required, sernum, extack);
+                       replace_required, extack);
        if (IS_ERR(fn)) {
                err = PTR_ERR(fn);
                fn = NULL;
@@ -1120,7 +1162,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
        if (rt->rt6i_src.plen) {
                struct fib6_node *sn;
 
-               if (!fn->subtree) {
+               if (!rcu_access_pointer(fn->subtree)) {
                        struct fib6_node *sfn;
 
                        /*
@@ -1134,42 +1176,40 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
                         */
 
                        /* Create subtree root node */
-                       sfn = node_alloc();
+                       sfn = node_alloc(info->nl_net);
                        if (!sfn)
                                goto failure;
 
-                       sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
                        atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
+                       rcu_assign_pointer(sfn->leaf,
+                                          info->nl_net->ipv6.ip6_null_entry);
                        sfn->fn_flags = RTN_ROOT;
-                       sfn->fn_sernum = sernum;
 
                        /* Now add the first leaf node to new subtree */
 
-                       sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
-                                       rt->rt6i_src.plen,
+                       sn = fib6_add_1(info->nl_net, table, sfn,
+                                       &rt->rt6i_src.addr, rt->rt6i_src.plen,
                                        offsetof(struct rt6_info, rt6i_src),
-                                       allow_create, replace_required, sernum,
-                                       extack);
+                                       allow_create, replace_required, extack);
 
                        if (IS_ERR(sn)) {
                                /* If it is failed, discard just allocated
                                   root, and then (in failure) stale node
                                   in main tree.
                                 */
-                               node_free_immediate(sfn);
+                               node_free_immediate(info->nl_net, sfn);
                                err = PTR_ERR(sn);
                                goto failure;
                        }
 
                        /* Now link new subtree to main tree */
-                       sfn->parent = fn;
-                       fn->subtree = sfn;
+                       rcu_assign_pointer(sfn->parent, fn);
+                       rcu_assign_pointer(fn->subtree, sfn);
                } else {
-                       sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
-                                       rt->rt6i_src.plen,
+                       sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
+                                       &rt->rt6i_src.addr, rt->rt6i_src.plen,
                                        offsetof(struct rt6_info, rt6i_src),
-                                       allow_create, replace_required, sernum,
-                                       extack);
+                                       allow_create, replace_required, extack);
 
                        if (IS_ERR(sn)) {
                                err = PTR_ERR(sn);
@@ -1177,9 +1217,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
                        }
                }
 
-               if (!fn->leaf) {
-                       fn->leaf = rt;
+               if (!rcu_access_pointer(fn->leaf)) {
                        atomic_inc(&rt->rt6i_ref);
+                       rcu_assign_pointer(fn->leaf, rt);
                }
                fn = sn;
        }
@@ -1187,9 +1227,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
 
        err = fib6_add_rt2node(fn, rt, info, mxc);
        if (!err) {
+               fib6_update_sernum_upto_root(rt, sernum);
                fib6_start_gc(info->nl_net, rt);
-               if (!(rt->rt6i_flags & RTF_CACHE))
-                       fib6_prune_clones(info->nl_net, pn);
        }
 
 out:
@@ -1199,19 +1238,23 @@ out:
                 * If fib6_add_1 has cleared the old leaf pointer in the
                 * super-tree leaf node we have to find a new one for it.
                 */
-               if (pn != fn && pn->leaf == rt) {
-                       pn->leaf = NULL;
+               struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               if (pn != fn && pn_leaf == rt) {
+                       pn_leaf = NULL;
+                       RCU_INIT_POINTER(pn->leaf, NULL);
                        atomic_dec(&rt->rt6i_ref);
                }
-               if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) {
-                       pn->leaf = fib6_find_prefix(info->nl_net, pn);
+               if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
+                       pn_leaf = fib6_find_prefix(info->nl_net, table, pn);
 #if RT6_DEBUG >= 2
-                       if (!pn->leaf) {
-                               WARN_ON(pn->leaf == NULL);
-                               pn->leaf = info->nl_net->ipv6.ip6_null_entry;
+                       if (!pn_leaf) {
+                               WARN_ON(!pn_leaf);
+                               pn_leaf = info->nl_net->ipv6.ip6_null_entry;
                        }
 #endif
-                       atomic_inc(&pn->leaf->rt6i_ref);
+                       atomic_inc(&pn_leaf->rt6i_ref);
+                       rcu_assign_pointer(pn->leaf, pn_leaf);
                }
 #endif
                goto failure;
@@ -1226,7 +1269,7 @@ failure:
         * fn->leaf.
         */
        if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))
-               fib6_repair_tree(info->nl_net, fn);
+               fib6_repair_tree(info->nl_net, table, fn);
        /* Always release dst as dst->__refcnt is guaranteed
         * to be taken before entering this function
         */
@@ -1264,7 +1307,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
 
                dir = addr_bit_set(args->addr, fn->fn_bit);
 
-               next = dir ? fn->right : fn->left;
+               next = dir ? rcu_dereference(fn->right) :
+                            rcu_dereference(fn->left);
 
                if (next) {
                        fn = next;
@@ -1274,18 +1318,22 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
        }
 
        while (fn) {
-               if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) {
+               struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
+               if (subtree || fn->fn_flags & RTN_RTINFO) {
+                       struct rt6_info *leaf = rcu_dereference(fn->leaf);
                        struct rt6key *key;
 
-                       key = (struct rt6key *) ((u8 *) fn->leaf +
-                                                args->offset);
+                       if (!leaf)
+                               goto backtrack;
+
+                       key = (struct rt6key *) ((u8 *)leaf + args->offset);
 
                        if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
 #ifdef CONFIG_IPV6_SUBTREES
-                               if (fn->subtree) {
+                               if (subtree) {
                                        struct fib6_node *sfn;
-                                       sfn = fib6_lookup_1(fn->subtree,
-                                                           args + 1);
+                                       sfn = fib6_lookup_1(subtree, args + 1);
                                        if (!sfn)
                                                goto backtrack;
                                        fn = sfn;
@@ -1295,18 +1343,18 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
                                        return fn;
                        }
                }
-#ifdef CONFIG_IPV6_SUBTREES
 backtrack:
-#endif
                if (fn->fn_flags & RTN_ROOT)
                        break;
 
-               fn = fn->parent;
+               fn = rcu_dereference(fn->parent);
        }
 
        return NULL;
 }
 
+/* called with rcu_read_lock() held
+ */
 struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
                              const struct in6_addr *saddr)
 {
@@ -1337,54 +1385,84 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
 /*
  *     Get node with specified destination prefix (and source prefix,
  *     if subtrees are used)
+ *     exact_match == true means we try to find fn with exact match of
+ *     the passed in prefix addr
+ *     exact_match == false means we try to find fn with longest prefix
+ *     match of the passed in prefix addr. This is useful for finding fn
+ *     for cached route as it will be stored in the exception table under
+ *     the node with longest prefix length.
  */
 
 
 static struct fib6_node *fib6_locate_1(struct fib6_node *root,
                                       const struct in6_addr *addr,
-                                      int plen, int offset)
+                                      int plen, int offset,
+                                      bool exact_match)
 {
-       struct fib6_node *fn;
+       struct fib6_node *fn, *prev = NULL;
 
        for (fn = root; fn ; ) {
-               struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset);
+               struct rt6_info *leaf = rcu_dereference(fn->leaf);
+               struct rt6key *key;
+
+               /* This node is being deleted */
+               if (!leaf) {
+                       if (plen <= fn->fn_bit)
+                               goto out;
+                       else
+                               goto next;
+               }
+
+               key = (struct rt6key *)((u8 *)leaf + offset);
 
                /*
                 *      Prefix match
                 */
                if (plen < fn->fn_bit ||
                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
-                       return NULL;
+                       goto out;
 
                if (plen == fn->fn_bit)
                        return fn;
 
+               prev = fn;
+
+next:
                /*
                 *      We have more bits to go
                 */
                if (addr_bit_set(addr, fn->fn_bit))
-                       fn = fn->right;
+                       fn = rcu_dereference(fn->right);
                else
-                       fn = fn->left;
+                       fn = rcu_dereference(fn->left);
        }
-       return NULL;
+out:
+       if (exact_match)
+               return NULL;
+       else
+               return prev;
 }
 
 struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
-                             const struct in6_addr *saddr, int src_len)
+                             const struct in6_addr *saddr, int src_len,
+                             bool exact_match)
 {
        struct fib6_node *fn;
 
        fn = fib6_locate_1(root, daddr, dst_len,
-                          offsetof(struct rt6_info, rt6i_dst));
+                          offsetof(struct rt6_info, rt6i_dst),
+                          exact_match);
 
 #ifdef CONFIG_IPV6_SUBTREES
        if (src_len) {
+               struct fib6_node *subtree = FIB6_SUBTREE(fn);
+
                WARN_ON(saddr == NULL);
-               if (fn && fn->subtree)
-                       fn = fib6_locate_1(fn->subtree, saddr, src_len,
-                                          offsetof(struct rt6_info, rt6i_src));
+               if (fn && subtree)
+                       fn = fib6_locate_1(subtree, saddr, src_len,
+                                          offsetof(struct rt6_info, rt6i_src),
+                                          exact_match);
        }
 #endif
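Per the comment above, fib6_locate() now has two modes. A short sketch of how a caller might use each, assuming rcu_read_lock() or tb6_lock is held; the wrapper names and prefix lengths are illustrative only:

/* Exact match: find the node for an installed /64 prefix, e.g. when
 * deleting that route.
 */
static struct fib6_node *example_find_exact(struct fib6_node *root,
					    const struct in6_addr *pfx)
{
	return fib6_locate(root, pfx, 64, NULL, 0, true);
}

/* Longest-prefix match: find the node whose route owns the exception
 * entry for a cached /128 destination.
 */
static struct fib6_node *example_find_for_cache(struct fib6_node *root,
						const struct in6_addr *daddr)
{
	return fib6_locate(root, daddr, 128, NULL, 0, false);
}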
 
@@ -1400,16 +1478,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
  *
  */
 
-static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
+static struct rt6_info *fib6_find_prefix(struct net *net,
+                                        struct fib6_table *table,
+                                        struct fib6_node *fn)
 {
+       struct fib6_node *child_left, *child_right;
+
        if (fn->fn_flags & RTN_ROOT)
                return net->ipv6.ip6_null_entry;
 
        while (fn) {
-               if (fn->left)
-                       return fn->left->leaf;
-               if (fn->right)
-                       return fn->right->leaf;
+               child_left = rcu_dereference_protected(fn->left,
+                                   lockdep_is_held(&table->tb6_lock));
+               child_right = rcu_dereference_protected(fn->right,
+                                   lockdep_is_held(&table->tb6_lock));
+               if (child_left)
+                       return rcu_dereference_protected(child_left->leaf,
+                                       lockdep_is_held(&table->tb6_lock));
+               if (child_right)
+                       return rcu_dereference_protected(child_right->leaf,
+                                       lockdep_is_held(&table->tb6_lock));
 
                fn = FIB6_SUBTREE(fn);
        }
@@ -1419,31 +1507,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn)
 /*
  *     Called to trim the tree of intermediate nodes when possible. "fn"
  *     is the node we want to try and remove.
+ *     Need to own table->tb6_lock
  */
 
 static struct fib6_node *fib6_repair_tree(struct net *net,
-                                          struct fib6_node *fn)
+                                         struct fib6_table *table,
+                                         struct fib6_node *fn)
 {
        int children;
        int nstate;
-       struct fib6_node *child, *pn;
+       struct fib6_node *child;
        struct fib6_walker *w;
        int iter = 0;
 
        for (;;) {
+               struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *pn = rcu_dereference_protected(fn->parent,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
+                                           lockdep_is_held(&table->tb6_lock));
+               struct rt6_info *new_fn_leaf;
+
                RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
                iter++;
 
                WARN_ON(fn->fn_flags & RTN_RTINFO);
                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
-               WARN_ON(fn->leaf);
+               WARN_ON(fn_leaf);
 
                children = 0;
                child = NULL;
-               if (fn->right)
-                       child = fn->right, children |= 1;
-               if (fn->left)
-                       child = fn->left, children |= 2;
+               if (fn_r)
+                       child = fn_r, children |= 1;
+               if (fn_l)
+                       child = fn_l, children |= 2;
 
                if (children == 3 || FIB6_SUBTREE(fn)
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1451,36 +1557,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
                    || (children && fn->fn_flags & RTN_ROOT)
 #endif
                    ) {
-                       fn->leaf = fib6_find_prefix(net, fn);
+                       new_fn_leaf = fib6_find_prefix(net, table, fn);
 #if RT6_DEBUG >= 2
-                       if (!fn->leaf) {
-                               WARN_ON(!fn->leaf);
-                               fn->leaf = net->ipv6.ip6_null_entry;
+                       if (!new_fn_leaf) {
+                               WARN_ON(!new_fn_leaf);
+                               new_fn_leaf = net->ipv6.ip6_null_entry;
                        }
 #endif
-                       atomic_inc(&fn->leaf->rt6i_ref);
-                       return fn->parent;
+                       atomic_inc(&new_fn_leaf->rt6i_ref);
+                       rcu_assign_pointer(fn->leaf, new_fn_leaf);
+                       return pn;
                }
 
-               pn = fn->parent;
 #ifdef CONFIG_IPV6_SUBTREES
                if (FIB6_SUBTREE(pn) == fn) {
                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
-                       FIB6_SUBTREE(pn) = NULL;
+                       RCU_INIT_POINTER(pn->subtree, NULL);
                        nstate = FWS_L;
                } else {
                        WARN_ON(fn->fn_flags & RTN_ROOT);
 #endif
-                       if (pn->right == fn)
-                               pn->right = child;
-                       else if (pn->left == fn)
-                               pn->left = child;
+                       if (pn_r == fn)
+                               rcu_assign_pointer(pn->right, child);
+                       else if (pn_l == fn)
+                               rcu_assign_pointer(pn->left, child);
 #if RT6_DEBUG >= 2
                        else
                                WARN_ON(1);
 #endif
                        if (child)
-                               child->parent = pn;
+                               rcu_assign_pointer(child->parent, pn);
                        nstate = FWS_R;
 #ifdef CONFIG_IPV6_SUBTREES
                }
@@ -1489,19 +1595,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
                read_lock(&net->ipv6.fib6_walker_lock);
                FOR_WALKERS(net, w) {
                        if (!child) {
-                               if (w->root == fn) {
-                                       w->root = w->node = NULL;
-                                       RT6_TRACE("W %p adjusted by delroot 1\n", w);
-                               } else if (w->node == fn) {
+                               if (w->node == fn) {
                                        RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
                                        w->node = pn;
                                        w->state = nstate;
                                }
                        } else {
-                               if (w->root == fn) {
-                                       w->root = child;
-                                       RT6_TRACE("W %p adjusted by delroot 2\n", w);
-                               }
                                if (w->node == fn) {
                                        w->node = child;
                                        if (children&2) {
@@ -1516,33 +1615,39 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
                }
                read_unlock(&net->ipv6.fib6_walker_lock);
 
-               node_free(fn);
+               node_free(net, fn);
                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
                        return pn;
 
-               rt6_release(pn->leaf);
-               pn->leaf = NULL;
+               RCU_INIT_POINTER(pn->leaf, NULL);
+               rt6_release(pn_leaf);
                fn = pn;
        }
 }
 
-static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
-                          struct nl_info *info)
+static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
+                          struct rt6_info __rcu **rtp, struct nl_info *info)
 {
        struct fib6_walker *w;
-       struct rt6_info *rt = *rtp;
+       struct rt6_info *rt = rcu_dereference_protected(*rtp,
+                                   lockdep_is_held(&table->tb6_lock));
        struct net *net = info->nl_net;
 
        RT6_TRACE("fib6_del_route\n");
 
+       WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE);
+
        /* Unlink it */
        *rtp = rt->dst.rt6_next;
        rt->rt6i_node = NULL;
        net->ipv6.rt6_stats->fib_rt_entries--;
        net->ipv6.rt6_stats->fib_discarded_routes++;
 
+       /* Flush all cached dst in exception table */
+       rt6_flush_exceptions(rt);
+
        /* Reset round-robin state, if necessary */
-       if (fn->rr_ptr == rt)
+       if (rcu_access_pointer(fn->rr_ptr) == rt)
                fn->rr_ptr = NULL;
 
        /* Remove this entry from other siblings */
@@ -1561,20 +1666,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
        FOR_WALKERS(net, w) {
                if (w->state == FWS_C && w->leaf == rt) {
                        RT6_TRACE("walker %p adjusted by delroute\n", w);
-                       w->leaf = rt->dst.rt6_next;
+                       w->leaf = rcu_dereference_protected(rt->dst.rt6_next,
+                                           lockdep_is_held(&table->tb6_lock));
                        if (!w->leaf)
                                w->state = FWS_U;
                }
        }
        read_unlock(&net->ipv6.fib6_walker_lock);
 
-       rt->dst.rt6_next = NULL;
-
        /* If it was last route, expunge its radix tree node */
-       if (!fn->leaf) {
+       if (!rcu_access_pointer(fn->leaf)) {
                fn->fn_flags &= ~RTN_RTINFO;
                net->ipv6.rt6_stats->fib_route_nodes--;
-               fn = fib6_repair_tree(net, fn);
+               fn = fib6_repair_tree(net, table, fn);
        }
 
        fib6_purge_rt(rt, fn, net);
@@ -1585,12 +1689,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
        rt6_release(rt);
 }
 
+/* Need to own table->tb6_lock */
 int fib6_del(struct rt6_info *rt, struct nl_info *info)
 {
        struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
                                    lockdep_is_held(&rt->rt6i_table->tb6_lock));
+       struct fib6_table *table = rt->rt6i_table;
        struct net *net = info->nl_net;
-       struct rt6_info **rtp;
+       struct rt6_info __rcu **rtp;
+       struct rt6_info __rcu **rtp_next;
 
 #if RT6_DEBUG >= 2
        if (rt->dst.obsolete > 0) {
@@ -1603,28 +1710,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
 
        WARN_ON(!(fn->fn_flags & RTN_RTINFO));
 
-       if (!(rt->rt6i_flags & RTF_CACHE)) {
-               struct fib6_node *pn = fn;
-#ifdef CONFIG_IPV6_SUBTREES
-               /* clones of this route might be in another subtree */
-               if (rt->rt6i_src.plen) {
-                       while (!(pn->fn_flags & RTN_ROOT))
-                               pn = pn->parent;
-                       pn = pn->parent;
-               }
-#endif
-               fib6_prune_clones(info->nl_net, pn);
-       }
+       /* remove cached dst from exception table */
+       if (rt->rt6i_flags & RTF_CACHE)
+               return rt6_remove_exception_rt(rt);
 
        /*
         *      Walk the leaf entries looking for ourself
         */
 
-       for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) {
-               if (*rtp == rt) {
-                       fib6_del_route(fn, rtp, info);
+       for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
+               struct rt6_info *cur = rcu_dereference_protected(*rtp,
+                                       lockdep_is_held(&table->tb6_lock));
+               if (rt == cur) {
+                       fib6_del_route(table, fn, rtp, info);
                        return 0;
                }
+               rtp_next = &cur->dst.rt6_next;
        }
        return -ENOENT;
 }
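After this change RTF_CACHE clones are no longer tree leaves, so fib6_del() forwards them to rt6_remove_exception_rt() and only walks fn->leaf for regular entries. A minimal caller sketch under the lock noted above; example_route_del() is a hypothetical wrapper, not from this patch:

/* Hypothetical wrapper: fib6_del() must run with tb6_lock held and
 * handles both regular entries and RTF_CACHE exceptions itself.
 */
static int example_route_del(struct rt6_info *rt, struct nl_info *info)
{
	struct fib6_table *table = rt->rt6i_table;
	int err;

	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}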
@@ -1651,22 +1752,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info)
  *     0   -> walk is complete.
  *     >0  -> walk is incomplete (i.e. suspended)
  *     <0  -> walk is terminated by an error.
+ *
+ *     This function is called with tb6_lock held.
  */
 
 static int fib6_walk_continue(struct fib6_walker *w)
 {
-       struct fib6_node *fn, *pn;
+       struct fib6_node *fn, *pn, *left, *right;
+
+       /* w->root should always be table->tb6_root */
+       WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
 
        for (;;) {
                fn = w->node;
                if (!fn)
                        return 0;
 
-               if (w->prune && fn != w->root &&
-                   fn->fn_flags & RTN_RTINFO && w->state < FWS_C) {
-                       w->state = FWS_C;
-                       w->leaf = fn->leaf;
-               }
                switch (w->state) {
 #ifdef CONFIG_IPV6_SUBTREES
                case FWS_S:
@@ -1677,20 +1778,22 @@ static int fib6_walk_continue(struct fib6_walker *w)
                        w->state = FWS_L;
 #endif
                case FWS_L:
-                       if (fn->left) {
-                               w->node = fn->left;
+                       left = rcu_dereference_protected(fn->left, 1);
+                       if (left) {
+                               w->node = left;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_R;
                case FWS_R:
-                       if (fn->right) {
-                               w->node = fn->right;
+                       right = rcu_dereference_protected(fn->right, 1);
+                       if (right) {
+                               w->node = right;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_C;
-                       w->leaf = fn->leaf;
+                       w->leaf = rcu_dereference_protected(fn->leaf, 1);
                case FWS_C:
                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
                                int err;
@@ -1712,7 +1815,9 @@ skip:
                case FWS_U:
                        if (fn == w->root)
                                return 0;
-                       pn = fn->parent;
+                       pn = rcu_dereference_protected(fn->parent, 1);
+                       left = rcu_dereference_protected(pn->left, 1);
+                       right = rcu_dereference_protected(pn->right, 1);
                        w->node = pn;
 #ifdef CONFIG_IPV6_SUBTREES
                        if (FIB6_SUBTREE(pn) == fn) {
@@ -1721,13 +1826,13 @@ skip:
                                continue;
                        }
 #endif
-                       if (pn->left == fn) {
+                       if (left == fn) {
                                w->state = FWS_R;
                                continue;
                        }
-                       if (pn->right == fn) {
+                       if (right == fn) {
                                w->state = FWS_C;
-                               w->leaf = w->node->leaf;
+                               w->leaf = rcu_dereference_protected(w->node->leaf, 1);
                                continue;
                        }
 #if RT6_DEBUG >= 2
@@ -1770,7 +1875,7 @@ static int fib6_clean_node(struct fib6_walker *w)
                return 0;
        }
 
-       for (rt = w->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_walker_rt(w) {
                res = c->func(rt, c->arg);
                if (res < 0) {
                        w->leaf = rt;
@@ -1798,20 +1903,16 @@ static int fib6_clean_node(struct fib6_walker *w)
  *     func is called on each route.
  *             It may return -1 -> delete this route.
  *                           0  -> continue walking
- *
- *     prune==1 -> only immediate children of node (certainly,
- *     ignoring pure split nodes) will be scanned.
  */
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                            int (*func)(struct rt6_info *, void *arg),
-                           bool prune, int sernum, void *arg)
+                           int sernum, void *arg)
 {
        struct fib6_cleaner c;
 
        c.w.root = root;
        c.w.func = fib6_clean_node;
-       c.w.prune = prune;
        c.w.count = 0;
        c.w.skip = 0;
        c.func = func;
@@ -1834,10 +1935,10 @@ static void __fib6_clean_all(struct net *net,
        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                head = &net->ipv6.fib_table_hash[h];
                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
-                       write_lock_bh(&table->tb6_lock);
+                       spin_lock_bh(&table->tb6_lock);
                        fib6_clean_tree(net, &table->tb6_root,
-                                       func, false, sernum, arg);
-                       write_unlock_bh(&table->tb6_lock);
+                                       func, sernum, arg);
+                       spin_unlock_bh(&table->tb6_lock);
                }
        }
        rcu_read_unlock();
@@ -1849,22 +1950,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *),
        __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
 }
 
-static int fib6_prune_clone(struct rt6_info *rt, void *arg)
-{
-       if (rt->rt6i_flags & RTF_CACHE) {
-               RT6_TRACE("pruning clone %p\n", rt);
-               return -1;
-       }
-
-       return 0;
-}
-
-static void fib6_prune_clones(struct net *net, struct fib6_node *fn)
-{
-       fib6_clean_tree(net, fn, fib6_prune_clone, true,
-                       FIB6_NO_SERNUM_CHANGE, NULL);
-}
-
 static void fib6_flush_trees(struct net *net)
 {
        int new_sernum = fib6_new_sernum(net);
@@ -1876,12 +1961,6 @@ static void fib6_flush_trees(struct net *net)
  *     Garbage collection
  */
 
-struct fib6_gc_args
-{
-       int                     timeout;
-       int                     more;
-};
-
 static int fib6_age(struct rt6_info *rt, void *arg)
 {
        struct fib6_gc_args *gc_args = arg;
@@ -1890,9 +1969,6 @@ static int fib6_age(struct rt6_info *rt, void *arg)
        /*
         *      check addrconf expiration here.
         *      Routes are expired even if they are in use.
-        *
-        *      Also age clones. Note, that clones are aged out
-        *      only if they are not in use now.
         */
 
        if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
@@ -1901,31 +1977,14 @@ static int fib6_age(struct rt6_info *rt, void *arg)
                        return -1;
                }
                gc_args->more++;
-       } else if (rt->rt6i_flags & RTF_CACHE) {
-               if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout))
-                       rt->dst.obsolete = DST_OBSOLETE_KILL;
-               if (atomic_read(&rt->dst.__refcnt) == 1 &&
-                   rt->dst.obsolete == DST_OBSOLETE_KILL) {
-                       RT6_TRACE("aging clone %p\n", rt);
-                       return -1;
-               } else if (rt->rt6i_flags & RTF_GATEWAY) {
-                       struct neighbour *neigh;
-                       __u8 neigh_flags = 0;
-
-                       neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
-                       if (neigh) {
-                               neigh_flags = neigh->flags;
-                               neigh_release(neigh);
-                       }
-                       if (!(neigh_flags & NTF_ROUTER)) {
-                               RT6_TRACE("purging route %p via non-router but gateway\n",
-                                         rt);
-                               return -1;
-                       }
-               }
-               gc_args->more++;
        }
 
+       /*      Also age clones in the exception table.
+        *      Note, that clones are aged out
+        *      only if they are not in use now.
+        */
+       rt6_age_exceptions(rt, gc_args, now);
+
        return 0;
 }
 
@@ -1993,7 +2052,8 @@ static int __net_init fib6_net_init(struct net *net)
                goto out_fib_table_hash;
 
        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
-       net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+       rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
+                          net->ipv6.ip6_null_entry);
        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
@@ -2004,7 +2064,8 @@ static int __net_init fib6_net_init(struct net *net)
        if (!net->ipv6.fib6_local_tbl)
                goto out_fib6_main_tbl;
        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
-       net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
+       rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
+                          net->ipv6.ip6_null_entry);
        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
@@ -2134,7 +2195,9 @@ static int ipv6_route_yield(struct fib6_walker *w)
                return 1;
 
        do {
-               iter->w.leaf = iter->w.leaf->dst.rt6_next;
+               iter->w.leaf = rcu_dereference_protected(
+                               iter->w.leaf->dst.rt6_next,
+                               lockdep_is_held(&iter->tbl->tb6_lock));
                iter->skip--;
                if (!iter->skip && iter->w.leaf)
                        return 1;
@@ -2199,7 +2262,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        if (!v)
                goto iter_table;
 
-       n = ((struct rt6_info *)v)->dst.rt6_next;
+       n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next);
        if (n) {
                ++*pos;
                return n;
@@ -2207,9 +2270,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 iter_table:
        ipv6_route_check_sernum(iter);
-       read_lock(&iter->tbl->tb6_lock);
+       spin_lock_bh(&iter->tbl->tb6_lock);
        r = fib6_walk_continue(&iter->w);
-       read_unlock(&iter->tbl->tb6_lock);
+       spin_unlock_bh(&iter->tbl->tb6_lock);
        if (r > 0) {
                if (v)
                        ++*pos;
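
The fib6 hunks above convert readers of the FIB6 tree from the tb6_lock rwlock to RCU: pointers such as fn->leaf and dst.rt6_next are now read with rcu_dereference() (or rcu_dereference_protected() under the new tb6_lock spinlock) and published with rcu_assign_pointer(). A minimal sketch of that pattern follows; the struct and function names are invented for illustration and are not taken from the patch.

/* Minimal sketch of the RCU conversion above: readers walk the list
 * under rcu_read_lock(); writers serialize on a spinlock (the role
 * the tb6_lock spinlock now plays) and publish with rcu_assign_pointer().
 */
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_node {
	struct demo_node __rcu *next;
	int val;
};

static struct demo_node __rcu *demo_head;
static DEFINE_SPINLOCK(demo_lock);

static bool demo_lookup(int val)
{
	struct demo_node *n;
	bool found = false;

	rcu_read_lock();
	for (n = rcu_dereference(demo_head); n;
	     n = rcu_dereference(n->next)) {
		if (n->val == val) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void demo_insert(struct demo_node *n)
{
	spin_lock_bh(&demo_lock);
	RCU_INIT_POINTER(n->next,
			 rcu_dereference_protected(demo_head,
						   lockdep_is_held(&demo_lock)));
	rcu_assign_pointer(demo_head, n);	/* now visible to readers */
	spin_unlock_bh(&demo_lock);
}

Removal follows the same shape: unlink under the spinlock (as hlist_del_rcu() does in rt6_remove_exception() further down) and free with kfree_rcu() so that concurrent readers can finish safely.
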
index cdb3728..4a87f94 100644 (file)
@@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
        for (skb = segs; skb; skb = skb->next) {
                ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
-               if (gso_partial)
+               if (gso_partial && skb_is_gso(skb))
                        payload_len = skb_shinfo(skb)->gso_size +
                                      SKB_GSO_CB(skb)->data_offset +
                                      skb->head - (unsigned char *)(ipv6h + 1);
index a5cd43d..437af8c 100644 (file)
@@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv,
        nexthdr = ipv6_hdr(skb)->nexthdr;
        thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
                                 &frag_off);
-       if (thoff < 0)
+       if (thoff < 0 || nexthdr != IPPROTO_TCP)
                return NF_ACCEPT;
 
        th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
index ac826dd..d12c55d 100644 (file)
@@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
                                ICMP6_MIB_OUTERRORS);
                ip6_flush_pending_frames(sk);
        } else {
-               err = icmpv6_push_pending_frames(sk, &fl6,
-                                                (struct icmp6hdr *) &pfh.icmph,
-                                                len);
+               icmpv6_push_pending_frames(sk, &fl6,
+                                          (struct icmp6hdr *)&pfh.icmph, len);
        }
        release_sock(sk);
 
index 26cc9f4..2e8842f 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/seq_file.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
+#include <linux/jhash.h>
 #include <net/net_namespace.h>
 #include <net/snmp.h>
 #include <net/ipv6.h>
@@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net,
                         struct in6_addr *dst, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+                                          struct in6_addr *daddr,
+                                          struct in6_addr *saddr);
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
@@ -139,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt)
 {
        if (!list_empty(&rt->rt6i_uncached)) {
                struct uncached_list *ul = rt->rt6i_uncached_list;
+               struct net *net = dev_net(rt->dst.dev);
 
                spin_lock_bh(&ul->lock);
                list_del(&rt->rt6i_uncached);
+               atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
                spin_unlock_bh(&ul->lock);
        }
 }
@@ -355,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        1, DST_OBSOLETE_FORCE_CHK, flags);
 
-       if (rt)
+       if (rt) {
                rt6_info_init(rt);
+               atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
+       }
 
        return rt;
 }
@@ -369,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
 
        if (rt) {
                rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
-               if (rt->rt6i_pcpu) {
-                       int cpu;
-
-                       for_each_possible_cpu(cpu) {
-                               struct rt6_info **p;
-
-                               p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
-                               /* no one shares rt */
-                               *p =  NULL;
-                       }
-               } else {
+               if (!rt->rt6i_pcpu) {
                        dst_release_immediate(&rt->dst);
                        return NULL;
                }
@@ -392,6 +390,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
        struct rt6_info *rt = (struct rt6_info *)dst;
+       struct rt6_exception_bucket *bucket;
        struct dst_entry *from = dst->from;
        struct inet6_dev *idev;
 
@@ -404,6 +403,11 @@ static void ip6_dst_destroy(struct dst_entry *dst)
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
+       if (bucket) {
+               rt->rt6i_exception_bucket = NULL;
+               kfree(bucket);
+       }
 
        dst->from = NULL;
        dst_release(from);
@@ -478,7 +482,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 }
 
 /*
- *     Route lookup. Any table->tb6_lock is implied.
+ *     Route lookup. rcu_read_lock() should be held.
  */
 
 static inline struct rt6_info *rt6_device_match(struct net *net,
@@ -493,7 +497,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net,
        if (!oif && ipv6_addr_any(saddr))
                goto out;
 
-       for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
+       for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
                struct net_device *dev = sprt->dst.dev;
 
                if (oif) {
@@ -702,6 +706,7 @@ out:
 }
 
 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+                                    struct rt6_info *leaf,
                                     struct rt6_info *rr_head,
                                     u32 metric, int oif, int strict,
                                     bool *do_rr)
@@ -711,7 +716,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 
        match = NULL;
        cont = NULL;
-       for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
+       for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
@@ -720,7 +725,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
        }
 
-       for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
+       for (rt = leaf; rt && rt != rr_head;
+            rt = rcu_dereference(rt->dst.rt6_next)) {
                if (rt->rt6i_metric != metric) {
                        cont = rt;
                        break;
@@ -732,37 +738,59 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
        if (match || !cont)
                return match;
 
-       for (rt = cont; rt; rt = rt->dst.rt6_next)
+       for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 
        return match;
 }
 
-static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
+                                  int oif, int strict)
 {
+       struct rt6_info *leaf = rcu_dereference(fn->leaf);
        struct rt6_info *match, *rt0;
-       struct net *net;
        bool do_rr = false;
+       int key_plen;
 
-       rt0 = fn->rr_ptr;
+       if (!leaf)
+               return net->ipv6.ip6_null_entry;
+
+       rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
-               fn->rr_ptr = rt0 = fn->leaf;
+               rt0 = leaf;
 
-       match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
+       /* Double check to make sure fn is not an intermediate node
+        * and fn->leaf does not point to its child's leaf
+        * (This might happen if all routes under fn are deleted from
+        * the tree and fib6_repair_tree() is called on the node.)
+        */
+       key_plen = rt0->rt6i_dst.plen;
+#ifdef CONFIG_IPV6_SUBTREES
+       if (rt0->rt6i_src.plen)
+               key_plen = rt0->rt6i_src.plen;
+#endif
+       if (fn->fn_bit != key_plen)
+               return net->ipv6.ip6_null_entry;
+
+       match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
                             &do_rr);
 
        if (do_rr) {
-               struct rt6_info *next = rt0->dst.rt6_next;
+               struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
 
                /* no entries matched; do round-robin */
                if (!next || next->rt6i_metric != rt0->rt6i_metric)
-                       next = fn->leaf;
-
-               if (next != rt0)
-                       fn->rr_ptr = next;
+                       next = leaf;
+
+               if (next != rt0) {
+                       spin_lock_bh(&leaf->rt6i_table->tb6_lock);
+                       /* make sure next is not being deleted from the tree */
+                       if (next->rt6i_node)
+                               rcu_assign_pointer(fn->rr_ptr, next);
+                       spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
+               }
        }
 
-       net = dev_net(rt0->dst.dev);
        return match ? match : net->ipv6.ip6_null_entry;
 }
 
@@ -850,13 +878,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
 {
-       struct fib6_node *pn;
+       struct fib6_node *pn, *sn;
        while (1) {
                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;
-               pn = fn->parent;
-               if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
-                       fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+               pn = rcu_dereference(fn->parent);
+               sn = FIB6_SUBTREE(pn);
+               if (sn && sn != fn)
+                       fn = fib6_lookup(sn, NULL, saddr);
                else
                        fn = pn;
                if (fn->fn_flags & RTN_RTINFO)
@@ -864,27 +893,57 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
        }
 }
 
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
+                         bool null_fallback)
+{
+       struct rt6_info *rt = *prt;
+
+       if (dst_hold_safe(&rt->dst))
+               return true;
+       if (null_fallback) {
+               rt = net->ipv6.ip6_null_entry;
+               dst_hold(&rt->dst);
+       } else {
+               rt = NULL;
+       }
+       *prt = rt;
+       return false;
+}
+
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
 {
+       struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;
-       struct rt6_info *rt;
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-       rt = fn->leaf;
-       rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
-       if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-               rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
+       rt = rcu_dereference(fn->leaf);
+       if (!rt) {
+               rt = net->ipv6.ip6_null_entry;
+       } else {
+               rt = rt6_device_match(net, rt, &fl6->saddr,
+                                     fl6->flowi6_oif, flags);
+               if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+                       rt = rt6_multipath_select(rt, fl6,
+                                                 fl6->flowi6_oif, flags);
+       }
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }
-       dst_use(&rt->dst, jiffies);
-       read_unlock_bh(&table->tb6_lock);
+       /* Search through exception table */
+       rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+       if (rt_cache)
+               rt = rt_cache;
+
+       if (ip6_hold_safe(net, &rt, true))
+               dst_use_noref(&rt->dst, jiffies);
+
+       rcu_read_unlock();
 
        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
 
@@ -938,9 +997,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
        struct fib6_table *table;
 
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
        err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 
        return err;
 }
@@ -1038,7 +1097,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
        return pcpu_rt;
 }
 
-/* It should be called with read_lock_bh(&tb6_lock) acquired */
+/* It should be called with rcu_read_lock() acquired */
 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
 {
        struct rt6_info *pcpu_rt, **p;
@@ -1046,16 +1105,14 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
        p = this_cpu_ptr(rt->rt6i_pcpu);
        pcpu_rt = *p;
 
-       if (pcpu_rt) {
-               dst_hold(&pcpu_rt->dst);
+       if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
                rt6_dst_from_metrics_check(pcpu_rt);
-       }
+
        return pcpu_rt;
 }
 
 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
 {
-       struct fib6_table *table = rt->rt6i_table;
        struct rt6_info *pcpu_rt, *prev, **p;
 
        pcpu_rt = ip6_rt_pcpu_alloc(rt);
@@ -1066,36 +1123,514 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
                return net->ipv6.ip6_null_entry;
        }
 
-       read_lock_bh(&table->tb6_lock);
-       if (rt->rt6i_pcpu) {
-               p = this_cpu_ptr(rt->rt6i_pcpu);
-               prev = cmpxchg(p, NULL, pcpu_rt);
-               if (prev) {
-                       /* If someone did it before us, return prev instead */
-                       dst_release_immediate(&pcpu_rt->dst);
-                       pcpu_rt = prev;
-               }
-       } else {
-               /* rt has been removed from the fib6 tree
-                * before we have a chance to acquire the read_lock.
-                * In this case, don't brother to create a pcpu rt
-                * since rt is going away anyway.  The next
-                * dst_check() will trigger a re-lookup.
-                */
-               dst_release_immediate(&pcpu_rt->dst);
-               pcpu_rt = rt;
-       }
        dst_hold(&pcpu_rt->dst);
+       p = this_cpu_ptr(rt->rt6i_pcpu);
+       prev = cmpxchg(p, NULL, pcpu_rt);
+       BUG_ON(prev);
+
        rt6_dst_from_metrics_check(pcpu_rt);
-       read_unlock_bh(&table->tb6_lock);
        return pcpu_rt;
 }
 
+/* exception hash table implementation
+ */
+static DEFINE_SPINLOCK(rt6_exception_lock);
+
+/* Remove rt6_ex from hash table and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
+                                struct rt6_exception *rt6_ex)
+{
+       struct net *net;
+
+       if (!bucket || !rt6_ex)
+               return;
+
+       net = dev_net(rt6_ex->rt6i->dst.dev);
+       rt6_ex->rt6i->rt6i_node = NULL;
+       hlist_del_rcu(&rt6_ex->hlist);
+       rt6_release(rt6_ex->rt6i);
+       kfree_rcu(rt6_ex, rcu);
+       WARN_ON_ONCE(!bucket->depth);
+       bucket->depth--;
+       net->ipv6.rt6_stats->fib_rt_cache--;
+}
+
+/* Remove oldest rt6_ex in bucket and free the memory
+ * Caller must hold rt6_exception_lock
+ */
+static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
+{
+       struct rt6_exception *rt6_ex, *oldest = NULL;
+
+       if (!bucket)
+               return;
+
+       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+               if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
+                       oldest = rt6_ex;
+       }
+       rt6_remove_exception(bucket, oldest);
+}
+
+static u32 rt6_exception_hash(const struct in6_addr *dst,
+                             const struct in6_addr *src)
+{
+       static u32 seed __read_mostly;
+       u32 val;
+
+       net_get_random_once(&seed, sizeof(seed));
+       val = jhash(dst, sizeof(*dst), seed);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       if (src)
+               val = jhash(src, sizeof(*src), val);
+#endif
+       return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rt6_exception_lock
+ */
+static struct rt6_exception *
+__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
+                             const struct in6_addr *daddr,
+                             const struct in6_addr *saddr)
+{
+       struct rt6_exception *rt6_ex;
+       u32 hval;
+
+       if (!(*bucket) || !daddr)
+               return NULL;
+
+       hval = rt6_exception_hash(daddr, saddr);
+       *bucket += hval;
+
+       hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
+               struct rt6_info *rt6 = rt6_ex->rt6i;
+               bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+               if (matched && saddr)
+                       matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+               if (matched)
+                       return rt6_ex;
+       }
+       return NULL;
+}
+
+/* Helper function to find the cached rt in the hash table
+ * and update bucket pointer to point to the bucket for this
+ * (daddr, saddr) pair
+ * Caller must hold rcu_read_lock()
+ */
+static struct rt6_exception *
+__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
+                        const struct in6_addr *daddr,
+                        const struct in6_addr *saddr)
+{
+       struct rt6_exception *rt6_ex;
+       u32 hval;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (!(*bucket) || !daddr)
+               return NULL;
+
+       hval = rt6_exception_hash(daddr, saddr);
+       *bucket += hval;
+
+       hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
+               struct rt6_info *rt6 = rt6_ex->rt6i;
+               bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
+
+#ifdef CONFIG_IPV6_SUBTREES
+               if (matched && saddr)
+                       matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
+#endif
+               if (matched)
+                       return rt6_ex;
+       }
+       return NULL;
+}
+
+static int rt6_insert_exception(struct rt6_info *nrt,
+                               struct rt6_info *ort)
+{
+       struct net *net = dev_net(ort->dst.dev);
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       int err = 0;
+
+       /* ort can't be a cache or pcpu route */
+       if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
+               ort = (struct rt6_info *)ort->dst.from;
+       WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
+
+       spin_lock_bh(&rt6_exception_lock);
+
+       if (ort->exception_bucket_flushed) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+       if (!bucket) {
+               bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
+                                GFP_ATOMIC);
+               if (!bucket) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+       }
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates ort is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (ort->rt6i_src.plen)
+               src_key = &nrt->rt6i_src.addr;
+#endif
+
+       /* Update rt6i_prefsrc as it could be changed
+        * in rt6_remove_prefsrc()
+        */
+       nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
+       /* rt6_mtu_change() might lower mtu on ort.
+        * Only insert this exception route if its mtu
+        * is less than ort's mtu value.
+        */
+       if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
+                                              src_key);
+       if (rt6_ex)
+               rt6_remove_exception(bucket, rt6_ex);
+
+       rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
+       if (!rt6_ex) {
+               err = -ENOMEM;
+               goto out;
+       }
+       rt6_ex->rt6i = nrt;
+       rt6_ex->stamp = jiffies;
+       atomic_inc(&nrt->rt6i_ref);
+       nrt->rt6i_node = ort->rt6i_node;
+       hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
+       bucket->depth++;
+       net->ipv6.rt6_stats->fib_rt_cache++;
+
+       if (bucket->depth > FIB6_MAX_DEPTH)
+               rt6_exception_remove_oldest(bucket);
+
+out:
+       spin_unlock_bh(&rt6_exception_lock);
+
+       /* Update fn->fn_sernum to invalidate all cached dst */
+       if (!err)
+               fib6_update_sernum(ort);
+
+       return err;
+}
+
+void rt6_flush_exceptions(struct rt6_info *rt)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       spin_lock_bh(&rt6_exception_lock);
+       /* Prevent rt6_insert_exception() from recreating the bucket list */
+       rt->exception_bucket_flushed = 1;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+       if (!bucket)
+               goto out;
+
+       for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+               hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
+                       rt6_remove_exception(bucket, rt6_ex);
+               WARN_ON_ONCE(bucket->depth);
+               bucket++;
+       }
+
+out:
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
+/* Find the cached rt in the hash table inside the passed-in rt
+ * Caller has to hold rcu_read_lock()
+ */
+static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
+                                          struct in6_addr *daddr,
+                                          struct in6_addr *saddr)
+{
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       struct rt6_info *res = NULL;
+
+       bucket = rcu_dereference(rt->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates rt is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (rt->rt6i_src.plen)
+               src_key = saddr;
+#endif
+       rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
+
+       if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
+               res = rt6_ex->rt6i;
+
+       return res;
+}
+
+/* Remove the passed in cached rt from the hash table that contains it */
+int rt6_remove_exception_rt(struct rt6_info *rt)
+{
+       struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+       int err;
+
+       if (!from ||
+           !(rt->rt6i_flags & RTF_CACHE))
+               return -EINVAL;
+
+       if (!rcu_access_pointer(from->rt6i_exception_bucket))
+               return -ENOENT;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates 'from' is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (from->rt6i_src.plen)
+               src_key = &rt->rt6i_src.addr;
+#endif
+       rt6_ex = __rt6_find_exception_spinlock(&bucket,
+                                              &rt->rt6i_dst.addr,
+                                              src_key);
+       if (rt6_ex) {
+               rt6_remove_exception(bucket, rt6_ex);
+               err = 0;
+       } else {
+               err = -ENOENT;
+       }
+
+       spin_unlock_bh(&rt6_exception_lock);
+       return err;
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+       struct rt6_info *from = (struct rt6_info *)rt->dst.from;
+       struct rt6_exception_bucket *bucket;
+       struct in6_addr *src_key = NULL;
+       struct rt6_exception *rt6_ex;
+
+       if (!from ||
+           !(rt->rt6i_flags & RTF_CACHE))
+               return;
+
+       rcu_read_lock();
+       bucket = rcu_dereference(from->rt6i_exception_bucket);
+
+#ifdef CONFIG_IPV6_SUBTREES
+       /* rt6i_src.plen != 0 indicates 'from' is in subtree
+        * and exception table is indexed by a hash of
+        * both rt6i_dst and rt6i_src.
+        * Otherwise, the exception table is indexed by
+        * a hash of only rt6i_dst.
+        */
+       if (from->rt6i_src.plen)
+               src_key = &rt->rt6i_src.addr;
+#endif
+       rt6_ex = __rt6_find_exception_rcu(&bucket,
+                                         &rt->rt6i_dst.addr,
+                                         src_key);
+       if (rt6_ex)
+               rt6_ex->stamp = jiffies;
+
+       rcu_read_unlock();
+}
+
+static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       int i;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+                               rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
+                       }
+                       bucket++;
+               }
+       }
+}
+
+static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       int i;
+
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                       lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+                               struct rt6_info *entry = rt6_ex->rt6i;
+                               /* For RTF_CACHE with rt6i_pmtu == 0
+                                * (i.e. a redirected route),
+                                * the metrics of its rt->dst.from has already
+                                * been updated.
+                                */
+                               if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
+                                       entry->rt6i_pmtu = mtu;
+                       }
+                       bucket++;
+               }
+       }
+}
+
+#define RTF_CACHE_GATEWAY      (RTF_GATEWAY | RTF_CACHE)
+
+static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
+                                       struct in6_addr *gateway)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+               return;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                    lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry_safe(rt6_ex, tmp,
+                                                 &bucket->chain, hlist) {
+                               struct rt6_info *entry = rt6_ex->rt6i;
+
+                               if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
+                                   RTF_CACHE_GATEWAY &&
+                                   ipv6_addr_equal(gateway,
+                                                   &entry->rt6i_gateway)) {
+                                       rt6_remove_exception(bucket, rt6_ex);
+                               }
+                       }
+                       bucket++;
+               }
+       }
+
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
+static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
+                                     struct rt6_exception *rt6_ex,
+                                     struct fib6_gc_args *gc_args,
+                                     unsigned long now)
+{
+       struct rt6_info *rt = rt6_ex->rt6i;
+
+       if (atomic_read(&rt->dst.__refcnt) == 1 &&
+           time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+               RT6_TRACE("aging clone %p\n", rt);
+               rt6_remove_exception(bucket, rt6_ex);
+               return;
+       } else if (rt->rt6i_flags & RTF_GATEWAY) {
+               struct neighbour *neigh;
+               __u8 neigh_flags = 0;
+
+               neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
+               if (neigh) {
+                       neigh_flags = neigh->flags;
+                       neigh_release(neigh);
+               }
+               if (!(neigh_flags & NTF_ROUTER)) {
+                       RT6_TRACE("purging route %p via non-router but gateway\n",
+                                 rt);
+                       rt6_remove_exception(bucket, rt6_ex);
+                       return;
+               }
+       }
+       gc_args->more++;
+}
+
+void rt6_age_exceptions(struct rt6_info *rt,
+                       struct fib6_gc_args *gc_args,
+                       unsigned long now)
+{
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+       struct hlist_node *tmp;
+       int i;
+
+       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+               return;
+
+       spin_lock_bh(&rt6_exception_lock);
+       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+                                   lockdep_is_held(&rt6_exception_lock));
+
+       if (bucket) {
+               for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+                       hlist_for_each_entry_safe(rt6_ex, tmp,
+                                                 &bucket->chain, hlist) {
+                               rt6_age_examine_exception(bucket, rt6_ex,
+                                                         gc_args, now);
+                       }
+                       bucket++;
+               }
+       }
+       spin_unlock_bh(&rt6_exception_lock);
+}
+
 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6, int flags)
 {
        struct fib6_node *fn, *saved_fn;
-       struct rt6_info *rt;
+       struct rt6_info *rt, *rt_cache;
        int strict = 0;
 
        strict |= flags & RT6_LOOKUP_F_IFACE;
@@ -1103,7 +1638,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
        if (net->ipv6.devconf_all->forwarding == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
 
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        saved_fn = fn;
@@ -1112,7 +1647,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                oif = 0;
 
 redo_rt6_select:
-       rt = rt6_select(fn, oif, strict);
+       rt = rt6_select(net, fn, oif, strict);
        if (rt->rt6i_nsiblings)
                rt = rt6_multipath_select(rt, fl6, oif, strict);
        if (rt == net->ipv6.ip6_null_entry) {
@@ -1127,13 +1662,22 @@ redo_rt6_select:
                }
        }
 
+       /* Search through exception table */
+       rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
+       if (rt_cache)
+               rt = rt_cache;
 
-       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
-               dst_use(&rt->dst, jiffies);
-               read_unlock_bh(&table->tb6_lock);
-
-               rt6_dst_from_metrics_check(rt);
-
+       if (rt == net->ipv6.ip6_null_entry) {
+               rcu_read_unlock();
+               dst_hold(&rt->dst);
+               trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+               return rt;
+       } else if (rt->rt6i_flags & RTF_CACHE) {
+               if (ip6_hold_safe(net, &rt, true)) {
+                       dst_use_noref(&rt->dst, jiffies);
+                       rt6_dst_from_metrics_check(rt);
+               }
+               rcu_read_unlock();
                trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
                return rt;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
@@ -1146,8 +1690,14 @@ redo_rt6_select:
 
                struct rt6_info *uncached_rt;
 
-               dst_use(&rt->dst, jiffies);
-               read_unlock_bh(&table->tb6_lock);
+               if (ip6_hold_safe(net, &rt, true)) {
+                       dst_use_noref(&rt->dst, jiffies);
+               } else {
+                       rcu_read_unlock();
+                       uncached_rt = rt;
+                       goto uncached_rt_out;
+               }
+               rcu_read_unlock();
 
                uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
                dst_release(&rt->dst);
@@ -1157,11 +1707,13 @@ redo_rt6_select:
                         * No need for another dst_hold()
                         */
                        rt6_uncached_list_add(uncached_rt);
+                       atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
                } else {
                        uncached_rt = net->ipv6.ip6_null_entry;
                        dst_hold(&uncached_rt->dst);
                }
 
+uncached_rt_out:
                trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
                return uncached_rt;
 
@@ -1170,26 +1722,28 @@ redo_rt6_select:
 
                struct rt6_info *pcpu_rt;
 
-               rt->dst.lastuse = jiffies;
-               rt->dst.__use++;
+               dst_use_noref(&rt->dst, jiffies);
+               local_bh_disable();
                pcpu_rt = rt6_get_pcpu_route(rt);
 
-               if (pcpu_rt) {
-                       read_unlock_bh(&table->tb6_lock);
-               } else {
-                       /* We have to do the read_unlock first
-                        * because rt6_make_pcpu_route() may trigger
-                        * ip6_dst_gc() which will take the write_lock.
-                        */
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       pcpu_rt = rt6_make_pcpu_route(rt);
-                       dst_release(&rt->dst);
+               if (!pcpu_rt) {
+                       /* atomic_inc_not_zero() is needed when using rcu */
+                       if (atomic_inc_not_zero(&rt->rt6i_ref)) {
+                               /* No dst_hold() on rt is needed because grabbing
+                                * rt->rt6i_ref makes sure rt can't be released.
+                                */
+                               pcpu_rt = rt6_make_pcpu_route(rt);
+                               rt6_release(rt);
+                       } else {
+                               /* rt is already removed from tree */
+                               pcpu_rt = net->ipv6.ip6_null_entry;
+                               dst_hold(&pcpu_rt->dst);
+                       }
                }
-
+               local_bh_enable();
+               rcu_read_unlock();
                trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
                return pcpu_rt;
-
        }
 }
 EXPORT_SYMBOL_GPL(ip6_pol_route);
@@ -1325,9 +1879,10 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
        struct dst_entry *new = NULL;
 
        rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
-                      DST_OBSOLETE_NONE, 0);
+                      DST_OBSOLETE_DEAD, 0);
        if (rt) {
                rt6_info_init(rt);
+               atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
 
                new = &rt->dst;
                new->__use = 1;
@@ -1491,23 +2046,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 
        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                rt6_do_update_pmtu(rt6, mtu);
+               /* update rt6_ex->stamp for cache */
+               if (rt6->rt6i_flags & RTF_CACHE)
+                       rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                struct rt6_info *nrt6;
 
                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
-
-                       /* ip6_ins_rt(nrt6) will bump the
-                        * rt6->rt6i_node->fn_sernum
-                        * which will fail the next rt6_check() and
-                        * invalidate the sk->sk_dst_cache.
-                        */
-                       ip6_ins_rt(nrt6);
-                       /* Release the reference taken in
-                        * ip6_rt_cache_alloc()
-                        */
-                       dst_release(&nrt6->dst);
+                       if (rt6_insert_exception(nrt6, rt6))
+                               dst_release_immediate(&nrt6->dst);
                }
        }
 }
@@ -1571,7 +2120,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             int flags)
 {
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
-       struct rt6_info *rt;
+       struct rt6_info *rt, *rt_cache;
        struct fib6_node *fn;
 
        /* Get the "current" route for this destination and
@@ -1584,10 +2133,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
         * routes.
         */
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 restart:
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt6_check_expired(rt))
                        continue;
                if (rt->dst.error)
@@ -1596,8 +2145,23 @@ restart:
                        continue;
                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
                        continue;
-               if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
+               /* rt_cache's gateway might be different from its 'parent'
+                * in the case of an ip redirect.
+                * So we keep searching in the exception table if the gateway
+                * is different.
+                */
+               if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
+                       rt_cache = rt6_find_cached_rt(rt,
+                                                     &fl6->daddr,
+                                                     &fl6->saddr);
+                       if (rt_cache &&
+                           ipv6_addr_equal(&rdfl->gateway,
+                                           &rt_cache->rt6i_gateway)) {
+                               rt = rt_cache;
+                               break;
+                       }
                        continue;
+               }
                break;
        }
 
@@ -1615,9 +2179,9 @@ restart:
        }
 
 out:
-       dst_hold(&rt->dst);
+       ip6_hold_safe(net, &rt, true);
 
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
        return rt;
@@ -1766,6 +2330,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);
+       atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
 
        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
 
@@ -2216,9 +2781,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
        }
 
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
        err = fib6_del(rt, info);
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 
 out:
        ip6_rt_put(rt);
@@ -2244,7 +2809,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
        if (rt == net->ipv6.ip6_null_entry)
                goto out_put;
        table = rt->rt6i_table;
-       write_lock_bh(&table->tb6_lock);
+       spin_lock_bh(&table->tb6_lock);
 
        if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
                struct rt6_info *sibling, *next_sibling;
@@ -2274,7 +2839,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
 
        err = fib6_del(rt, info);
 out_unlock:
-       write_unlock_bh(&table->tb6_lock);
+       spin_unlock_bh(&table->tb6_lock);
 out_put:
        ip6_rt_put(rt);
 
@@ -2288,9 +2853,9 @@ out_put:
 static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
 {
+       struct rt6_info *rt, *rt_cache;
        struct fib6_table *table;
        struct fib6_node *fn;
-       struct rt6_info *rt;
        int err = -ESRCH;
 
        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
@@ -2299,17 +2864,22 @@ static int ip6_route_del(struct fib6_config *cfg,
                return err;
        }
 
-       read_lock_bh(&table->tb6_lock);
+       rcu_read_lock();
 
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
-                        &cfg->fc_src, cfg->fc_src_len);
+                        &cfg->fc_src, cfg->fc_src_len,
+                        !(cfg->fc_flags & RTF_CACHE));
 
        if (fn) {
-               for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
-                       if ((rt->rt6i_flags & RTF_CACHE) &&
-                           !(cfg->fc_flags & RTF_CACHE))
-                               continue;
+               for_each_fib6_node_rt_rcu(fn) {
+                       if (cfg->fc_flags & RTF_CACHE) {
+                               rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
+                                                             &cfg->fc_src);
+                               if (!rt_cache)
+                                       continue;
+                               rt = rt_cache;
+                       }
                        if (cfg->fc_ifindex &&
                            (!rt->dst.dev ||
                             rt->dst.dev->ifindex != cfg->fc_ifindex))
@@ -2321,8 +2891,9 @@ static int ip6_route_del(struct fib6_config *cfg,
                                continue;
                        if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
                                continue;
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
+                       if (!dst_hold_safe(&rt->dst))
+                               break;
+                       rcu_read_unlock();
 
                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
@@ -2331,7 +2902,7 @@ static int ip6_route_del(struct fib6_config *cfg,
                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        return err;
 }
@@ -2435,8 +3006,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        nrt->rt6i_protocol = RTPROT_REDIRECT;
        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
 
-       if (ip6_ins_rt(nrt))
-               goto out_release;
+       /* No need to remove rt from the exception table if rt is
+        * a cached route because rt6_insert_exception() will
+        * take care of it
+        */
+       if (rt6_insert_exception(nrt, rt)) {
+               dst_release_immediate(&nrt->dst);
+               goto out;
+       }
 
        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
@@ -2444,17 +3021,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
-       if (rt->rt6i_flags & RTF_CACHE) {
-               rt = (struct rt6_info *) dst_clone(&rt->dst);
-               ip6_del_rt(rt);
-       }
-
-out_release:
-       /* Release the reference taken in
-        * ip6_rt_cache_alloc()
-        */
-       dst_release(&nrt->dst);
-
 out:
        neigh_release(neigh);
 }
@@ -2511,23 +3077,23 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
+       rcu_read_lock();
+       fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (!fn)
                goto out;
 
-       for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
+       for_each_fib6_node_rt_rcu(fn) {
                if (rt->dst.dev->ifindex != ifindex)
                        continue;
                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
                        continue;
                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
                        continue;
-               dst_hold(&rt->dst);
+               ip6_hold_safe(NULL, &rt, false);
                break;
        }
 out:
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -2573,16 +3139,16 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
        if (!table)
                return NULL;
 
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+       rcu_read_lock();
+       for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (dev == rt->dst.dev &&
                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
                        break;
        }
        if (rt)
-               dst_hold(&rt->dst);
-       read_unlock_bh(&table->tb6_lock);
+               ip6_hold_safe(NULL, &rt, false);
+       rcu_read_unlock();
        return rt;
 }
 
@@ -2620,17 +3186,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table)
        struct rt6_info *rt;
 
 restart:
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
+       rcu_read_lock();
+       for_each_fib6_node_rt_rcu(&table->tb6_root) {
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
                    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       ip6_del_rt(rt);
+                       if (dst_hold_safe(&rt->dst)) {
+                               rcu_read_unlock();
+                               ip6_del_rt(rt);
+                       } else {
+                               rcu_read_unlock();
+                       }
                        goto restart;
                }
        }
-       read_unlock_bh(&table->tb6_lock);
+       rcu_read_unlock();
 
        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
 }
@@ -2818,8 +3387,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
        if (((void *)rt->dst.dev == dev || !dev) &&
            rt != net->ipv6.ip6_null_entry &&
            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+               spin_lock_bh(&rt6_exception_lock);
                /* remove prefsrc entry */
                rt->rt6i_prefsrc.plen = 0;
+               /* need to update cache as well */
+               rt6_exceptions_remove_prefsrc(rt);
+               spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
 }
@@ -2836,18 +3409,23 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
 }
 
 #define RTF_RA_ROUTER          (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
-#define RTF_CACHE_GATEWAY      (RTF_GATEWAY | RTF_CACHE)
 
 /* Remove routers and update dst entries when gateway turn into host. */
 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
 {
        struct in6_addr *gateway = (struct in6_addr *)arg;
 
-       if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
-            ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
-            ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
+       if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
+           ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
                return -1;
        }
+
+       /* Further clean up cached routes in exception table.
+        * This is needed because a cached route may have a different
+        * gateway than its 'parent' in the case of an ip redirect.
+        */
+       rt6_exceptions_clean_tohost(rt, gateway);
+
        return 0;
 }
 
@@ -2926,19 +3504,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
-               if (rt->rt6i_flags & RTF_CACHE) {
-                       /* For RTF_CACHE with rt6i_pmtu == 0
-                        * (i.e. a redirected route),
-                        * the metrics of its rt->dst.from has already
-                        * been updated.
-                        */
-                       if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
-                               rt->rt6i_pmtu = arg->mtu;
-               } else if (dst_mtu(&rt->dst) >= arg->mtu ||
-                          (dst_mtu(&rt->dst) < arg->mtu &&
-                           dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
+               spin_lock_bh(&rt6_exception_lock);
+               if (dst_mtu(&rt->dst) >= arg->mtu ||
+                   (dst_mtu(&rt->dst) < arg->mtu &&
+                    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
+               rt6_exceptions_update_pmtu(rt, arg->mtu);
+               spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
 }
@@ -3839,7 +4412,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
-                  net->ipv6.rt6_stats->fib_rt_alloc,
+                  atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
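
A recurring detail in the route hunks above is how a reference is taken once a route has been found under rcu_read_lock(): because the writer may already have dropped its last reference, a plain dst_hold() is no longer safe, so the patch goes through ip6_hold_safe()/dst_hold_safe(), which only bump a refcount that is still non-zero and otherwise fall back to ip6_null_entry or NULL. The sketch below shows that idea in isolation; the names are invented for the example (the real code operates on dst.__refcnt).

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_entry {
	atomic_t refcnt;
	struct rcu_head rcu;
};

/* Only succeed if the object still has at least one reference;
 * mirrors what dst_hold_safe() does for a dst found under RCU.
 */
static bool demo_hold_safe(struct demo_entry *e)
{
	return atomic_inc_not_zero(&e->refcnt);
}

static void demo_put(struct demo_entry *e)
{
	if (atomic_dec_and_test(&e->refcnt))
		kfree_rcu(e, rcu);	/* readers may still hold a pointer */
}

A caller that looked the entry up under rcu_read_lock() first calls demo_hold_safe(); only if that succeeds may it keep using the entry after rcu_read_unlock(), which is the shape of the ip6_hold_safe() calls in ip6_pol_route() and ip6_pol_route_lookup() above.
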
index c5b9ce4..9745e8f 100644 (file)
@@ -16,6 +16,7 @@
 #include <net/arp.h>
 #include <net/ip_fib.h>
 #include <net/netevent.h>
+#include <net/ip_tunnels.h>
 #include <net/netns/generic.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -39,6 +40,36 @@ static int one = 1;
 static int label_limit = (1 << 20) - 1;
 static int ttl_max = 255;
 
+#if IS_ENABLED(CONFIG_NET_IP_TUNNEL)
+size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e)
+{
+       return sizeof(struct mpls_shim_hdr);
+}
+
+static const struct ip_tunnel_encap_ops mpls_iptun_ops = {
+       .encap_hlen     = ipgre_mpls_encap_hlen,
+};
+
+static int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+       return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+       ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS);
+}
+#else
+static int ipgre_tunnel_encap_add_mpls_ops(void)
+{
+       return 0;
+}
+
+static void ipgre_tunnel_encap_del_mpls_ops(void)
+{
+}
+#endif
+
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
                       struct nlmsghdr *nlh, struct net *net, u32 portid,
                       unsigned int nlm_flags);
@@ -2485,6 +2516,10 @@ static int __init mpls_init(void)
                      0);
        rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf,
                      mpls_netconf_dump_devconf, 0);
+       err = ipgre_tunnel_encap_add_mpls_ops();
+       if (err)
+               pr_err("Can't add mpls over gre tunnel ops\n");
+
        err = 0;
 out:
        return err;
@@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void)
        dev_remove_pack(&mpls_packet_type);
        unregister_netdevice_notifier(&mpls_dev_notifier);
        unregister_pernet_subsys(&mpls_net_ops);
+       ipgre_tunnel_encap_del_mpls_ops();
 }
 module_exit(mpls_exit);
 
index e495b5e..cf84f7b 100644 (file)
@@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
              from->family == to->family))
                return -IPSET_ERR_TYPE_MISMATCH;
 
-       if (from->ref_netlink || to->ref_netlink)
+       write_lock_bh(&ip_set_ref_lock);
+
+       if (from->ref_netlink || to->ref_netlink) {
+               write_unlock_bh(&ip_set_ref_lock);
                return -EBUSY;
+       }
 
        strncpy(from_name, from->name, IPSET_MAXNAMELEN);
        strncpy(from->name, to->name, IPSET_MAXNAMELEN);
        strncpy(to->name, from_name, IPSET_MAXNAMELEN);
 
-       write_lock_bh(&ip_set_ref_lock);
        swap(from->ref, to->ref);
        ip_set(inst, from_id) = to;
        ip_set(inst, to_id) = from;
@@ -2072,25 +2075,28 @@ static struct pernet_operations ip_set_net_ops = {
 static int __init
 ip_set_init(void)
 {
-       int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+       int ret = register_pernet_subsys(&ip_set_net_ops);
+
+       if (ret) {
+               pr_err("ip_set: cannot register pernet_subsys.\n");
+               return ret;
+       }
 
+       ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
        if (ret != 0) {
                pr_err("ip_set: cannot register with nfnetlink.\n");
+               unregister_pernet_subsys(&ip_set_net_ops);
                return ret;
        }
+
        ret = nf_register_sockopt(&so_set);
        if (ret != 0) {
                pr_err("SO_SET registry failed: %d\n", ret);
                nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+               unregister_pernet_subsys(&ip_set_net_ops);
                return ret;
        }
-       ret = register_pernet_subsys(&ip_set_net_ops);
-       if (ret) {
-               pr_err("ip_set: cannot register pernet_subsys.\n");
-               nf_unregister_sockopt(&so_set);
-               nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-               return ret;
-       }
+
        pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
        return 0;
 }
@@ -2098,9 +2104,10 @@ ip_set_init(void)
 static void __exit
 ip_set_fini(void)
 {
-       unregister_pernet_subsys(&ip_set_net_ops);
        nf_unregister_sockopt(&so_set);
        nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+
+       unregister_pernet_subsys(&ip_set_net_ops);
        pr_debug("these are the famous last words\n");
 }
 
index 20bfbd3..613eb21 100644
@@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
                return ret;
 
        ip &= ip_set_hostmask(h->netmask);
+       e.ip = htonl(ip);
+       if (e.ip == 0)
+               return -IPSET_ERR_HASH_ELEM;
 
-       if (adt == IPSET_TEST) {
-               e.ip = htonl(ip);
-               if (e.ip == 0)
-                       return -IPSET_ERR_HASH_ELEM;
+       if (adt == IPSET_TEST)
                return adtfn(set, &e, &ext, &ext, flags);
-       }
 
        ip_to = ip;
        if (tb[IPSET_ATTR_IP_TO]) {
@@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
 
-       if (retried)
+       if (retried) {
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip += hosts) {
                e.ip = htonl(ip);
-               if (e.ip == 0)
-                       return -IPSET_ERR_HASH_ELEM;
+       }
+       for (; ip <= ip_to;) {
                ret = adtfn(set, &e, &ext, &ext, flags);
-
                if (ret && !ip_set_eexist(ret, flags))
                        return ret;
 
+               ip += hosts;
+               e.ip = htonl(ip);
+               if (e.ip == 0)
+                       return 0;
+
                ret = 0;
        }
        return ret;
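
The loop rewrite above does two things: the signed before()/after() helpers are replaced by plain unsigned comparisons (the signed forms misbehave once a range spans more than 2^31 addresses), and since "ip <= ip_to" can never become false when ip_to is 255.255.255.255, the post-increment wraparound check is what terminates the scan. A minimal user-space sketch of that termination argument (illustrative only, not the ipset code itself):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ip = 0xfffffffeu, ip_to = 0xffffffffu, hosts = 1;
	unsigned int visited = 0;

	for (;;) {
		visited++;		/* stands in for the adtfn() call */
		ip += hosts;
		if (ip == 0)		/* wrapped past 255.255.255.255 */
			break;
		if (ip > ip_to)
			break;
	}
	printf("visited %u addresses\n", visited);	/* prints 2 */
	return 0;
}
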
index b64cf14..f3ba834 100644
@@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                e.ip = htonl(ip);
                ret = adtfn(set, &e, &ext, &ext, flags);
 
index f438740..ddb8039 100644
@@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
                                                       : port;
                for (; p <= port_to; p++) {
index 6215fb8..a7f4d7a 100644
@@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
                                                       : port;
                for (; p <= port_to; p++) {
index 5ab1b99..a2f19b9 100644
@@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       for (; !before(ip_to, ip); ip++) {
+       for (; ip <= ip_to; ip++) {
                e.ip = htonl(ip);
                p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
                                                       : port;
@@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
                              ip == ntohl(h->next.ip) &&
                              p == ntohs(h->next.port)
                                ? ntohl(h->next.ip2) : ip2_from;
-                       while (!after(ip2, ip2_to)) {
+                       while (ip2 <= ip2_to) {
                                e.ip2 = htonl(ip2);
                                ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
                                                                &cidr);
index 5d9e895..1c67a17 100644
@@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
        }
        if (retried)
                ip = ntohl(h->next.ip);
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
                ret = adtfn(set, &e, &ext, &ext, flags);
index 44cf119..d417074 100644
@@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
                ret = adtfn(set, &e, &ext, &ext, flags);
index db614e1..7f9ae2e 100644
@@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (retried)
                ip = ntohl(h->next.ip[0]);
 
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip[0] = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
                ip2 = (retried &&
                       ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
                                                   : ip2_from;
-               while (!after(ip2, ip2_to)) {
+               while (ip2 <= ip2_to) {
                        e.ip[1] = htonl(ip2);
                        last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
                        ret = adtfn(set, &e, &ext, &ext, flags);
index 54b64b6..e6ef382 100644
@@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
        if (retried)
                ip = ntohl(h->next.ip);
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip = htonl(ip);
                last = ip_set_range_to_cidr(ip, ip_to, &cidr);
                e.cidr = cidr - 1;
index aff8469..8602f25 100644
@@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
        if (retried)
                ip = ntohl(h->next.ip[0]);
 
-       while (!after(ip, ip_to)) {
+       while (ip <= ip_to) {
                e.ip[0] = htonl(ip);
                ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
                p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
@@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
                        ip2 = (retried && ip == ntohl(h->next.ip[0]) &&
                               p == ntohs(h->next.port)) ? ntohl(h->next.ip[1])
                                                         : ip2_from;
-                       while (!after(ip2, ip2_to)) {
+                       while (ip2 <= ip2_to) {
                                e.ip[1] = htonl(ip2);
                                ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
                                                                &e.cidr[1]);
index 90d3968..4527921 100644
@@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 {
        struct sk_buff *new_skb = NULL;
        struct iphdr *old_iph = NULL;
+       __u8 old_dsfield;
 #ifdef CONFIG_IP_VS_IPV6
        struct ipv6hdr *old_ipv6h = NULL;
 #endif
@@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
                        *payload_len =
                                ntohs(old_ipv6h->payload_len) +
                                sizeof(*old_ipv6h);
-               *dsfield = ipv6_get_dsfield(old_ipv6h);
+               old_dsfield = ipv6_get_dsfield(old_ipv6h);
                *ttl = old_ipv6h->hop_limit;
                if (df)
                        *df = 0;
@@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 
                /* fix old IP header checksum */
                ip_send_check(old_iph);
-               *dsfield = ipv4_get_dsfield(old_iph);
+               old_dsfield = ipv4_get_dsfield(old_iph);
                *ttl = old_iph->ttl;
                if (payload_len)
                        *payload_len = ntohs(old_iph->tot_len);
        }
 
+       /* Implement full-functionality option for ECN encapsulation */
+       *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);
+
        return skb;
 error:
        kfree_skb(skb);
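
The *dsfield assignment added above applies the "full-functionality" ECN treatment named in the comment: the outer tunnel header keeps the inner DSCP and copies the inner ECN codepoint, except that CE is re-marked as ECT(0). A simplified user-space sketch of that mapping (not the kernel's INET_ECN_encapsulate() itself):

#include <stdint.h>
#include <stdio.h>

#define ECN_MASK 0x03
#define ECN_ECT0 0x02
#define ECN_CE   0x03

static uint8_t ecn_encapsulate(uint8_t outer, uint8_t inner)
{
	outer &= ~ECN_MASK;			/* keep the outer DSCP bits */
	outer |= ((inner & ECN_MASK) == ECN_CE) ? ECN_ECT0
						: (inner & ECN_MASK);
	return outer;
}

int main(void)
{
	uint8_t inner = 0x2b;	/* some DSCP value with the CE codepoint set */

	/* prints 0x2a: DSCP preserved, CE encoded as ECT(0) in the outer header */
	printf("outer dsfield: 0x%02x\n", ecn_encapsulate(inner, inner));
	return 0;
}
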
index 9299271..64e1ee0 100644
@@ -1048,7 +1048,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
                if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
                        goto nla_put_failure;
 
-               if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+               if (basechain->stats && nft_dump_stats(skb, basechain->stats))
                        goto nla_put_failure;
        }
 
@@ -1487,8 +1487,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 
                chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME],
                                                genmask);
-               if (IS_ERR(chain2))
-                       return PTR_ERR(chain2);
+               if (!IS_ERR(chain2))
+                       return -EEXIST;
        }
 
        if (nla[NFTA_CHAIN_COUNTERS]) {
@@ -2741,8 +2741,10 @@ cont:
        list_for_each_entry(i, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, i))
                        continue;
-               if (!strcmp(set->name, i->name))
+               if (!strcmp(set->name, i->name)) {
+                       kfree(set->name);
                        return -ENFILE;
+               }
        }
        return 0;
 }
index c83a3b5..d8571f4 100644
@@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
                if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
                        return ERR_PTR(-EFAULT);
 
-               strlcpy(info->name, compat_tmp.name, sizeof(info->name));
+               memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1);
                info->num_counters = compat_tmp.num_counters;
                user += sizeof(compat_tmp);
        } else
@@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
                if (copy_from_user(info, user, sizeof(*info)) != 0)
                        return ERR_PTR(-EFAULT);
 
-               info->name[sizeof(info->name) - 1] = '\0';
                user += sizeof(*info);
        }
+       info->name[sizeof(info->name) - 1] = '\0';
 
        size = sizeof(struct xt_counters);
        size *= info->num_counters;
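
The strlcpy() above is replaced by a bounded memcpy() and the NUL termination is hoisted so it covers both the compat and native branches: the name originates in user space and may not be terminated, so nothing should take its string length before copying. A small illustration of the resulting copy pattern (user-space sketch, not the netfilter code):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char src[8];
	char dst[8];

	memset(src, 'A', sizeof(src));		/* no terminating NUL */
	memcpy(dst, src, sizeof(dst) - 1);	/* bounded copy, no strlen(src) */
	dst[sizeof(dst) - 1] = '\0';		/* terminate exactly once */

	printf("copied name: %s\n", dst);	/* prints AAAAAAA */
	return 0;
}
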
index 38986a9..2912393 100644
@@ -8,6 +8,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/syscalls.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
        return 0;
 }
 
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+       mm_segment_t oldfs = get_fs();
+       int retval, fd;
+
+       set_fs(KERNEL_DS);
+       fd = bpf_obj_get_user(path);
+       set_fs(oldfs);
+       if (fd < 0)
+               return fd;
+
+       retval = __bpf_mt_check_fd(fd, ret);
+       sys_close(fd);
+       return retval;
+}
+
 static int bpf_mt_check(const struct xt_mtchk_param *par)
 {
        struct xt_bpf_info *info = par->matchinfo;
@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
                return __bpf_mt_check_bytecode(info->bpf_program,
                                               info->bpf_program_num_elem,
                                               &info->filter);
-       else if (info->mode == XT_BPF_MODE_FD_PINNED ||
-                info->mode == XT_BPF_MODE_FD_ELF)
+       else if (info->mode == XT_BPF_MODE_FD_ELF)
                return __bpf_mt_check_fd(info->fd, &info->filter);
+       else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+               return __bpf_mt_check_path(info->path, &info->filter);
        else
                return -EINVAL;
 }
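
The new XT_BPF_MODE_PATH_PINNED mode looks a program up by its bpffs pin path rather than by an already-open fd. A hedged user-space sketch of how such a pin is created in the first place, using the raw bpf(2) BPF_OBJ_PIN command (the pin path and the prog_fd value are illustrative; assumes bpffs is mounted at /sys/fs/bpf):

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int bpf_obj_pin_fd(int prog_fd, const char *path)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.bpf_fd = prog_fd;
	attr.pathname = (__u64)(unsigned long)path;

	return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

int main(void)
{
	/* With a real loaded program fd this would create the pin; -1 here
	 * only demonstrates the call shape and fails with EBADF. */
	if (bpf_obj_pin_fd(-1, "/sys/fs/bpf/xt_bpf_prog") < 0)
		perror("BPF_OBJ_PIN");
	return 0;
}
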
index e75ef39..575d215 100644
@@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
                        transparent = nf_sk_is_transparent(sk);
 
                if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
-                   transparent)
+                   transparent && sk_fullsock(sk))
                        pskb->mark = sk->sk_mark;
 
                if (sk != skb->sk)
@@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
                        transparent = nf_sk_is_transparent(sk);
 
                if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
-                   transparent)
+                   transparent && sk_fullsock(sk))
                        pskb->mark = sk->sk_mark;
 
                if (sk != skb->sk)
index 94c11cf..f347506 100644
@@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
        cb->min_dump_alloc = control->min_dump_alloc;
        cb->skb = skb;
 
+       if (cb->start) {
+               ret = cb->start(cb);
+               if (ret)
+                       goto error_unlock;
+       }
+
        nlk->cb_running = true;
 
        mutex_unlock(nlk->cb_mutex);
 
-       ret = 0;
-       if (cb->start)
-               ret = cb->start(cb);
-
-       if (!ret)
-               ret = netlink_dump(sk);
+       ret = netlink_dump(sk);
 
        sock_put(sk);
 
index a54a556..a551232 100644
@@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
                                return err == -EINPROGRESS ? 0 : err;
                        break;
 
+               case OVS_ACTION_ATTR_CT_CLEAR:
+                       err = ovs_ct_clear(skb, key);
+                       break;
+
                case OVS_ACTION_ATTR_PUSH_ETH:
                        err = push_eth(skb, key, nla_data(a));
                        break;
index d558e88..fe861e2 100644
@@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
        return err;
 }
 
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
+{
+       if (skb_nfct(skb)) {
+               nf_conntrack_put(skb_nfct(skb));
+               nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+               ovs_ct_fill_key(skb, key);
+       }
+
+       return 0;
+}
+
 static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
                             const struct sw_flow_key *key, bool log)
 {
index bc7efd1..399dfdd 100644
@@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *);
 
 int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
                   const struct ovs_conntrack_info *);
+int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key);
 
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);
 int ovs_ct_put_key(const struct sw_flow_key *swkey,
@@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,
        return -ENOTSUPP;
 }
 
+static inline int ovs_ct_clear(struct sk_buff *skb,
+                              struct sw_flow_key *key)
+{
+       return -ENOTSUPP;
+}
+
 static inline void ovs_ct_fill_key(const struct sk_buff *skb,
                                   struct sw_flow_key *key)
 {
index e8eb427..dc0d790 100644
@@ -48,6 +48,7 @@
 #include <net/ndisc.h>
 #include <net/mpls.h>
 #include <net/vxlan.h>
+#include <net/erspan.h>
 
 #include "flow_netlink.h"
 
@@ -75,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
                        break;
 
                case OVS_ACTION_ATTR_CT:
+               case OVS_ACTION_ATTR_CT_CLEAR:
                case OVS_ACTION_ATTR_HASH:
                case OVS_ACTION_ATTR_POP_ETH:
                case OVS_ACTION_ATTR_POP_MPLS:
@@ -319,7 +321,8 @@ size_t ovs_tun_key_attr_size(void)
                 * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it.
                 */
                + nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_SRC */
-               + nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+               + nla_total_size(2)    /* OVS_TUNNEL_KEY_ATTR_TP_DST */
+               + nla_total_size(4);   /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */
 }
 
 size_t ovs_key_attr_size(void)
@@ -371,6 +374,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]
                                                .next = ovs_vxlan_ext_key_lens },
        [OVS_TUNNEL_KEY_ATTR_IPV6_SRC]      = { .len = sizeof(struct in6_addr) },
        [OVS_TUNNEL_KEY_ATTR_IPV6_DST]      = { .len = sizeof(struct in6_addr) },
+       [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS]   = { .len = sizeof(u32) },
 };
 
 /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  */
@@ -593,6 +597,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
        return 0;
 }
 
+static int erspan_tun_opt_from_nlattr(const struct nlattr *attr,
+                                     struct sw_flow_match *match, bool is_mask,
+                                     bool log)
+{
+       unsigned long opt_key_offset;
+       struct erspan_metadata opts;
+
+       BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
+
+       memset(&opts, 0, sizeof(opts));
+       opts.index = nla_get_be32(attr);
+
+       /* Index has only 20 bits */
+       if (ntohl(opts.index) & ~INDEX_MASK) {
+               OVS_NLERR(log, "ERSPAN index number %x too large.",
+                         ntohl(opts.index));
+               return -EINVAL;
+       }
+
+       SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask);
+       opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts));
+       SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts),
+                                 is_mask);
+
+       return 0;
+}
+
 static int ip_tun_from_nlattr(const struct nlattr *attr,
                              struct sw_flow_match *match, bool is_mask,
                              bool log)
@@ -700,6 +731,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
                        break;
                case OVS_TUNNEL_KEY_ATTR_PAD:
                        break;
+               case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+                       if (opts_type) {
+                               OVS_NLERR(log, "Multiple metadata blocks provided");
+                               return -EINVAL;
+                       }
+
+                       err = erspan_tun_opt_from_nlattr(a, match, is_mask, log);
+                       if (err)
+                               return err;
+
+                       tun_flags |= TUNNEL_ERSPAN_OPT;
+                       opts_type = type;
+                       break;
                default:
                        OVS_NLERR(log, "Unknown IP tunnel attribute %d",
                                  type);
@@ -824,6 +868,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
                else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
                         vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
                        return -EMSGSIZE;
+               else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
+                        nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
+                                     ((struct erspan_metadata *)tun_opts)->index))
+                       return -EMSGSIZE;
        }
 
        return 0;
@@ -2195,6 +2243,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
                        break;
                case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
                        break;
+               case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
+                       break;
                }
        };
 
@@ -2479,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
                        [OVS_ACTION_ATTR_SAMPLE] = (u32)-1,
                        [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash),
                        [OVS_ACTION_ATTR_CT] = (u32)-1,
+                       [OVS_ACTION_ATTR_CT_CLEAR] = 0,
                        [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc),
                        [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth),
                        [OVS_ACTION_ATTR_POP_ETH] = 0,
@@ -2620,6 +2671,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
                        skip_copy = true;
                        break;
 
+               case OVS_ACTION_ATTR_CT_CLEAR:
+                       break;
+
                case OVS_ACTION_ATTR_PUSH_ETH:
                        /* Disallow pushing an Ethernet header if one
                         * is already present */
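
erspan_tun_opt_from_nlattr() above rejects any session index with bits set outside the low 20. A small host-order sketch of that bound check (INDEX_MASK is assumed to be the 20-bit mask 0x000fffff, matching the "only 20 bits" comment):

#include <stdint.h>
#include <stdio.h>

#define INDEX_MASK 0x000fffffu		/* assumed 20-bit ERSPAN index mask */

int main(void)
{
	uint32_t ok  = 0x000fffffu;	/* largest index that still fits */
	uint32_t bad = 0x00100000u;	/* one past the 20-bit range */

	printf("0x%06x: %s\n", ok,  (ok  & ~INDEX_MASK) ? "too large" : "ok");
	printf("0x%06x: %s\n", bad, (bad & ~INDEX_MASK) ? "too large" : "ok");
	return 0;
}
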
index f925753..3b0ef69 100644
 #include <net/phonet/pn_dev.h>
 
 /* Transport protocol registration */
-static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
 
-static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
+static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
 {
-       struct phonet_protocol *pp;
+       const struct phonet_protocol *pp;
 
        if (protocol >= PHONET_NPROTO)
                return NULL;
@@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol)
        return pp;
 }
 
-static inline void phonet_proto_put(struct phonet_protocol *pp)
+static inline void phonet_proto_put(const struct phonet_protocol *pp)
 {
        module_put(pp->prot->owner);
 }
@@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol,
 {
        struct sock *sk;
        struct pn_sock *pn;
-       struct phonet_protocol *pnp;
+       const struct phonet_protocol *pnp;
        int err;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr)
        return 1;
 }
 
-struct header_ops phonet_header_ops = {
+const struct header_ops phonet_header_ops = {
        .create = pn_header_create,
        .parse = pn_header_parse,
 };
@@ -470,7 +470,7 @@ static struct packet_type phonet_packet_type __read_mostly = {
 static DEFINE_MUTEX(proto_tab_lock);
 
 int __init_or_module phonet_proto_register(unsigned int protocol,
-                                               struct phonet_protocol *pp)
+                               const struct phonet_protocol *pp)
 {
        int err = 0;
 
@@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol,
 }
 EXPORT_SYMBOL(phonet_proto_register);
 
-void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp)
+void phonet_proto_unregister(unsigned int protocol,
+                       const struct phonet_protocol *pp)
 {
        mutex_lock(&proto_tab_lock);
        BUG_ON(proto_tab[protocol] != pp);
index 5e71043..b44fb90 100644
@@ -195,7 +195,7 @@ static struct proto pn_proto = {
        .name           = "PHONET",
 };
 
-static struct phonet_protocol pn_dgram_proto = {
+static const struct phonet_protocol pn_dgram_proto = {
        .ops            = &phonet_dgram_ops,
        .prot           = &pn_proto,
        .sock_type      = SOCK_DGRAM,
index e815379..9fc76b1 100644
@@ -1351,7 +1351,7 @@ static struct proto pep_proto = {
        .name           = "PNPIPE",
 };
 
-static struct phonet_protocol pep_pn_proto = {
+static const struct phonet_protocol pep_pn_proto = {
        .ops            = &phonet_stream_ops,
        .prot           = &pep_proto,
        .sock_type      = SOCK_SEQPACKET,
index 5a4f100..db0228a 100644
@@ -148,12 +148,6 @@ struct netem_skb_cb {
        psched_time_t   time_to_send;
 };
 
-
-static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
-{
-       return rb_entry(rb, struct sk_buff, rbnode);
-}
-
 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 {
        /* we assume we can use skb next/prev/tstamp as storage for rb_node */
@@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch)
        struct rb_node *p = rb_first(&q->t_root);
 
        while (p) {
-               struct sk_buff *skb = netem_rb_to_skb(p);
+               struct sk_buff *skb = rb_to_skb(p);
 
                p = rb_next(p);
                rb_erase(&skb->rbnode, &q->t_root);
@@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
                struct sk_buff *skb;
 
                parent = *p;
-               skb = netem_rb_to_skb(parent);
+               skb = rb_to_skb(parent);
                if (tnext >= netem_skb_cb(skb)->time_to_send)
                        p = &parent->rb_right;
                else
@@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff *t_skb;
                                struct netem_skb_cb *t_last;
 
-                               t_skb = netem_rb_to_skb(rb_last(&q->t_root));
+                               t_skb = skb_rb_last(&q->t_root);
                                t_last = netem_skb_cb(t_skb);
                                if (!last ||
                                    t_last->time_to_send > last->time_to_send) {
@@ -617,7 +611,7 @@ deliver:
        if (p) {
                psched_time_t time_to_send;
 
-               skb = netem_rb_to_skb(p);
+               skb = rb_to_skb(p);
 
                /* if more time remaining? */
                time_to_send = netem_skb_cb(skb)->time_to_send;
index 9b5de31..c1841f2 100644
@@ -2203,7 +2203,7 @@ static void xs_udp_setup_socket(struct work_struct *work)
        struct sock_xprt *transport =
                container_of(work, struct sock_xprt, connect_worker.work);
        struct rpc_xprt *xprt = &transport->xprt;
-       struct socket *sock = transport->sock;
+       struct socket *sock;
        int status = -EIO;
 
        sock = xs_create_sock(xprt, transport,
index 7d99029..a140dd4 100644
@@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts,
        struct sk_buff_head xmitq;
        int rc = 0;
 
-       __skb_queue_head_init(&xmitq);
+       skb_queue_head_init(&xmitq);
        tipc_bcast_lock(net);
        if (tipc_link_bc_peers(l))
                rc = tipc_link_xmit(l, pkts, &xmitq);
@@ -263,7 +263,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts,
        u32 dst, selector;
 
        selector = msg_link_selector(buf_msg(skb_peek(pkts)));
-       __skb_queue_head_init(&_pkts);
+       skb_queue_head_init(&_pkts);
 
        list_for_each_entry_safe(n, tmp, &dests->list, list) {
                dst = n->value;
index 121e59a..17146c1 100644
@@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
        msg_set_destnode(msg, dnode);
        msg_set_destport(msg, dport);
        *err = TIPC_OK;
+
+       if (!skb_cloned(skb))
+               return true;
+
+       /* Unclone buffer in case it was bundled */
+       if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
+               return false;
+
        return true;
 }
 
index 67a03f2..fce2cbe 100644
@@ -549,6 +549,14 @@ nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = {
        [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED },
 };
 
+/* policy for packet pattern attributes */
+static const struct nla_policy
+nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = {
+       [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, },
+       [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, },
+       [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 },
+};
+
 static int nl80211_prepare_wdev_dump(struct sk_buff *skb,
                                     struct netlink_callback *cb,
                                     struct cfg80211_registered_device **rdev,
@@ -10571,7 +10579,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
                        u8 *mask_pat;
 
                        nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
-                                        NULL, info->extack);
+                                        nl80211_packet_pattern_policy,
+                                        info->extack);
                        err = -EINVAL;
                        if (!pat_tb[NL80211_PKTPAT_MASK] ||
                            !pat_tb[NL80211_PKTPAT_PATTERN])
@@ -10820,7 +10829,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev,
                            rem) {
                u8 *mask_pat;
 
-               nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL);
+               nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat,
+                                nl80211_packet_pattern_policy, NULL);
                if (!pat_tb[NL80211_PKTPAT_MASK] ||
                    !pat_tb[NL80211_PKTPAT_PATTERN])
                        return -EINVAL;
index acf0010..30e5746 100644
@@ -91,6 +91,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
        }
 
        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
+               xso->dev = NULL;
                dev_put(dev);
                return 0;
        }
index 2515cd2..8ac9d32 100644
@@ -429,7 +429,8 @@ resume:
        nf_reset(skb);
 
        if (decaps) {
-               skb->sp->olen = 0;
+               if (skb->sp)
+                       skb->sp->olen = 0;
                skb_dst_drop(skb);
                gro_cells_receive(&gro_cells, skb);
                return 0;
@@ -440,7 +441,8 @@ resume:
 
                err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async);
                if (xfrm_gro) {
-                       skb->sp->olen = 0;
+                       if (skb->sp)
+                               skb->sp->olen = 0;
                        skb_dst_drop(skb);
                        gro_cells_receive(&gro_cells, skb);
                        return err;
index 0dab1cd..1221347 100644
@@ -732,12 +732,12 @@ restart:
                        }
                }
        }
+out:
+       spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        if (cnt) {
                err = 0;
                xfrm_policy_cache_flush();
        }
-out:
-       spin_unlock_bh(&net->xfrm.xfrm_state_lock);
        return err;
 }
 EXPORT_SYMBOL(xfrm_state_flush);
index 2bfbd91..b997f13 100644
@@ -657,6 +657,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        if (err < 0) {
                x->km.state = XFRM_STATE_DEAD;
+               xfrm_dev_state_delete(x);
                __xfrm_state_put(x);
                goto out;
        }
index 41b6115..a77a583 100644
@@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = {
 SEC("perf_event")
 int bpf_prog1(struct bpf_perf_event_data *ctx)
 {
+       char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+       char time_fmt2[] = "Get Time Failed, ErrCode: %d";
        char fmt[] = "CPU-%d period %lld ip %llx";
        u32 cpu = bpf_get_smp_processor_id();
+       struct bpf_perf_event_value value_buf;
        struct key_t key;
        u64 *val, one = 1;
+       int ret;
 
        if (ctx->sample_period < 10000)
                /* ignore warmup */
@@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
                return 0;
        }
 
+       ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+       if (!ret)
+               bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+       else
+               bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
        val = bpf_map_lookup_elem(&counts, &key);
        if (val)
                (*val)++;
index 7bd827b..bf4f1b6 100644
@@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
        int *pmu_fd = malloc(nr_cpus * sizeof(int));
        int i, error = 0;
 
+       /* system wide perf event, no need to inherit */
+       attr->inherit = 0;
+
        /* open perf_event on all cpus */
        for (i = 0; i < nr_cpus; i++) {
                pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
@@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr)
 {
        int pmu_fd;
 
+       /* per-task perf event, enable inherit so the "dd ..." command can be traced properly.
+        * Enabling inherit will cause the bpf_perf_prog_read_value helper to fail.
+        */
+       attr->inherit = 1;
+
        /* open task bound event */
        pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
        if (pmu_fd < 0) {
@@ -175,14 +183,12 @@ static void test_bpf_perf_event(void)
                .freq = 1,
                .type = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
-               .inherit = 1,
        };
        struct perf_event_attr attr_type_sw = {
                .sample_freq = SAMPLE_FREQ,
                .freq = 1,
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_CPU_CLOCK,
-               .inherit = 1,
        };
        struct perf_event_attr attr_hw_cache_l1d = {
                .sample_freq = SAMPLE_FREQ,
@@ -192,7 +198,6 @@ static void test_bpf_perf_event(void)
                        PERF_COUNT_HW_CACHE_L1D |
                        (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                        (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
-               .inherit = 1,
        };
        struct perf_event_attr attr_hw_cache_branch_miss = {
                .sample_freq = SAMPLE_FREQ,
@@ -202,7 +207,6 @@ static void test_bpf_perf_event(void)
                        PERF_COUNT_HW_CACHE_BPU |
                        (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                        (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
-               .inherit = 1,
        };
        struct perf_event_attr attr_type_raw = {
                .sample_freq = SAMPLE_FREQ,
@@ -210,7 +214,6 @@ static void test_bpf_perf_event(void)
                .type = PERF_TYPE_RAW,
                /* Intel Instruction Retired */
                .config = 0xc0,
-               .inherit = 1,
        };
 
        printf("Test HW_CPU_CYCLES\n");
index e7d1803..46c557a 100644
@@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = {
        .value_size = sizeof(u64),
        .max_entries = 64,
 };
+struct bpf_map_def SEC("maps") values2 = {
+       .type = BPF_MAP_TYPE_HASH,
+       .key_size = sizeof(int),
+       .value_size = sizeof(struct bpf_perf_event_value),
+       .max_entries = 64,
+};
 
 SEC("kprobe/htab_map_get_next_key")
 int bpf_prog1(struct pt_regs *ctx)
@@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx)
        return 0;
 }
 
+SEC("kprobe/htab_map_lookup_elem")
+int bpf_prog2(struct pt_regs *ctx)
+{
+       u32 key = bpf_get_smp_processor_id();
+       struct bpf_perf_event_value *val, buf;
+       int error;
+
+       error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
+       if (error)
+               return 0;
+
+       val = bpf_map_lookup_elem(&values2, &key);
+       if (val)
+               *val = buf;
+       else
+               bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
+
+       return 0;
+}
+
 char _license[] SEC("license") = "GPL";
 u32 _version SEC("version") = LINUX_VERSION_CODE;
index a05a99a..3341a96 100644
@@ -22,6 +22,7 @@
 
 static void check_on_cpu(int cpu, struct perf_event_attr *attr)
 {
+       struct bpf_perf_event_value value2;
        int pmu_fd, error = 0;
        cpu_set_t set;
        __u64 value;
@@ -46,8 +47,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr)
                fprintf(stderr, "Value missing for CPU %d\n", cpu);
                error = 1;
                goto on_exit;
+       } else {
+               fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+       }
+       /* The above bpf_map_lookup_elem should trigger the second kprobe */
+       if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+               fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+               error = 1;
+               goto on_exit;
+       } else {
+               fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+                       value2.counter, value2.enabled, value2.running);
        }
-       fprintf(stderr, "CPU %d: %llu\n", cpu, value);
 
 on_exit:
        assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
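
The counter/enabled/running triple that bpf_perf_event_read_value() exposes lets a reader scale a multiplexed counter the way perf does. A worked user-space example of that arithmetic (the numbers are made up; the struct simply mirrors the three bpf_perf_event_value fields):

#include <stdint.h>
#include <stdio.h>

struct value {			/* same three fields as bpf_perf_event_value */
	uint64_t counter;
	uint64_t enabled;
	uint64_t running;
};

int main(void)
{
	struct value v = { .counter = 1000, .enabled = 400, .running = 100 };
	double scaled = v.running ? (double)v.counter * v.enabled / v.running : 0;
	double on_pmu = v.enabled ? 100.0 * v.running / v.enabled : 0;

	/* prints: scaled estimate 4000, counted 25% of the enabled time */
	printf("scaled estimate %.0f, counted %.0f%% of the enabled time\n",
	       scaled, on_pmu);
	return 0;
}
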
index 74f3fd8..2fe2f76 100644
@@ -13,23 +13,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = {
        /* TODO: have entries for all possible errno's */
 };
 
+#define XDP_UNKNOWN    XDP_REDIRECT + 1
+struct bpf_map_def SEC("maps") exception_cnt = {
+       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
+       .key_size       = sizeof(u32),
+       .value_size     = sizeof(u64),
+       .max_entries    = XDP_UNKNOWN + 1,
+};
+
 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
  * Code in:                kernel/include/trace/events/xdp.h
  */
 struct xdp_redirect_ctx {
-       unsigned short common_type;     //      offset:0;  size:2; signed:0;
-       unsigned char common_flags;     //      offset:2;  size:1; signed:0;
-       unsigned char common_preempt_count;//   offset:3;  size:1; signed:0;
-       int common_pid;                 //      offset:4;  size:4; signed:1;
-
-       int prog_id;                    //      offset:8;  size:4; signed:1;
-       u32 act;                        //      offset:12  size:4; signed:0;
-       int ifindex;                    //      offset:16  size:4; signed:1;
-       int err;                        //      offset:20  size:4; signed:1;
-       int to_ifindex;                 //      offset:24  size:4; signed:1;
-       u32 map_id;                     //      offset:28  size:4; signed:0;
-       int map_index;                  //      offset:32  size:4; signed:1;
-};                                     //      offset:36
+       u64 __pad;              // First 8 bytes are not accessible by bpf code
+       int prog_id;            //      offset:8;  size:4; signed:1;
+       u32 act;                //      offset:12  size:4; signed:0;
+       int ifindex;            //      offset:16  size:4; signed:1;
+       int err;                //      offset:20  size:4; signed:1;
+       int to_ifindex;         //      offset:24  size:4; signed:1;
+       u32 map_id;             //      offset:28  size:4; signed:0;
+       int map_index;          //      offset:32  size:4; signed:1;
+};                             //      offset:36
 
 enum {
        XDP_REDIRECT_SUCCESS = 0,
@@ -48,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
 
        cnt  = bpf_map_lookup_elem(&redirect_err_cnt, &key);
        if (!cnt)
-               return 0;
+               return 1;
        *cnt += 1;
 
        return 0; /* Indicate event was filtered (no further processing)*/
@@ -86,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
 {
        return xdp_redirect_collect_stat(ctx);
 }
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in:                kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+       u64 __pad;      // First 8 bytes are not accessible by bpf code
+       int prog_id;    //      offset:8;  size:4; signed:1;
+       u32 act;        //      offset:12; size:4; signed:0;
+       int ifindex;    //      offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+       u64 *cnt;
+       u32 key;
+
+       key = ctx->act;
+       if (key > XDP_REDIRECT)
+               key = XDP_UNKNOWN;
+
+       cnt = bpf_map_lookup_elem(&exception_cnt, &key);
+       if (!cnt)
+               return 1;
+       *cnt += 1;
+
+       return 0;
+}
index c5ab8b7..eaba165 100644
@@ -20,6 +20,7 @@ static const char *__doc_err_only__=
 #include <unistd.h>
 #include <locale.h>
 
+#include <sys/resource.h>
 #include <getopt.h>
 #include <net/if.h>
 #include <time.h>
@@ -89,6 +90,23 @@ static const char *err2str(int err)
                return redir_names[err];
        return NULL;
 }
+/* enum xdp_action */
+#define XDP_UNKNOWN    XDP_REDIRECT + 1
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+       [XDP_ABORTED]   = "XDP_ABORTED",
+       [XDP_DROP]      = "XDP_DROP",
+       [XDP_PASS]      = "XDP_PASS",
+       [XDP_TX]        = "XDP_TX",
+       [XDP_REDIRECT]  = "XDP_REDIRECT",
+       [XDP_UNKNOWN]   = "XDP_UNKNOWN",
+};
+static const char *action2str(int action)
+{
+       if (action < XDP_ACTION_MAX)
+               return xdp_action_names[action];
+       return NULL;
+}
 
 struct record {
        __u64 counter;
@@ -97,6 +115,7 @@ struct record {
 
 struct stats_record {
        struct record xdp_redir[REDIR_RES_MAX];
+       struct record xdp_exception[XDP_ACTION_MAX];
 };
 
 static void stats_print_headers(bool err_only)
@@ -104,39 +123,72 @@ static void stats_print_headers(bool err_only)
        if (err_only)
                printf("\n%s\n", __doc_err_only__);
 
-       printf("%-14s %-10s %-18s %-9s\n",
-              "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period");
+       printf("%-14s %-11s %-10s %-18s %-9s\n",
+              "ACTION", "result", "pps ", "pps-human-readable", "measure-period");
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+       double period_ = 0;
+       __u64 period = 0;
+
+       period = r->timestamp - p->timestamp;
+       if (period > 0)
+               period_ = ((double) period / NANOSEC_PER_SEC);
+
+       return period_;
+}
+
+static double calc_pps(struct record *r, struct record *p, double period)
+{
+       __u64 packets = 0;
+       double pps = 0;
+
+       if (period > 0) {
+               packets = r->counter - p->counter;
+               pps = packets / period;
+       }
+       return pps;
 }
 
 static void stats_print(struct stats_record *rec,
                        struct stats_record *prev,
                        bool err_only)
 {
+       double period = 0, pps = 0;
+       struct record *r, *p;
        int i = 0;
 
+       char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n";
+
+       /* tracepoint: xdp:xdp_redirect_* */
        if (err_only)
                i = REDIR_ERROR;
 
        for (; i < REDIR_RES_MAX; i++) {
-               struct record *r = &rec->xdp_redir[i];
-               struct record *p = &prev->xdp_redir[i];
-               __u64 period  = 0;
-               __u64 packets = 0;
-               double pps = 0;
-               double period_ = 0;
+               r = &rec->xdp_redir[i];
+               p = &prev->xdp_redir[i];
 
                if (p->timestamp) {
-                       packets = r->counter - p->counter;
-                       period  = r->timestamp - p->timestamp;
-                       if (period > 0) {
-                               period_ = ((double) period / NANOSEC_PER_SEC);
-                               pps = packets / period_;
-                       }
+                       period = calc_period(r, p);
+                       pps = calc_pps(r, p, period);
                }
+               printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period);
+       }
 
-               printf("%-14s %-10.0f %'-18.0f %f\n",
-                      err2str(i), pps, pps, period_);
+       /* tracepoint: xdp:xdp_exception */
+       for (i = 0; i < XDP_ACTION_MAX; i++) {
+               r = &rec->xdp_exception[i];
+               p = &prev->xdp_exception[i];
+               if (p->timestamp) {
+                       period = calc_period(r, p);
+                       pps = calc_pps(r, p, period);
+               }
+               if (pps > 0)
+                       printf(fmt, action2str(i), "Exception",
+                              pps, pps, period);
        }
+       printf("\n");
 }
 
 static __u64 get_key32_value64_percpu(int fd, __u32 key)
@@ -160,25 +212,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key)
        return sum;
 }
 
-static bool stats_collect(int fd, struct stats_record *rec)
+static bool stats_collect(struct stats_record *rec)
 {
+       int fd;
        int i;
 
        /* TODO: Detect if someone unloaded the perf event_fd's, as
         * this can happen by someone running perf-record -e
         */
 
+       fd = map_data[0].fd; /* map0: redirect_err_cnt */
        for (i = 0; i < REDIR_RES_MAX; i++) {
                rec->xdp_redir[i].timestamp = gettime();
                rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
        }
+
+       fd = map_data[1].fd; /* map1: exception_cnt */
+       for (i = 0; i < XDP_ACTION_MAX; i++) {
+               rec->xdp_exception[i].timestamp = gettime();
+               rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i);
+       }
+
        return true;
 }
 
 static void stats_poll(int interval, bool err_only)
 {
        struct stats_record rec, prev;
-       int map_fd;
 
        memset(&rec, 0, sizeof(rec));
 
@@ -190,16 +250,17 @@ static void stats_poll(int interval, bool err_only)
                printf("\n%s", __doc__);
 
        /* TODO Need more advanced stats on error types */
-       if (verbose)
-               printf(" - Stats map: %s\n", map_data[0].name);
-       map_fd = map_data[0].fd;
-
-       stats_print_headers(err_only);
+       if (verbose) {
+               printf(" - Stats map0: %s\n", map_data[0].name);
+               printf(" - Stats map1: %s\n", map_data[1].name);
+               printf("\n");
+       }
        fflush(stdout);
 
        while (1) {
                memcpy(&prev, &rec, sizeof(rec));
-               stats_collect(map_fd, &rec);
+               stats_collect(&rec);
+               stats_print_headers(err_only);
                stats_print(&rec, &prev, err_only);
                fflush(stdout);
                sleep(interval);
@@ -235,6 +296,7 @@ static void print_bpf_prog_info(void)
 
 int main(int argc, char **argv)
 {
+       struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
        int longindex = 0, opt;
        int ret = EXIT_SUCCESS;
        char bpf_obj_file[256];
@@ -265,13 +327,18 @@ int main(int argc, char **argv)
                }
        }
 
+       if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+               perror("setrlimit(RLIMIT_MEMLOCK)");
+               return EXIT_FAILURE;
+       }
+
        if (load_bpf_file(bpf_obj_file)) {
                printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
-               return 1;
+               return EXIT_FAILURE;
        }
        if (!prog_fd[0]) {
                printf("ERROR - load_bpf_file: %s\n", strerror(errno));
-               return 1;
+               return EXIT_FAILURE;
        }
 
        if (debug) {
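
calc_period() and calc_pps() above factor out the rate arithmetic now shared by the redirect and exception tables: pps = (counter_now - counter_prev) / ((ts_now - ts_prev) / NANOSEC_PER_SEC). A worked example with made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define NANOSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t c_prev = 1000, c_now = 61000;			/* packet counters */
	uint64_t t_prev = 0, t_now = 2 * NANOSEC_PER_SEC;	/* timestamps */
	double period = (double)(t_now - t_prev) / NANOSEC_PER_SEC;
	double pps = period > 0 ? (double)(c_now - c_prev) / period : 0;

	printf("period %.2fs, %.0f pps\n", period, pps);	/* 2.00s, 30000 pps */
	return 0;
}
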
index 57fc4b9..04d12f7 100644
@@ -11,7 +11,7 @@ SYNOPSIS
 ========
 
 |      **bpftool** prog show [*PROG*]
-|      **bpftool** prog dump xlated *PROG*  file *FILE*
+|      **bpftool** prog dump xlated *PROG* [file *FILE*] [opcodes]
 |      **bpftool** prog dump jited  *PROG* [file *FILE*] [opcodes]
 |      **bpftool** prog pin *PROG* *FILE*
 |      **bpftool** prog help
@@ -28,9 +28,12 @@ DESCRIPTION
                  Output will start with program ID followed by program type and
                  zero or more named attributes (depending on kernel version).
 
-       **bpftool prog dump xlated** *PROG*  **file** *FILE*
-                 Dump eBPF instructions of the program from the kernel to a
-                 file.
+       **bpftool prog dump xlated** *PROG* [**file** *FILE*] [**opcodes**]
+                 Dump eBPF instructions of the program from the kernel.
+                 If *FILE* is specified image will be written to a file,
+                 If *FILE* is specified, the image will be written to a file;
+                 otherwise it is disassembled and printed to stdout.
+
+                 **opcodes** controls whether raw opcodes will be printed.
        **bpftool prog dump jited**  *PROG* [**file** *FILE*] [**opcodes**]
                  Dump jited image (host machine code) of the program.
index 8705ee4..4f33982 100644
@@ -51,7 +51,7 @@ CC = gcc
 
 CFLAGS += -O2
 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow
-CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf
+CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/
 LIBS = -lelf -lbfd -lopcodes $(LIBBPF)
 
 include $(wildcard *.d)
@@ -59,7 +59,10 @@ include $(wildcard *.d)
 all: $(OUTPUT)bpftool
 
 SRCS=$(wildcard *.c)
-OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS))
+OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
+
+$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
+       $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
 $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
        $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
index 85d2d78..8e809b2 100644
 #ifndef __BPF_TOOL_H
 #define __BPF_TOOL_H
 
+/* BFD and kernel.h both define GCC_VERSION, differently */
+#undef GCC_VERSION
 #include <stdbool.h>
 #include <stdio.h>
 #include <linux/bpf.h>
-
-#define ARRAY_SIZE(a)  (sizeof(a) / sizeof(a[0]))
+#include <linux/kernel.h>
 
 #define err(msg...)    fprintf(stderr, "Error: " msg)
 #define warn(msg...)   fprintf(stderr, "Warning: " msg)
 
 #define ptr_to_u64(ptr)        ((__u64)(unsigned long)(ptr))
 
-#define min(a, b)                                                      \
-       ({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _b : _a; })
-#define max(a, b)                                                      \
-       ({ typeof(a) _a = (a); typeof(b) _b = (b); _a < _b ? _b : _a; })
-
 #define NEXT_ARG()     ({ argc--; argv++; if (argc < 0) usage(); })
 #define NEXT_ARGP()    ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); })
 #define BAD_ARG()      ({ err("what is '%s'?\n", *argv); -1; })
index 421ba89..9e2681c 100644
@@ -35,6 +35,7 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -46,6 +47,7 @@
 #include <bpf.h>
 
 #include "main.h"
+#include "disasm.h"
 
 static const char * const prog_type_name[] = {
        [BPF_PROG_TYPE_UNSPEC]          = "unspec",
@@ -297,11 +299,39 @@ static int do_show(int argc, char **argv)
        return 0;
 }
 
+static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       vprintf(fmt, args);
+       va_end(args);
+}
+
+static void dump_xlated(void *buf, unsigned int len, bool opcodes)
+{
+       struct bpf_insn *insn = buf;
+       unsigned int i;
+
+       for (i = 0; i < len / sizeof(*insn); i++) {
+               printf("% 4d: ", i);
+               print_bpf_insn(print_insn, NULL, insn + i, true);
+
+               if (opcodes) {
+                       printf("       ");
+                       print_hex(insn + i, 8, " ");
+                       printf("\n");
+               }
+
+               if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW))
+                       i++;
+       }
+}
+
 static int do_dump(int argc, char **argv)
 {
        struct bpf_prog_info info = {};
        __u32 len = sizeof(info);
-       bool can_disasm = false;
        unsigned int buf_size;
        char *filepath = NULL;
        bool opcodes = false;
@@ -315,7 +345,6 @@ static int do_dump(int argc, char **argv)
        if (is_prefix(*argv, "jited")) {
                member_len = &info.jited_prog_len;
                member_ptr = &info.jited_prog_insns;
-               can_disasm = true;
        } else if (is_prefix(*argv, "xlated")) {
                member_len = &info.xlated_prog_len;
                member_ptr = &info.xlated_prog_insns;
@@ -346,10 +375,6 @@ static int do_dump(int argc, char **argv)
                NEXT_ARG();
        }
 
-       if (!filepath && !can_disasm) {
-               err("expected 'file' got %s\n", *argv);
-               return -1;
-       }
        if (argc) {
                usage();
                return -1;
@@ -409,7 +434,10 @@ static int do_dump(int argc, char **argv)
                        goto err_free;
                }
        } else {
-               disasm_print_insn(buf, *member_len, opcodes);
+               if (member_len == &info.jited_prog_len)
+                       disasm_print_insn(buf, *member_len, opcodes);
+               else
+                       dump_xlated(buf, *member_len, opcodes);
        }
 
        free(buf);
@@ -430,7 +458,7 @@ static int do_help(int argc, char **argv)
 {
        fprintf(stderr,
                "Usage: %s %s show [PROG]\n"
-               "       %s %s dump xlated PROG  file FILE\n"
+               "       %s %s dump xlated PROG [file FILE] [opcodes]\n"
                "       %s %s dump jited  PROG [file FILE] [opcodes]\n"
                "       %s %s pin   PROG FILE\n"
                "       %s %s help\n"
index cb2b9f9..fb4fb81 100644
@@ -230,7 +230,7 @@ union bpf_attr {
                __u32   numa_node;      /* numa node (effective only if
                                         * BPF_F_NUMA_NODE is set).
                                         */
-               __u8    map_name[BPF_OBJ_NAME_LEN];
+               char    map_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -253,7 +253,7 @@ union bpf_attr {
                __aligned_u64   log_buf;        /* user supplied buffer */
                __u32           kern_version;   /* checked when prog_type=kprobe */
                __u32           prog_flags;
-               __u8            prog_name[BPF_OBJ_NAME_LEN];
+               char            prog_name[BPF_OBJ_NAME_LEN];
        };
 
        struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -697,7 +697,9 @@ union bpf_attr {
        FN(redirect_map),               \
        FN(sk_redirect_map),            \
        FN(sock_map_update),            \
-       FN(xdp_adjust_meta),
+       FN(xdp_adjust_meta),            \
+       FN(perf_event_read_value),      \
+       FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -869,7 +871,7 @@ struct bpf_prog_info {
        __u32 created_by_uid;
        __u32 nr_map_ids;
        __aligned_u64 map_ids;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
@@ -879,7 +881,7 @@ struct bpf_map_info {
        __u32 value_size;
        __u32 max_entries;
        __u32 map_flags;
-       __u8  name[BPF_OBJ_NAME_LEN];
+       char  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
index 924af8d..2e7880e 100644
@@ -12,7 +12,7 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i
 LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
-       test_align
+       test_align test_verifier_log
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
        test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
index a56053d..e25dbf6 100644
@@ -72,6 +72,12 @@ static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
 static int (*bpf_sock_map_update)(void *map, void *key, void *value,
                                  unsigned long long flags) =
        (void *) BPF_FUNC_sock_map_update;
+static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
+                                       void *buf, unsigned int buf_size) =
+       (void *) BPF_FUNC_perf_event_read_value;
+static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
+                                      unsigned int buf_size) =
+       (void *) BPF_FUNC_perf_prog_read_value;
 
 
 /* llvm builtin functions that eBPF C program may use to
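(For the two helpers just declared, here is an illustrative sketch of how a BPF C program might call bpf_perf_event_read_value against a perf-event array, assuming the struct bpf_perf_event_value layout (counter/enabled/running) introduced by the companion uapi change. The map name "counters", its sizing, and the kprobe attach point are illustrative only; SEC() and struct bpf_map_def come from this bpf_helpers.h.)

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    struct bpf_map_def SEC("maps") counters = {
            .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
            .key_size = sizeof(int),
            .value_size = sizeof(__u32),
            .max_entries = 64,
    };

    SEC("kprobe/sys_write")
    int read_counter(void *ctx)
    {
            struct bpf_perf_event_value val = {};

            /* read the counter bound to the current CPU, plus enabled/running times */
            if (bpf_perf_event_read_value(&counters, BPF_F_CURRENT_CPU,
                                          &val, sizeof(val)))
                    return 0;

            /* val.counter, val.enabled and val.running are now usable */
            return 0;
    }

    char _license[] SEC("license") = "GPL";
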
diff --git a/tools/testing/selftests/bpf/test_verifier_log.c b/tools/testing/selftests/bpf/test_verifier_log.c
new file mode 100644 (file)
index 0000000..3cc0b56
--- /dev/null
@@ -0,0 +1,171 @@
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/unistd.h>
+
+#include <bpf/bpf.h>
+
+#define LOG_SIZE (1 << 20)
+
+#define err(str...)    printf("ERROR: " str)
+
+static const struct bpf_insn code_sample[] = {
+       /* We need a few instructions to pass the min log length */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+                    BPF_FUNC_map_lookup_elem),
+       BPF_EXIT_INSN(),
+};
+
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+       return (__u64) (unsigned long) ptr;
+}
+
+static int load(char *log, size_t log_len, int log_level)
+{
+       union bpf_attr attr;
+
+       bzero(&attr, sizeof(attr));
+       attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+       attr.insn_cnt = (__u32)(sizeof(code_sample) / sizeof(struct bpf_insn));
+       attr.insns = ptr_to_u64(code_sample);
+       attr.license = ptr_to_u64("GPL");
+       attr.log_buf = ptr_to_u64(log);
+       attr.log_size = log_len;
+       attr.log_level = log_level;
+
+       return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
+static void check_ret(int ret, int exp_errno)
+{
+       if (ret > 0) {
+               close(ret);
+               err("broken sample loaded successfully!?\n");
+               exit(1);
+       }
+
+       if (!ret || errno != exp_errno) {
+               err("Program load returned: ret:%d/errno:%d, expected ret:%d/errno:%d\n",
+                   ret, errno, -1, exp_errno);
+               exit(1);
+       }
+}
+
+static void check_ones(const char *buf, size_t len, const char *msg)
+{
+       while (len--)
+               if (buf[len] != 1) {
+                       err("%s", msg);
+                       exit(1);
+               }
+}
+
+static void test_log_good(char *log, size_t buf_len, size_t log_len,
+                         size_t exp_len, int exp_errno, const char *full_log)
+{
+       size_t len;
+       int ret;
+
+       memset(log, 1, buf_len);
+
+       ret = load(log, log_len, 1);
+       check_ret(ret, exp_errno);
+
+       len = strnlen(log, buf_len);
+       if (len == buf_len) {
+               err("verifier did not NULL terminate the log\n");
+               exit(1);
+       }
+       if (exp_len && len != exp_len) {
+               err("incorrect log length expected:%zd have:%zd\n",
+                   exp_len, len);
+               exit(1);
+       }
+
+       if (strchr(log, 1)) {
+               err("verifier leaked a byte through\n");
+               exit(1);
+       }
+
+       check_ones(log + len + 1, buf_len - len - 1,
+                  "verifier wrote bytes past NULL termination\n");
+
+       if (memcmp(full_log, log, LOG_SIZE)) {
+               err("log did not match expected output\n");
+               exit(1);
+       }
+}
+
+static void test_log_bad(char *log, size_t log_len, int log_level)
+{
+       int ret;
+
+       ret = load(log, log_len, log_level);
+       check_ret(ret, EINVAL);
+       if (log)
+               check_ones(log, LOG_SIZE,
+                          "verifier touched log with bad parameters\n");
+}
+
+int main(int argc, char **argv)
+{
+       char full_log[LOG_SIZE];
+       char log[LOG_SIZE];
+       size_t want_len;
+       int i;
+
+       memset(log, 1, LOG_SIZE);
+
+       /* Test incorrect attr */
+       printf("Test log_level 0...\n");
+       test_log_bad(log, LOG_SIZE, 0);
+
+       printf("Test log_size < 128...\n");
+       test_log_bad(log, 15, 1);
+
+       printf("Test log_buff = NULL...\n");
+       test_log_bad(NULL, LOG_SIZE, 1);
+
+       /* Test with log big enough */
+       printf("Test oversized buffer...\n");
+       test_log_good(full_log, LOG_SIZE, LOG_SIZE, 0, EACCES, full_log);
+
+       want_len = strlen(full_log);
+
+       printf("Test exact buffer...\n");
+       test_log_good(log, LOG_SIZE, want_len + 2, want_len, EACCES, full_log);
+
+       printf("Test undersized buffers...\n");
+       for (i = 0; i < 64; i++) {
+               full_log[want_len - i + 1] = 1;
+               full_log[want_len - i] = 0;
+
+               test_log_good(log, LOG_SIZE, want_len + 1 - i, want_len - i,
+                             ENOSPC, full_log);
+       }
+
+       printf("test_verifier_log: OK\n");
+       return 0;
+}
index e8c86c4..a8a8cdf 100755 (executable)
@@ -37,6 +37,26 @@ kci_del_dummy()
        check_err $?
 }
 
+kci_test_netconf()
+{
+       dev="$1"
+       r=$ret
+
+       ip netconf show dev "$dev" > /dev/null
+       check_err $?
+
+       for f in 4 6; do
+               ip -$f netconf show dev "$dev" > /dev/null
+               check_err $?
+       done
+
+       if [ $ret -ne 0 ] ;then
+               echo "FAIL: ip netconf show $dev"
+               test $r -eq 0 && ret=0
+               return 1
+       fi
+}
+
 # add a bridge with vlans on top
 kci_test_bridge()
 {
@@ -63,6 +83,11 @@ kci_test_bridge()
        check_err $?
        ip r s t all > /dev/null
        check_err $?
+
+       for name in "$devbr" "$vlandev" "$devdummy" ; do
+               kci_test_netconf "$name"
+       done
+
        ip -6 addr del dev "$vlandev" dead:42::1234/64
        check_err $?
 
@@ -100,6 +125,9 @@ kci_test_gre()
        check_err $?
        ip addr > /dev/null
        check_err $?
+
+       kci_test_netconf "$gredev"
+
        ip addr del dev "$devdummy" 10.23.7.11/24
        check_err $?
 
index 00f2866..dd4162f 100644 (file)
@@ -341,7 +341,7 @@ int main(int argc, char **argv)
                        return 0;
                case 'n':
                        t = atoi(optarg);
-                       if (t > ARRAY_SIZE(test_cases))
+                       if (t >= ARRAY_SIZE(test_cases))
                                error(1, 0, "Invalid test case: %d", t);
                        all_tests = false;
                        test_cases[t].enabled = true;