Merge tag 'drm-fixes-2021-07-16' of git://anongit.freedesktop.org/drm/drm
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 16 Jul 2021 18:14:54 +0000 (11:14 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 16 Jul 2021 18:14:54 +0000 (11:14 -0700)
Pull drm fixes from Dave Airlie:
 "Regular rc2 fixes though a bit more than usual at rc2 stage, people
  must have been testing early or else some fixes from last week got a
  bit laggy.

  There is one larger change in the amd fixes to amalgamate some power
  management code on the newer chips with the code from the older chips,
  it should only affect chips where support was introduced in rc1 and
  it should make future fixes easier to maintain, so it's probably a
  good idea to merge it now.

  Otherwise it's mostly fixes across the board.

  dma-buf:
   - Fix fence leak in sync_file_merge() error code

  drm/panel:
   - nt35510: Don't fail on DSI reads

  fbdev:
   - Avoid use-after-free by not deleting current video mode

  ttm:
   - Avoid NULL-ptr deref in ttm_range_man_fini()

  vmwgfx:
   - Fix a merge commit

  qxl:
   - fix a TTM regression

  amdgpu:
   - SR-IOV fixes
   - RAS fixes
   - eDP fixes
   - SMU13 code unification to facilitate fixes in the future
   - Add new renoir DID
   - Yellow Carp fixes
   - Beige Goby fixes
   - Revert a bunch of TLB fixes that caused regressions
   - Revert an LTTPR display regression

  amdkfd
   - Fix VRAM access regression
   - SVM fixes

  i915:
   - Fix -EDEADLK handling regression
   - Drop the page table optimisation"

* tag 'drm-fixes-2021-07-16' of git://anongit.freedesktop.org/drm/drm: (29 commits)
  drm/amdgpu: add another Renoir DID
  drm/ttm: add a check against null pointer dereference
  drm/i915/gtt: drop the page table optimisation
  drm/i915/gt: Fix -EDEADLK handling regression
  drm/amd/pm: Add waiting for response of mode-reset message for yellow carp
  Revert "drm/amdkfd: Add heavy-weight TLB flush after unmapping"
  Revert "drm/amdgpu: Add table_freed parameter to amdgpu_vm_bo_update"
  Revert "drm/amdkfd: Make TLB flush conditional on mapping"
  Revert "drm/amdgpu: Fix warning of Function parameter or member not described"
  Revert "drm/amdkfd: Add memory sync before TLB flush on unmap"
  drm/amd/pm: Fix BACO state setting for Beige_Goby
  drm/amdgpu: Restore msix after FLR
  drm/amdkfd: Allow CPU access for all VRAM BOs
  drm/amdgpu/display - only update eDP's backlight level when necessary
  drm/amdkfd: handle fault counters on invalid address
  drm/amdgpu: Correct the irq numbers for virtual crtc
  drm/amd/display: update header file name
  drm/amd/pm: drop smu_v13_0_1.c|h files for yellow carp
  drm/amd/display: remove faulty assert
  Revert "drm/amd/display: Always write repeater mode regardless of LTTPR"
  ...

289 files changed:
Documentation/ABI/testing/sysfs-ptp
Documentation/devicetree/bindings/net/gpmc-eth.txt
Documentation/devicetree/bindings/net/smsc,lan9115.yaml [new file with mode: 0644]
Documentation/devicetree/bindings/net/smsc911x.txt [deleted file]
Documentation/networking/ethtool-netlink.rst
Documentation/networking/nf_conntrack-sysctl.rst
Documentation/networking/tipc.rst
MAINTAINERS
Makefile
arch/arm/boot/dts/qcom-apq8060-dragonboard.dts
arch/mips/include/asm/fpu.h
arch/mips/mm/tlbex.c
arch/powerpc/platforms/powermac/smp.c
arch/s390/kernel/uprobes.c
arch/x86/kvm/cpuid.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/paging.h [new file with mode: 0644]
arch/x86/kvm/mmu/paging_tmpl.h
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/svm/nested.c
arch/x86/kvm/svm/sev.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/svm/svm.h
arch/x86/kvm/vmx/vmx.h
arch/x86/kvm/x86.c
arch/x86/net/bpf_jit_comp.c
drivers/char/powernv-op-panel.c
drivers/cpufreq/longhaul.c
drivers/dma/ipu/ipu_idmac.c
drivers/dma/mpc512x_dma.c
drivers/dma/ti/k3-udma.c
drivers/edac/Kconfig
drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
drivers/gpu/drm/msm/msm_gem.c
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
drivers/iommu/arm/arm-smmu/qcom_iommu.c
drivers/iommu/intel/iommu.c
drivers/iommu/rockchip-iommu.c
drivers/mmc/host/jz4740_mmc.c
drivers/mtd/chips/cfi_util.c
drivers/net/bonding/bond_main.c
drivers/net/caif/Kconfig
drivers/net/caif/Makefile
drivers/net/caif/caif_hsi.c [deleted file]
drivers/net/dsa/microchip/ksz_common.c
drivers/net/dsa/mv88e6xxx/chip.c
drivers/net/dsa/mv88e6xxx/serdes.c
drivers/net/dsa/sja1105/sja1105_main.c
drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
drivers/net/ethernet/broadcom/genet/bcmgenet.c
drivers/net/ethernet/broadcom/genet/bcmgenet_wol.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
drivers/net/ethernet/google/gve/gve_main.c
drivers/net/ethernet/google/gve/gve_rx_dqo.c
drivers/net/ethernet/ibm/ibmvnic.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/fm10k/fm10k_pci.c
drivers/net/ethernet/intel/iavf/iavf_main.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/igc/igc.h
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/intel/ixgbevf/ipsec.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/marvell/octeontx2/af/cgx.c
drivers/net/ethernet/marvell/octeontx2/af/cgx.h
drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
drivers/net/ethernet/marvell/octeontx2/af/mbox.h
drivers/net/ethernet/marvell/octeontx2/af/rvu.c
drivers/net/ethernet/marvell/octeontx2/af/rvu.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_reg.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_struct.h
drivers/net/ethernet/marvell/octeontx2/nic/Makefile
drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c
drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h
drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c [new file with mode: 0644]
drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
drivers/net/ethernet/microchip/sparx5/Kconfig
drivers/net/ethernet/moxa/moxart_ether.c
drivers/net/ethernet/mscc/ocelot_net.c
drivers/net/ethernet/netronome/nfp/flower/conntrack.c
drivers/net/ethernet/qualcomm/emac/emac.c
drivers/net/ethernet/sfc/efx_channels.c
drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c
drivers/net/ethernet/stmicro/stmmac/stmmac.h
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
drivers/net/ethernet/ti/tlan.c
drivers/net/fddi/defza.c
drivers/net/netdevsim/ipsec.c
drivers/net/phy/marvell10g.c
drivers/net/usb/asix_devices.c
drivers/net/virtio_net.c
drivers/net/vmxnet3/vmxnet3_ethtool.c
drivers/net/wan/hdlc_cisco.c
drivers/net/wan/hdlc_fr.c
drivers/net/wan/hdlc_ppp.c
drivers/net/wan/hdlc_raw.c
drivers/net/wan/hdlc_raw_eth.c
drivers/net/wan/hdlc_x25.c
drivers/net/wireless/mediatek/mt76/mt7921/main.c
drivers/net/wireless/mediatek/mt76/mt7921/mcu.c
drivers/net/wwan/iosm/iosm_ipc_imem_ops.c
drivers/net/wwan/iosm/iosm_ipc_imem_ops.h
drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
drivers/net/wwan/iosm/iosm_ipc_uevent.c
drivers/net/wwan/iosm/iosm_ipc_wwan.c
drivers/pci/proc.c
drivers/power/supply/ab8500_fg.c
drivers/power/supply/abx500_chargalg.c
drivers/ptp/Makefile
drivers/ptp/ptp_clock.c
drivers/ptp/ptp_private.h
drivers/ptp/ptp_sysfs.c
drivers/ptp/ptp_vclock.c [new file with mode: 0644]
drivers/pwm/pwm-berlin.c
drivers/pwm/pwm-ep93xx.c
drivers/pwm/pwm-spear.c
drivers/pwm/pwm-sprd.c
drivers/pwm/pwm-tiecap.c
drivers/s390/char/tape_char.c
drivers/s390/net/ctcm_fsms.c
drivers/s390/net/qeth_l3_main.c
drivers/scsi/libsas/sas_discover.c
drivers/scsi/sd.c
drivers/usb/gadget/udc/fsl_qe_udc.c
drivers/video/fbdev/xilinxfb.c
fs/btrfs/block-group.c
fs/btrfs/block-group.h
fs/btrfs/ctree.c
fs/btrfs/inode.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/configfs/file.c
fs/fcntl.c
fs/fs_context.c
fs/hfs/bfind.c
fs/hfs/bnode.c
fs/hfs/btree.h
fs/hfs/super.c
fs/vboxsf/dir.c
fs/vboxsf/file.c
fs/vboxsf/vfsmod.h
fs/xfs/libxfs/xfs_attr.c
include/linux/bpf.h
include/linux/ethtool.h
include/linux/fs_context.h
include/linux/kasan.h
include/linux/marvell_phy.h
include/linux/migrate.h
include/linux/mm.h
include/linux/ptp_clock_kernel.h
include/linux/rmap.h
include/linux/stmmac.h
include/math-emu/op-common.h
include/net/bonding.h
include/net/busy_poll.h
include/net/caif/caif_hsi.h [deleted file]
include/net/dst_metadata.h
include/net/ip6_route.h
include/net/mptcp.h
include/net/netfilter/nf_conntrack_core.h
include/net/netns/conntrack.h
include/net/sctp/constants.h
include/net/sock.h
include/net/tcp.h
include/uapi/linux/ethtool_netlink.h
include/uapi/linux/net_tstamp.h
include/uapi/linux/netfilter/nfnetlink_log.h
include/uapi/linux/netfilter/nfnetlink_queue.h
kernel/bpf/core.c
kernel/bpf/devmap.c
kernel/bpf/verifier.c
kernel/cgroup/cgroup-v1.c
kernel/debug/gdbstub.c
kernel/rcu/refscale.c
kernel/rcu/tasks.h
kernel/rcu/tree_stall.h
kernel/scftorture.c
lib/test_hmm.c
mm/hugetlb.c
mm/kasan/kasan.h
mm/migrate.c
mm/page_alloc.c
mm/rmap.c
mm/slab.h
mm/slub.c
mm/util.c
net/802/garp.c
net/802/mrp.c
net/bridge/br_if.c
net/bridge/br_multicast.c
net/core/dev.c
net/core/skbuff.c
net/core/sock.c
net/dsa/switch.c
net/ethtool/Makefile
net/ethtool/common.c
net/ethtool/netlink.c
net/ethtool/netlink.h
net/ethtool/phc_vclocks.c [new file with mode: 0644]
net/ipv4/fib_frontend.c
net/ipv4/inet_diag.c
net/ipv4/ip_tunnel.c
net/ipv4/ipmr.c
net/ipv4/raw_diag.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_output.c
net/ipv4/udp.c
net/ipv4/udp_diag.c
net/ipv4/udp_offload.c
net/ipv6/ip6_output.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c
net/ipv6/xfrm6_output.c
net/iucv/iucv.c
net/mptcp/mib.c
net/mptcp/mib.h
net/mptcp/mptcp_diag.c
net/mptcp/options.c
net/mptcp/protocol.c
net/mptcp/protocol.h
net/mptcp/sockopt.c
net/mptcp/subflow.c
net/mptcp/syncookies.c
net/ncsi/Kconfig
net/ncsi/internal.h
net/ncsi/ncsi-manage.c
net/ncsi/ncsi-rsp.c
net/netfilter/nf_conntrack_core.c
net/netfilter/nf_conntrack_netlink.c
net/netfilter/nf_conntrack_proto.c
net/netfilter/nf_conntrack_proto_gre.c
net/netfilter/nf_conntrack_proto_tcp.c
net/netfilter/nf_conntrack_standalone.c
net/netfilter/nf_tables_api.c
net/netfilter/nft_last.c
net/netlink/af_netlink.c
net/openvswitch/flow_table.c
net/sched/act_ct.c
net/sched/sch_taprio.c
net/sctp/diag.c
net/sctp/protocol.c
net/sctp/sm_make_chunk.c
net/sctp/transport.c
net/socket.c
net/unix/diag.c
samples/bpf/Makefile
samples/bpf/xdpsock_user.c
sound/soc/mediatek/mt8183/mt8183-dai-adda.c
tools/bpf/Makefile
tools/bpf/bpftool/jit_disasm.c
tools/bpf/runqslower/runqslower.bpf.c
tools/lib/bpf/libbpf.c
tools/testing/selftests/bpf/prog_tests/tailcalls.c
tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
tools/testing/selftests/kvm/include/kvm_util.h
tools/testing/selftests/kvm/lib/aarch64/processor.c
tools/testing/selftests/kvm/lib/guest_modes.c
tools/testing/selftests/kvm/lib/kvm_util.c
tools/testing/selftests/kvm/set_memory_region_test.c
tools/testing/selftests/kvm/x86_64/hyperv_features.c
tools/testing/selftests/kvm/x86_64/mmu_role_test.c
tools/testing/selftests/kvm/x86_64/smm_test.c
tools/testing/selftests/net/icmp_redirect.sh
tools/testing/selftests/net/mptcp/mptcp_join.sh
tools/testing/selftests/net/timestamping.c
tools/testing/selftests/netfilter/Makefile
tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh [new file with mode: 0755]
virt/kvm/coalesced_mmio.c
virt/kvm/kvm_main.c

index 2363ad8..d378f57 100644 (file)
@@ -33,6 +33,13 @@ Description:
                frequency adjustment value (a positive integer) in
                parts per billion.
 
+What:          /sys/class/ptp/ptpN/max_vclocks
+Date:          May 2021
+Contact:       Yangbo Lu <yangbo.lu@nxp.com>
+Description:
+               This file contains the maximum number of ptp vclocks.
+               Write integer to re-configure it.
+
 What:          /sys/class/ptp/ptpN/n_alarms
 Date:          September 2010
 Contact:       Richard Cochran <richardcochran@gmail.com>
@@ -61,6 +68,19 @@ Description:
                This file contains the number of programmable pins
                offered by the PTP hardware clock.
 
+What:          /sys/class/ptp/ptpN/n_vclocks
+Date:          May 2021
+Contact:       Yangbo Lu <yangbo.lu@nxp.com>
+Description:
+               This file contains the number of virtual PTP clocks in
+               use.  By default, the value is 0 meaning that only the
+               physical clock is in use.  Setting the value creates
+               the corresponding number of virtual clocks and causes
+               the physical clock to become free running.  Setting the
+               value back to 0 deletes the virtual clocks and
+               switches the physical clock back to normal, adjustable
+               operation.
+
 What:          /sys/class/ptp/ptpN/pins
 Date:          March 2014
 Contact:       Richard Cochran <richardcochran@gmail.com>
index f7da3d7..3282106 100644 (file)
@@ -13,7 +13,7 @@ Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
 
 For the properties relevant to the ethernet controller connected to the GPMC
 refer to the binding documentation of the device. For example, the documentation
-for the SMSC 911x is Documentation/devicetree/bindings/net/smsc911x.txt
+for the SMSC 911x is Documentation/devicetree/bindings/net/smsc,lan9115.yaml
 
 Child nodes need to specify the GPMC bus address width using the "bank-width"
 property but is possible that an ethernet controller also has a property to
diff --git a/Documentation/devicetree/bindings/net/smsc,lan9115.yaml b/Documentation/devicetree/bindings/net/smsc,lan9115.yaml
new file mode 100644 (file)
index 0000000..f86667c
--- /dev/null
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/smsc,lan9115.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Smart Mixed-Signal Connectivity (SMSC) LAN911x/912x Controller
+
+maintainers:
+  - Shawn Guo <shawnguo@kernel.org>
+
+allOf:
+  - $ref: ethernet-controller.yaml#
+
+properties:
+  compatible:
+    oneOf:
+      - const: smsc,lan9115
+      - items:
+          - enum:
+              - smsc,lan89218
+              - smsc,lan9117
+              - smsc,lan9118
+              - smsc,lan9220
+              - smsc,lan9221
+          - const: smsc,lan9115
+
+  reg:
+    maxItems: 1
+
+  reg-shift: true
+
+  reg-io-width:
+    enum: [ 2, 4 ]
+    default: 2
+
+  interrupts:
+    minItems: 1
+    items:
+      - description:
+          LAN interrupt line
+      - description:
+          Optional PME (power management event) interrupt that is able to wake
+          up the host system with a 50ms pulse on network activity
+
+  clocks:
+    maxItems: 1
+
+  phy-mode: true
+
+  smsc,irq-active-high:
+    type: boolean
+    description: Indicates the IRQ polarity is active-high
+
+  smsc,irq-push-pull:
+    type: boolean
+    description: Indicates the IRQ type is push-pull
+
+  smsc,force-internal-phy:
+    type: boolean
+    description: Forces SMSC LAN controller to use internal PHY
+
+  smsc,force-external-phy:
+    type: boolean
+    description: Forces SMSC LAN controller to use external PHY
+
+  smsc,save-mac-address:
+    type: boolean
+    description:
+      Indicates that MAC address needs to be saved before resetting the
+      controller
+
+  reset-gpios:
+    maxItems: 1
+    description:
+      A GPIO line connected to the RESET (active low) signal of the device.
+      On many systems this is wired high so the device goes out of reset at
+      power-on, but if it is under program control, this optional GPIO can
+      wake up in response to it.
+
+  vdd33a-supply:
+    description: 3.3V analog power supply
+
+  vddvario-supply:
+    description: IO logic power supply
+
+required:
+  - compatible
+  - reg
+  - interrupts
+
+# There are lots of bus-specific properties ("qcom,*", "samsung,*", "fsl,*",
+# "gpmc,*", ...) to be found, that actually depend on the compatible value of
+# the parent node.
+additionalProperties: true
+
+examples:
+  - |
+    #include <dt-bindings/gpio/gpio.h>
+
+    ethernet@f4000000 {
+            compatible = "smsc,lan9220", "smsc,lan9115";
+            reg = <0xf4000000 0x2000000>;
+            phy-mode = "mii";
+            interrupt-parent = <&gpio1>;
+            interrupts = <31>, <32>;
+            reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
+            reg-io-width = <4>;
+            smsc,irq-push-pull;
+    };
diff --git a/Documentation/devicetree/bindings/net/smsc911x.txt b/Documentation/devicetree/bindings/net/smsc911x.txt
deleted file mode 100644 (file)
index acfafc8..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-* Smart Mixed-Signal Connectivity (SMSC) LAN911x/912x Controller
-
-Required properties:
-- compatible : Should be "smsc,lan<model>", "smsc,lan9115"
-- reg : Address and length of the io space for SMSC LAN
-- interrupts : one or two interrupt specifiers
-  - The first interrupt is the SMSC LAN interrupt line
-  - The second interrupt (if present) is the PME (power
-    management event) interrupt that is able to wake up the host
-     system with a 50ms pulse on network activity
-- phy-mode : See ethernet.txt file in the same directory
-
-Optional properties:
-- reg-shift : Specify the quantity to shift the register offsets by
-- reg-io-width : Specify the size (in bytes) of the IO accesses that
-  should be performed on the device.  Valid value for SMSC LAN is
-  2 or 4.  If it's omitted or invalid, the size would be 2.
-- smsc,irq-active-high : Indicates the IRQ polarity is active-high
-- smsc,irq-push-pull : Indicates the IRQ type is push-pull
-- smsc,force-internal-phy : Forces SMSC LAN controller to use
-  internal PHY
-- smsc,force-external-phy : Forces SMSC LAN controller to use
-  external PHY
-- smsc,save-mac-address : Indicates that mac address needs to be saved
-  before resetting the controller
-- reset-gpios : a GPIO line connected to the RESET (active low) signal
-  of the device. On many systems this is wired high so the device goes
-  out of reset at power-on, but if it is under program control, this
-  optional GPIO can wake up in response to it.
-- vdd33a-supply, vddvario-supply : 3.3V analog and IO logic power supplies
-
-Examples:
-
-lan9220@f4000000 {
-       compatible = "smsc,lan9220", "smsc,lan9115";
-       reg = <0xf4000000 0x2000000>;
-       phy-mode = "mii";
-       interrupt-parent = <&gpio1>;
-       interrupts = <31>, <32>;
-       reset-gpios = <&gpio1 30 GPIO_ACTIVE_LOW>;
-       reg-io-width = <4>;
-       smsc,irq-push-pull;
-};
index 6ea91e4..c86628e 100644 (file)
@@ -212,6 +212,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_FEC_SET``               set FEC settings
   ``ETHTOOL_MSG_MODULE_EEPROM_GET``     read SFP module EEPROM
   ``ETHTOOL_MSG_STATS_GET``             get standard statistics
+  ``ETHTOOL_MSG_PHC_VCLOCKS_GET``       get PHC virtual clocks info
   ===================================== ================================
 
 Kernel to userspace:
@@ -250,6 +251,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_FEC_NTF``                  FEC settings
   ``ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY``  read SFP module EEPROM
   ``ETHTOOL_MSG_STATS_GET_REPLY``          standard statistics
+  ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY``    PHC virtual clocks info
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1477,6 +1479,25 @@ Low and high bounds are inclusive, for example:
  etherStatsPkts512to1023Octets 512  1023
  ============================= ==== ====
 
+PHC_VCLOCKS_GET
+===============
+
+Query device PHC virtual clocks information.
+
+Request contents:
+
+  ====================================  ======  ==========================
+  ``ETHTOOL_A_PHC_VCLOCKS_HEADER``      nested  request header
+  ====================================  ======  ==========================
+
+Kernel response contents:
+
+  ====================================  ======  ==========================
+  ``ETHTOOL_A_PHC_VCLOCKS_HEADER``      nested  reply header
+  ``ETHTOOL_A_PHC_VCLOCKS_NUM``         u32     PHC virtual clocks number
+  ``ETHTOOL_A_PHC_VCLOCKS_INDEX``       s32     PHC index array
+  ====================================  ======  ==========================
+
 Request translation
 ===================
 
@@ -1575,4 +1596,5 @@ are netlink only.
   n/a                                 ``ETHTOOL_MSG_CABLE_TEST_ACT``
   n/a                                 ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT``
   n/a                                 ``ETHTOOL_MSG_TUNNEL_INFO_GET``
+  n/a                                 ``ETHTOOL_MSG_PHC_VCLOCKS_GET``
   =================================== =====================================
index 0467b30..d31ed6c 100644 (file)
@@ -110,6 +110,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
        Be conservative in what you do, be liberal in what you accept from others.
        If it's non-zero, we mark only out of window RST segments as INVALID.
 
+nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
+       - 0 - disabled (default)
+       - 1 - enabled
+
+       If it's 1, we don't mark out of window RST segments as INVALID.
+
 nf_conntrack_tcp_loose - BOOLEAN
        - 0 - disabled
        - not 0 - enabled (default)
index 76775f2..ab63d29 100644 (file)
 Linux Kernel TIPC
 =================
 
-TIPC (Transparent Inter Process Communication) is a protocol that is
-specially designed for intra-cluster communication.
+Introduction
+============
 
-For more information about TIPC, see http://tipc.sourceforge.net.
+TIPC (Transparent Inter Process Communication) is a protocol that is specially
+designed for intra-cluster communication. It can be configured to transmit
+messages either on UDP or directly across Ethernet. Message delivery is
+sequence guaranteed, loss free and flow controlled. Latency times are shorter
+than with any other known protocol, while maximal throughput is comparable to
+that of TCP.
+
+TIPC Features
+-------------
+
+- Cluster wide IPC service
+
+  Have you ever wished you had the convenience of Unix Domain Sockets even when
+  transmitting data between cluster nodes? Where you yourself determine the
+  addresses you want to bind to and use? Where you don't have to perform DNS
+  lookups and worry about IP addresses? Where you don't have to start timers
+  to monitor the continuous existence of peer sockets? And yet without the
+  downsides of that socket type, such as the risk of lingering inodes?
+
+  Welcome to the Transparent Inter Process Communication service, TIPC in short,
+  which gives you all of this, and a lot more.
+
+- Service Addressing
+
+  A fundamental concept in TIPC is that of Service Addressing which makes it
+  possible for a programmer to choose his own address, bind it to a server
+  socket and let client programs use only that address for sending messages.
+
+- Service Tracking
+
+  A client wanting to wait for the availability of a server uses the Service
+  Tracking mechanism to subscribe for binding and unbinding/close events for
+  sockets with the associated service address.
+
+  The service tracking mechanism can also be used for Cluster Topology Tracking,
+  i.e., subscribing for availability/non-availability of cluster nodes.
+
+  Likewise, the service tracking mechanism can be used for Cluster Connectivity
+  Tracking, i.e., subscribing for up/down events for individual links between
+  cluster nodes.
+
+- Transmission Modes
+
+  Using a service address, a client can send datagram messages to a server socket.
+
+  Using the same address type, it can establish a connection towards an accepting
+  server socket.
+
+  It can also use a service address to create and join a Communication Group,
+  which is the TIPC manifestation of a brokerless message bus.
+
+  Multicast with very good performance and scalability is available both in
+  datagram mode and in communication group mode.
+
+- Inter Node Links
+
+  Communication between any two nodes in a cluster is maintained by one or two
+  Inter Node Links, which both guarantee data traffic integrity and monitor
+  the peer node's availability.
+
+- Cluster Scalability
+
+  By applying the Overlapping Ring Monitoring algorithm on the inter node links
+  it is possible to scale TIPC clusters up to 1000 nodes with a maintained
+  neighbor failure discovery time of 1-2 seconds. For smaller clusters this
+  time can be made much shorter.
+
+- Neighbor Discovery
+
+  Neighbor Node Discovery in the cluster is done by Ethernet broadcast or UDP
+  multicast, when any of those services are available. If not, configured peer
+  IP addresses can be used.
+
+- Configuration
+
+  When running TIPC in single node mode no configuration whatsoever is needed.
+  When running in cluster mode TIPC must as a minimum be given a node address
+  (before Linux 4.17) and told which interface to attach to. The "tipc"
+  configuration tool makes it possible to add and maintain many more
+  configuration parameters.
+
+- Performance
+
+  TIPC message transfer latency times are better than in any other known protocol.
+  Maximal byte throughput for inter-node connections is still somewhat lower than
+  for TCP, while they are superior for intra-node and inter-container throughput
+  on the same host.
+
+- Language Support
+
+  The TIPC user API has support for C, Python, Perl, Ruby, D and Go.
+
+More Information
+----------------
+
+- How to set up TIPC:
+
+  http://tipc.io/getting_started.html
+
+- How to program with TIPC:
+
+  http://tipc.io/programming.html
+
+- How to contribute to TIPC:
+
+  http://tipc.io/contacts.html
+
+- More details about TIPC specification:
+
+  http://tipc.io/protocol.html
+
+
+Implementation
+==============
+
+TIPC is implemented as a kernel module in net/tipc/ directory.
 
 TIPC Base Types
 ---------------
index a61f4f3..6c8be73 100644 (file)
@@ -933,6 +933,7 @@ F:  drivers/video/fbdev/geode/
 
 AMD IOMMU (AMD-VI)
 M:     Joerg Roedel <joro@8bytes.org>
+R:     Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 L:     iommu@lists.linux-foundation.org
 S:     Maintained
 T:     git git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git
@@ -15009,6 +15010,13 @@ F:     drivers/net/phy/dp83640*
 F:     drivers/ptp/*
 F:     include/linux/ptp_cl*
 
+PTP VIRTUAL CLOCK SUPPORT
+M:     Yangbo Lu <yangbo.lu@nxp.com>
+L:     netdev@vger.kernel.org
+S:     Maintained
+F:     drivers/ptp/ptp_vclock.c
+F:     net/ethtool/phc_vclocks.c
+
 PTRACE SUPPORT
 M:     Oleg Nesterov <oleg@redhat.com>
 S:     Maintained
index c3f9bd1..e97e754 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -802,7 +802,7 @@ else
 # Warn about unmarked fall-throughs in switch statement.
 # Disabled for clang while comment to attribute conversion happens and
 # https://github.com/ClangBuiltLinux/linux/issues/636 is discussed.
-KBUILD_CFLAGS += $(call cc-option,-Wimplicit-fallthrough,)
+KBUILD_CFLAGS += $(call cc-option,-Wimplicit-fallthrough=5,)
 endif
 
 # These warnings generated too much noise in a regular build.
index dace8ff..0a4ffd1 100644 (file)
                         * EBI2. This has a 25MHz crystal next to it, so no
                         * clocking is needed.
                         */
-                       ethernet-ebi2@2,0 {
+                       ethernet@2,0 {
                                compatible = "smsc,lan9221", "smsc,lan9115";
                                reg = <2 0x0 0x100>;
                                /*
                                phy-mode = "mii";
                                reg-io-width = <2>;
                                smsc,force-external-phy;
-                               /* IRQ on edge falling = active low */
-                               smsc,irq-active-low;
                                smsc,irq-push-pull;
 
                                /*
index 08f9dd6..86310d6 100644 (file)
@@ -76,7 +76,7 @@ static inline int __enable_fpu(enum fpu_mode mode)
                /* we only have a 32-bit FPU */
                return SIGFPE;
 #endif
-               fallthrough;
+               /* fallthrough */
        case FPU_32BIT:
                if (cpu_has_fre) {
                        /* clear FRE */
index cd4afcd..9adad24 100644 (file)
@@ -1383,6 +1383,7 @@ static void build_r4000_tlb_refill_handler(void)
        switch (boot_cpu_type()) {
        default:
                if (sizeof(long) == 4) {
+               fallthrough;
        case CPU_LOONGSON2EF:
                /* Loongson2 ebase is different than r4k, we have more space */
                        if ((p - tlb_handler) > 64)
@@ -2169,6 +2170,7 @@ static void build_r4000_tlb_load_handler(void)
                default:
                        if (cpu_has_mips_r2_exec_hazard) {
                                uasm_i_ehb(&p);
+                       fallthrough;
 
                case CPU_CAVIUM_OCTEON:
                case CPU_CAVIUM_OCTEON_PLUS:
index bdfea6d..3256a31 100644 (file)
@@ -146,6 +146,7 @@ static inline void psurge_clr_ipi(int cpu)
                switch(psurge_type) {
                case PSURGE_DUAL:
                        out_8(psurge_sec_intr, ~0);
+                       break;
                case PSURGE_NONE:
                        break;
                default:
index bbf8622..bd3ef12 100644 (file)
@@ -126,6 +126,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
        case DIE_SSTEP:
                if (uprobe_post_sstep_notifier(regs))
                        return NOTIFY_STOP;
+               break;
        default:
                break;
        }
index c42613c..739be5d 100644 (file)
@@ -765,7 +765,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 
                edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
                edx.split.bit_width_fixed = cap.bit_width_fixed;
-               edx.split.anythread_deprecated = 1;
+               if (cap.version)
+                       edx.split.anythread_deprecated = 1;
                edx.split.reserved1 = 0;
                edx.split.reserved2 = 0;
 
@@ -940,8 +941,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
                unsigned phys_as = entry->eax & 0xff;
 
-               if (!g_phys_as)
+               /*
+                * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
+                * the guest operates in the same PA space as the host, i.e.
+                * reductions in MAXPHYADDR for memory encryption affect shadow
+                * paging, too.
+                *
+                * If TDP is enabled but an explicit guest MAXPHYADDR is not
+                * provided, use the raw bare metal MAXPHYADDR as reductions to
+                * the HPAs do not affect GPAs.
+                */
+               if (!tdp_enabled)
+                       g_phys_as = boot_cpu_data.x86_phys_bits;
+               else if (!g_phys_as)
                        g_phys_as = phys_as;
+
                entry->eax = g_phys_as | (virt_as << 8);
                entry->edx = 0;
                cpuid_entry_override(entry, CPUID_8000_0008_EBX);
@@ -964,12 +978,18 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
        case 0x8000001a:
        case 0x8000001e:
                break;
-       /* Support memory encryption cpuid if host supports it */
        case 0x8000001F:
-               if (!kvm_cpu_cap_has(X86_FEATURE_SEV))
+               if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
                        entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
-               else
+               } else {
                        cpuid_entry_override(entry, CPUID_8000_001F_EAX);
+
+                       /*
+                        * Enumerate '0' for "PA bits reduction", the adjusted
+                        * MAXPHYADDR is enumerated directly (see 0x80000008).
+                        */
+                       entry->ebx &= ~GENMASK(11, 6);
+               }
                break;
        /*Add support for Centaur's CPUID instruction*/
        case 0xC0000000:
index 845d114..66f7f5b 100644 (file)
@@ -53,6 +53,8 @@
 #include <asm/kvm_page_track.h>
 #include "trace.h"
 
+#include "paging.h"
+
 extern bool itlb_multihit_kvm_mitigation;
 
 int __read_mostly nx_huge_pages = -1;
diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h
new file mode 100644 (file)
index 0000000..de8ab32
--- /dev/null
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Shadow paging constants/helpers that don't need to be #undef'd. */
+#ifndef __KVM_X86_PAGING_H
+#define __KVM_X86_PAGING_H
+
+#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_LVL_ADDR_MASK(level) \
+       (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
+                                               * PT64_LEVEL_BITS))) - 1))
+#define PT64_LVL_OFFSET_MASK(level) \
+       (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
+                                               * PT64_LEVEL_BITS))) - 1))
+#endif /* __KVM_X86_PAGING_H */
+
index 490a028..ee044d3 100644 (file)
@@ -24,7 +24,7 @@
        #define pt_element_t u64
        #define guest_walker guest_walker64
        #define FNAME(name) paging##64_##name
-       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
        #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
        #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -57,7 +57,7 @@
        #define pt_element_t u64
        #define guest_walker guest_walkerEPT
        #define FNAME(name) ept_##name
-       #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
+       #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK
        #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
        #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
index 7a5ce93..eb7b227 100644 (file)
@@ -38,12 +38,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
 #else
 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 #endif
-#define PT64_LVL_ADDR_MASK(level) \
-       (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
-                                               * PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
-       (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
-                                               * PT64_LEVEL_BITS))) - 1))
 
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
                        | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
index 21d03e3..3bd09c5 100644 (file)
@@ -154,6 +154,10 @@ void recalc_intercepts(struct vcpu_svm *svm)
 
        for (i = 0; i < MAX_INTERCEPT; i++)
                c->intercepts[i] |= g->intercepts[i];
+
+       /* If SMI is not intercepted, ignore guest SMI intercept as well */
+       if (!intercept_smi)
+               vmcb_clr_intercept(c, INTERCEPT_SMI);
 }
 
 static void copy_vmcb_control_area(struct vmcb_control_area *dst,
@@ -304,8 +308,8 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
        return true;
 }
 
-static void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
-                                           struct vmcb_control_area *control)
+void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                    struct vmcb_control_area *control)
 {
        copy_vmcb_control_area(&svm->nested.ctl, control);
 
@@ -618,6 +622,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
        struct kvm_host_map map;
        u64 vmcb12_gpa;
 
+       if (!svm->nested.hsave_msr) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        if (is_smm(vcpu)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
@@ -692,6 +701,27 @@ out:
        return ret;
 }
 
+/* Copy state save area fields which are handled by VMRUN */
+void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
+                         struct vmcb_save_area *to_save)
+{
+       to_save->es = from_save->es;
+       to_save->cs = from_save->cs;
+       to_save->ss = from_save->ss;
+       to_save->ds = from_save->ds;
+       to_save->gdtr = from_save->gdtr;
+       to_save->idtr = from_save->idtr;
+       to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
+       to_save->efer = from_save->efer;
+       to_save->cr0 = from_save->cr0;
+       to_save->cr3 = from_save->cr3;
+       to_save->cr4 = from_save->cr4;
+       to_save->rax = from_save->rax;
+       to_save->rsp = from_save->rsp;
+       to_save->rip = from_save->rip;
+       to_save->cpl = 0;
+}
+
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 {
        to_vmcb->save.fs = from_vmcb->save.fs;
@@ -1355,28 +1385,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 
        svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
 
-       svm->vmcb01.ptr->save.es = save->es;
-       svm->vmcb01.ptr->save.cs = save->cs;
-       svm->vmcb01.ptr->save.ss = save->ss;
-       svm->vmcb01.ptr->save.ds = save->ds;
-       svm->vmcb01.ptr->save.gdtr = save->gdtr;
-       svm->vmcb01.ptr->save.idtr = save->idtr;
-       svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED;
-       svm->vmcb01.ptr->save.efer = save->efer;
-       svm->vmcb01.ptr->save.cr0 = save->cr0;
-       svm->vmcb01.ptr->save.cr3 = save->cr3;
-       svm->vmcb01.ptr->save.cr4 = save->cr4;
-       svm->vmcb01.ptr->save.rax = save->rax;
-       svm->vmcb01.ptr->save.rsp = save->rsp;
-       svm->vmcb01.ptr->save.rip = save->rip;
-       svm->vmcb01.ptr->save.cpl = 0;
-
+       svm_copy_vmrun_state(save, &svm->vmcb01.ptr->save);
        nested_load_control_from_vmcb12(svm, ctl);
 
        svm_switch_vmcb(svm, &svm->nested.vmcb02);
-
        nested_vmcb02_prepare_control(svm);
-
        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
        ret = 0;
 out_free:
index 62926f1..6710d9e 100644 (file)
@@ -1272,8 +1272,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        /* Pin guest memory */
        guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
                                    PAGE_SIZE, &n, 0);
-       if (!guest_page)
-               return -EFAULT;
+       if (IS_ERR(guest_page))
+               return PTR_ERR(guest_page);
 
        /* allocate memory for header and transport buffer */
        ret = -ENOMEM;
@@ -1310,8 +1310,9 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        }
 
        /* Copy packet header to userspace. */
-       ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
-                               params.hdr_len);
+       if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+                        params.hdr_len))
+               ret = -EFAULT;
 
 e_free_trans_data:
        kfree(trans_data);
@@ -1463,11 +1464,12 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
        data.trans_len = params.trans_len;
 
        /* Pin guest memory */
-       ret = -EFAULT;
        guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
                                    PAGE_SIZE, &n, 0);
-       if (!guest_page)
+       if (IS_ERR(guest_page)) {
+               ret = PTR_ERR(guest_page);
                goto e_free_trans;
+       }
 
        /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
        data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
index 8834822..664d20f 100644 (file)
@@ -198,6 +198,11 @@ module_param(avic, bool, 0444);
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+
 static bool svm_gp_erratum_intercept = true;
 
 static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1185,7 +1190,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 
        svm_set_intercept(svm, INTERCEPT_INTR);
        svm_set_intercept(svm, INTERCEPT_NMI);
-       svm_set_intercept(svm, INTERCEPT_SMI);
+
+       if (intercept_smi)
+               svm_set_intercept(svm, INTERCEPT_SMI);
+
        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
        svm_set_intercept(svm, INTERCEPT_RDPMC);
        svm_set_intercept(svm, INTERCEPT_CPUID);
@@ -1923,7 +1931,7 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
@@ -2106,6 +2114,11 @@ static int nmi_interception(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
+       return 1;
+}
+
 static int intr_interception(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.irq_exits;
@@ -2941,7 +2954,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                        svm_disable_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
-               svm->nested.hsave_msr = data;
+               /*
+                * Old kernels did not validate the value written to
+                * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
+                * value to allow live migrating buggy or malicious guests
+                * originating from those kernels.
+                */
+               if (!msr->host_initiated && !page_address_valid(vcpu, data))
+                       return 1;
+
+               svm->nested.hsave_msr = data & PAGE_MASK;
                break;
        case MSR_VM_CR:
                return svm_set_vm_cr(vcpu, data);
@@ -3080,8 +3102,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
-       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
+       [SVM_EXIT_SMI]                          = smi_interception,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
@@ -4288,6 +4309,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_host_map map_save;
        int ret;
 
        if (is_guest_mode(vcpu)) {
@@ -4303,6 +4325,29 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
                ret = nested_svm_vmexit(svm);
                if (ret)
                        return ret;
+
+               /*
+                * KVM uses VMCB01 to store L1 host state while L2 runs but
+                * VMCB01 is going to be used during SMM and thus the state will
+                * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
+                * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+                * format of the area is identical to guest save area offset
+                * by 0x400 (matches the offset of 'struct vmcb_save_area'
+                * within 'struct vmcb'). Note: HSAVE area may also be used by
+                * L1 hypervisor to save additional host context (e.g. KVM does
+                * that, see svm_prepare_guest_switch()) which must be
+                * preserved.
+                */
+               if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                &map_save) == -EINVAL)
+                       return 1;
+
+               BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+               svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
+                                    map_save.hva + 0x400);
+
+               kvm_vcpu_unmap(vcpu, &map_save, true);
        }
        return 0;
 }
@@ -4310,13 +4355,14 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct kvm_host_map map;
+       struct kvm_host_map map, map_save;
        int ret = 0;
 
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
                u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
                u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
                u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+               struct vmcb *vmcb12;
 
                if (guest) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
@@ -4332,8 +4378,25 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       vmcb12 = map.hva;
+
+                       nested_load_control_from_vmcb12(svm, &vmcb12->control);
+
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12);
                        kvm_vcpu_unmap(vcpu, &map, true);
+
+                       /*
+                        * Restore L1 host state from L1 HSAVE area as VMCB01 was
+                        * used during SMM (see svm_enter_smm())
+                        */
+                       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                        &map_save) == -EINVAL)
+                               return 1;
+
+                       svm_copy_vmrun_state(map_save.hva + 0x400,
+                                            &svm->vmcb01.ptr->save);
+
+                       kvm_vcpu_unmap(vcpu, &map_save, true);
                }
        }
 
index f89b623..7e20907 100644 (file)
@@ -31,6 +31,7 @@
 #define MSRPM_OFFSETS  16
 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
 extern bool npt_enabled;
+extern bool intercept_smi;
 
 /*
  * Clean bits in VMCB.
@@ -463,6 +464,8 @@ void svm_leave_nested(struct vcpu_svm *svm);
 void svm_free_nested(struct vcpu_svm *svm);
 int svm_allocate_nested(struct vcpu_svm *svm);
 int nested_svm_vmrun(struct kvm_vcpu *vcpu);
+void svm_copy_vmrun_state(struct vmcb_save_area *from_save,
+                         struct vmcb_save_area *to_save);
 void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb);
 int nested_svm_vmexit(struct vcpu_svm *svm);
 
@@ -479,6 +482,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
 int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
                               bool has_error_code, u32 error_code);
 int nested_svm_exit_special(struct vcpu_svm *svm);
+void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
+                                    struct vmcb_control_area *control);
 void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
 void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
index 3979a94..db88ed4 100644 (file)
@@ -14,8 +14,6 @@
 #include "vmx_ops.h"
 #include "cpuid.h"
 
-extern const u32 vmx_msr_index[];
-
 #define MSR_TYPE_R     1
 #define MSR_TYPE_W     2
 #define MSR_TYPE_RW    3
index c6dc1b4..a4fd106 100644 (file)
@@ -9601,6 +9601,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                set_debugreg(vcpu->arch.eff_db[3], 3);
                set_debugreg(vcpu->arch.dr6, 6);
                vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+       } else if (unlikely(hw_breakpoint_active())) {
+               set_debugreg(0, 7);
        }
 
        for (;;) {
@@ -10985,9 +10987,6 @@ int kvm_arch_hardware_setup(void *opaque)
        int r;
 
        rdmsrl_safe(MSR_EFER, &host_efer);
-       if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
-                        !(host_efer & EFER_NX)))
-               return -EIO;
 
        if (boot_cpu_has(X86_FEATURE_XSAVES))
                rdmsrl(MSR_IA32_XSS, host_xss);
index e835164..4b95145 100644 (file)
@@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 
        for (i = 0; i < prog->aux->size_poke_tab; i++) {
                poke = &prog->aux->poke_tab[i];
+               if (poke->aux && poke->aux != prog->aux)
+                       continue;
+
                WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
 
                if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
index 027484e..3c99696 100644 (file)
@@ -75,6 +75,7 @@ static int __op_panel_update_display(void)
                                rc);
                        break;
                }
+               break;
        case OPAL_SUCCESS:
                break;
        default:
index 182a4db..c538a15 100644 (file)
@@ -942,8 +942,6 @@ static int __init longhaul_init(void)
                return cpufreq_register_driver(&longhaul_driver);
        case 10:
                pr_err("Use acpi-cpufreq driver for VIA C7\n");
-       default:
-               ;
        }
 
        return -ENODEV;
index 104ad42..baab1ca 100644 (file)
@@ -618,6 +618,7 @@ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan)
        case IDMAC_SDC_1:
        case IDMAC_IC_7:
                ipu_channel_set_priority(ipu, channel, true);
+               break;
        default:
                break;
        }
@@ -978,6 +979,7 @@ static int ipu_init_channel(struct idmac *idmac, struct idmac_channel *ichan)
        case IDMAC_SDC_0:
        case IDMAC_SDC_1:
                n_desc = 4;
+               break;
        default:
                break;
        }
index c1a6914..4a51fdb 100644 (file)
@@ -813,6 +813,7 @@ inline bool is_buswidth_valid(u8 buswidth, bool is_mpc8308)
        case 16:
                if (is_mpc8308)
                        return false;
+               break;
        case 1:
        case 2:
        case 4:
index 96ad218..a358586 100644 (file)
@@ -4948,6 +4948,7 @@ static int setup_resources(struct udma_dev *ud)
                                                       ud->tchan_cnt),
                         ud->rchan_cnt - bitmap_weight(ud->rchan_map,
                                                       ud->rchan_cnt));
+               break;
        default:
                break;
        }
index 91164c5..2fc4c3f 100644 (file)
@@ -271,7 +271,7 @@ config EDAC_PND2
 config EDAC_IGEN6
        tristate "Intel client SoC Integrated MC"
        depends on PCI && PCI_MMCONFIG && ARCH_HAVE_NMI_SAFE_CMPXCHG
-       depends on X64_64 && X86_MCE_INTEL
+       depends on X86_64 && X86_MCE_INTEL
        help
          Support for error detection and correction on the Intel
          client SoC Integrated Memory Controller using In-Band ECC IP.
index f4fb68e..e382b7f 100644 (file)
@@ -62,6 +62,7 @@ static void try_to_writeback(struct drm_i915_gem_object *obj,
        switch (obj->mm.madv) {
        case I915_MADV_DONTNEED:
                i915_gem_object_truncate(obj);
+               return;
        case __I915_MADV_PURGED:
                return;
        }
index 1411787..1e8a971 100644 (file)
@@ -1169,7 +1169,7 @@ static int msm_gem_new_impl(struct drm_device *dev,
        case MSM_BO_CACHED_COHERENT:
                if (priv->has_cached_coherent)
                        break;
-               /* fallthrough */
+               fallthrough;
        default:
                DRM_DEV_ERROR(dev->dev, "invalid cache flag: %x\n",
                                (flags & MSM_BO_CACHE_MASK));
index dd20b01..235f9bd 100644 (file)
@@ -379,6 +379,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
        switch (idx) {
        case CMDQ_ERR_CERROR_ABT_IDX:
                dev_err(smmu->dev, "retrying command fetch\n");
+               return;
        case CMDQ_ERR_CERROR_NONE_IDX:
                return;
        case CMDQ_ERR_CERROR_ATC_INV_IDX:
index 25ed444..021cf8f 100644 (file)
@@ -849,12 +849,10 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
        ret = iommu_device_register(&qcom_iommu->iommu, &qcom_iommu_ops, dev);
        if (ret) {
                dev_err(dev, "Failed to register iommu\n");
-               goto err_sysfs_remove;
+               return ret;
        }
 
-       ret = bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
-       if (ret)
-               goto err_unregister_device;
+       bus_set_iommu(&platform_bus_type, &qcom_iommu_ops);
 
        if (qcom_iommu->local_base) {
                pm_runtime_get_sync(dev);
@@ -863,13 +861,6 @@ static int qcom_iommu_device_probe(struct platform_device *pdev)
        }
 
        return 0;
-
-err_unregister_device:
-       iommu_device_unregister(&qcom_iommu->iommu);
-
-err_sysfs_remove:
-       iommu_device_sysfs_remove(&qcom_iommu->iommu);
-       return ret;
 }
 
 static int qcom_iommu_device_remove(struct platform_device *pdev)
index a6a07d9..dd22fc7 100644 (file)
@@ -2429,10 +2429,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
        return 0;
 }
 
-static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
+static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
 {
-       unsigned long flags;
+       struct intel_iommu *iommu = info->iommu;
        struct context_entry *context;
+       unsigned long flags;
        u16 did_old;
 
        if (!iommu)
@@ -2444,7 +2445,16 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn
                spin_unlock_irqrestore(&iommu->lock, flags);
                return;
        }
-       did_old = context_domain_id(context);
+
+       if (sm_supported(iommu)) {
+               if (hw_pass_through && domain_type_is_si(info->domain))
+                       did_old = FLPT_DEFAULT_DID;
+               else
+                       did_old = info->domain->iommu_did[iommu->seq_id];
+       } else {
+               did_old = context_domain_id(context);
+       }
+
        context_clear_entry(context);
        __iommu_flush_cache(iommu, context, sizeof(*context));
        spin_unlock_irqrestore(&iommu->lock, flags);
@@ -2462,6 +2472,8 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn
                                 0,
                                 0,
                                 DMA_TLB_DSI_FLUSH);
+
+       __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
 }
 
 static inline void unlink_domain_info(struct device_domain_info *info)
@@ -4425,9 +4437,9 @@ out_free_dmar:
 
 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
 {
-       struct intel_iommu *iommu = opaque;
+       struct device_domain_info *info = opaque;
 
-       domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+       domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
        return 0;
 }
 
@@ -4437,12 +4449,13 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op
  * devices, unbinding the driver from any one of them will possibly leave
  * the others unable to operate.
  */
-static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
+static void domain_context_clear(struct device_domain_info *info)
 {
-       if (!iommu || !dev || !dev_is_pci(dev))
+       if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
                return;
 
-       pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
+       pci_for_each_dma_alias(to_pci_dev(info->dev),
+                              &domain_context_clear_one_cb, info);
 }
 
 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
@@ -4459,14 +4472,13 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info)
        iommu = info->iommu;
        domain = info->domain;
 
-       if (info->dev) {
+       if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
                if (dev_is_pci(info->dev) && sm_supported(iommu))
                        intel_pasid_tear_down_entry(iommu, info->dev,
                                        PASID_RID2PASID, false);
 
                iommu_disable_dev_iotlb(info);
-               if (!dev_is_real_dma_subdevice(info->dev))
-                       domain_context_clear(iommu, info->dev);
+               domain_context_clear(info);
                intel_pasid_free_table(info->dev);
        }
 
index 94b9d8e..9febfb7 100644 (file)
@@ -544,12 +544,14 @@ static inline u32 rk_dma_addr_dte(dma_addr_t dt_dma)
 }
 
 #define DT_HI_MASK GENMASK_ULL(39, 32)
+#define DTE_BASE_HI_MASK GENMASK(11, 4)
 #define DT_SHIFT   28
 
 static inline phys_addr_t rk_dte_addr_phys_v2(u32 addr)
 {
-       return (phys_addr_t)(addr & RK_DTE_PT_ADDRESS_MASK) |
-              ((addr & DT_HI_MASK) << DT_SHIFT);
+       u64 addr64 = addr;
+       return (phys_addr_t)(addr64 & RK_DTE_PT_ADDRESS_MASK) |
+              ((addr64 & DTE_BASE_HI_MASK) << DT_SHIFT);
 }
 
 static inline u32 rk_dma_addr_dte_v2(dma_addr_t dt_dma)
index 0db17bc..cb1a64a 100644 (file)
@@ -789,6 +789,8 @@ static irqreturn_t jz_mmc_irq_worker(int irq, void *devid)
                                break;
                        }
                }
+               fallthrough;
+
        case JZ4740_MMC_STATE_DONE:
                break;
        }
index 99b7986..6a6a2a2 100644 (file)
@@ -108,8 +108,8 @@ map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi
 #if BITS_PER_LONG >= 64
        case 8:
                onecmd |= (onecmd << (chip_mode * 32));
-#endif
                fallthrough;
+#endif
        case 4:
                onecmd |= (onecmd << (chip_mode * 16));
                fallthrough;
@@ -164,8 +164,8 @@ unsigned long cfi_merge_status(map_word val, struct map_info *map,
 #if BITS_PER_LONG >= 64
        case 8:
                res |= (onestat >> (chip_mode * 32));
-#endif
                fallthrough;
+#endif
        case 4:
                res |= (onestat >> (chip_mode * 16));
                fallthrough;
index 0ff7567..d22d783 100644 (file)
@@ -401,24 +401,85 @@ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
 static int bond_ipsec_add_sa(struct xfrm_state *xs)
 {
        struct net_device *bond_dev = xs->xso.dev;
+       struct bond_ipsec *ipsec;
        struct bonding *bond;
        struct slave *slave;
+       int err;
 
        if (!bond_dev)
                return -EINVAL;
 
+       rcu_read_lock();
        bond = netdev_priv(bond_dev);
        slave = rcu_dereference(bond->curr_active_slave);
-       xs->xso.real_dev = slave->dev;
-       bond->xs = xs;
+       if (!slave) {
+               rcu_read_unlock();
+               return -ENODEV;
+       }
 
-       if (!(slave->dev->xfrmdev_ops
-             && slave->dev->xfrmdev_ops->xdo_dev_state_add)) {
+       if (!slave->dev->xfrmdev_ops ||
+           !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
+           netif_is_bond_master(slave->dev)) {
                slave_warn(bond_dev, slave->dev, "Slave does not support ipsec offload\n");
+               rcu_read_unlock();
                return -EINVAL;
        }
 
-       return slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
+       ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC);
+       if (!ipsec) {
+               rcu_read_unlock();
+               return -ENOMEM;
+       }
+       xs->xso.real_dev = slave->dev;
+
+       err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
+       if (!err) {
+               ipsec->xs = xs;
+               INIT_LIST_HEAD(&ipsec->list);
+               spin_lock_bh(&bond->ipsec_lock);
+               list_add(&ipsec->list, &bond->ipsec_list);
+               spin_unlock_bh(&bond->ipsec_lock);
+       } else {
+               kfree(ipsec);
+       }
+       rcu_read_unlock();
+       return err;
+}
+
+static void bond_ipsec_add_sa_all(struct bonding *bond)
+{
+       struct net_device *bond_dev = bond->dev;
+       struct bond_ipsec *ipsec;
+       struct slave *slave;
+
+       rcu_read_lock();
+       slave = rcu_dereference(bond->curr_active_slave);
+       if (!slave)
+               goto out;
+
+       if (!slave->dev->xfrmdev_ops ||
+           !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
+           netif_is_bond_master(slave->dev)) {
+               spin_lock_bh(&bond->ipsec_lock);
+               if (!list_empty(&bond->ipsec_list))
+                       slave_warn(bond_dev, slave->dev,
+                                  "%s: no slave xdo_dev_state_add\n",
+                                  __func__);
+               spin_unlock_bh(&bond->ipsec_lock);
+               goto out;
+       }
+
+       spin_lock_bh(&bond->ipsec_lock);
+       list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+               ipsec->xs->xso.real_dev = slave->dev;
+               if (slave->dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs)) {
+                       slave_warn(bond_dev, slave->dev, "%s: failed to add SA\n", __func__);
+                       ipsec->xs->xso.real_dev = NULL;
+               }
+       }
+       spin_unlock_bh(&bond->ipsec_lock);
+out:
+       rcu_read_unlock();
 }
 
 /**
@@ -428,27 +489,77 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 static void bond_ipsec_del_sa(struct xfrm_state *xs)
 {
        struct net_device *bond_dev = xs->xso.dev;
+       struct bond_ipsec *ipsec;
        struct bonding *bond;
        struct slave *slave;
 
        if (!bond_dev)
                return;
 
+       rcu_read_lock();
        bond = netdev_priv(bond_dev);
        slave = rcu_dereference(bond->curr_active_slave);
 
        if (!slave)
-               return;
+               goto out;
 
-       xs->xso.real_dev = slave->dev;
+       if (!xs->xso.real_dev)
+               goto out;
+
+       WARN_ON(xs->xso.real_dev != slave->dev);
 
-       if (!(slave->dev->xfrmdev_ops
-             && slave->dev->xfrmdev_ops->xdo_dev_state_delete)) {
+       if (!slave->dev->xfrmdev_ops ||
+           !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
+           netif_is_bond_master(slave->dev)) {
                slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__);
-               return;
+               goto out;
        }
 
        slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs);
+out:
+       spin_lock_bh(&bond->ipsec_lock);
+       list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+               if (ipsec->xs == xs) {
+                       list_del(&ipsec->list);
+                       kfree(ipsec);
+                       break;
+               }
+       }
+       spin_unlock_bh(&bond->ipsec_lock);
+       rcu_read_unlock();
+}
+
+static void bond_ipsec_del_sa_all(struct bonding *bond)
+{
+       struct net_device *bond_dev = bond->dev;
+       struct bond_ipsec *ipsec;
+       struct slave *slave;
+
+       rcu_read_lock();
+       slave = rcu_dereference(bond->curr_active_slave);
+       if (!slave) {
+               rcu_read_unlock();
+               return;
+       }
+
+       spin_lock_bh(&bond->ipsec_lock);
+       list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+               if (!ipsec->xs->xso.real_dev)
+                       continue;
+
+               if (!slave->dev->xfrmdev_ops ||
+                   !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
+                   netif_is_bond_master(slave->dev)) {
+                       slave_warn(bond_dev, slave->dev,
+                                  "%s: no slave xdo_dev_state_delete\n",
+                                  __func__);
+               } else {
+                       slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs);
+               }
+               ipsec->xs->xso.real_dev = NULL;
+       }
+       spin_unlock_bh(&bond->ipsec_lock);
+       rcu_read_unlock();
 }
 
 /**
@@ -459,21 +570,37 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 {
        struct net_device *bond_dev = xs->xso.dev;
-       struct bonding *bond = netdev_priv(bond_dev);
-       struct slave *curr_active = rcu_dereference(bond->curr_active_slave);
-       struct net_device *slave_dev = curr_active->dev;
+       struct net_device *real_dev;
+       struct slave *curr_active;
+       struct bonding *bond;
+       int err;
 
-       if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
-               return true;
+       bond = netdev_priv(bond_dev);
+       rcu_read_lock();
+       curr_active = rcu_dereference(bond->curr_active_slave);
+       real_dev = curr_active->dev;
 
-       if (!(slave_dev->xfrmdev_ops
-             && slave_dev->xfrmdev_ops->xdo_dev_offload_ok)) {
-               slave_warn(bond_dev, slave_dev, "%s: no slave xdo_dev_offload_ok\n", __func__);
-               return false;
+       if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+               err = false;
+               goto out;
        }
 
-       xs->xso.real_dev = slave_dev;
-       return slave_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+       if (!xs->xso.real_dev) {
+               err = false;
+               goto out;
+       }
+
+       if (!real_dev->xfrmdev_ops ||
+           !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
+           netif_is_bond_master(real_dev)) {
+               err = false;
+               goto out;
+       }
+
+       err = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+out:
+       rcu_read_unlock();
+       return err;
 }
 
 static const struct xfrmdev_ops bond_xfrmdev_ops = {
@@ -990,8 +1117,7 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
                return;
 
 #ifdef CONFIG_XFRM_OFFLOAD
-       if (old_active && bond->xs)
-               bond_ipsec_del_sa(bond->xs);
+       bond_ipsec_del_sa_all(bond);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
        if (new_active) {
@@ -1066,10 +1192,7 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
        }
 
 #ifdef CONFIG_XFRM_OFFLOAD
-       if (new_active && bond->xs) {
-               xfrm_dev_state_flush(dev_net(bond->dev), bond->dev, true);
-               bond_ipsec_add_sa(bond->xs);
-       }
+       bond_ipsec_add_sa_all(bond);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
        /* resend IGMP joins since active slave has changed or
@@ -3327,6 +3450,7 @@ static int bond_master_netdev_event(unsigned long event,
                return bond_event_changename(event_bond);
        case NETDEV_UNREGISTER:
                bond_remove_proc_entry(event_bond);
+               xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true);
                break;
        case NETDEV_REGISTER:
                bond_create_proc_entry(event_bond);
@@ -4894,7 +5018,8 @@ void bond_setup(struct net_device *bond_dev)
 #ifdef CONFIG_XFRM_OFFLOAD
        /* set up xfrm device ops (only supported in active-backup right now) */
        bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
-       bond->xs = NULL;
+       INIT_LIST_HEAD(&bond->ipsec_list);
+       spin_lock_init(&bond->ipsec_lock);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
        /* don't acquire bond device's netif_tx_lock when transmitting */
index a77124b..709660c 100644 (file)
@@ -20,15 +20,6 @@ config CAIF_TTY
          identified as N_CAIF. When this ldisc is opened from user space
          it will redirect the TTY's traffic into the CAIF stack.
 
-config CAIF_HSI
-       tristate "CAIF HSI transport driver"
-       depends on CAIF
-       default n
-       help
-         The CAIF low level driver for CAIF over HSI.
-         Be aware that if you enable this then you also need to
-         enable a low-level HSI driver.
-
 config CAIF_VIRTIO
        tristate "CAIF virtio transport driver"
        depends on CAIF && HAS_DMA
index b1918c8..97f664f 100644 (file)
@@ -4,8 +4,5 @@ ccflags-$(CONFIG_CAIF_DEBUG) := -DDEBUG
 # Serial interface
 obj-$(CONFIG_CAIF_TTY) += caif_serial.o
 
-# HSI interface
-obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
-
 # Virtio interface
 obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
deleted file mode 100644 (file)
index 3d63b15..0000000
+++ /dev/null
@@ -1,1454 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author:  Daniel Martensson
- *         Dmitry.Tarnyagin  / dmitry.tarnyagin@lockless.no
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME fmt
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/netdevice.h>
-#include <linux/string.h>
-#include <linux/list.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/if_arp.h>
-#include <linux/timer.h>
-#include <net/rtnetlink.h>
-#include <linux/pkt_sched.h>
-#include <net/caif/caif_layer.h>
-#include <net/caif/caif_hsi.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Daniel Martensson");
-MODULE_DESCRIPTION("CAIF HSI driver");
-
-/* Returns the number of padding bytes for alignment. */
-#define PAD_POW2(x, pow) ((((x)&((pow)-1)) == 0) ? 0 :\
-                               (((pow)-((x)&((pow)-1)))))
-
-static const struct cfhsi_config  hsi_default_config = {
-
-       /* Inactivity timeout on HSI, ms */
-       .inactivity_timeout = HZ,
-
-       /* Aggregation timeout (ms) of zero means no aggregation is done*/
-       .aggregation_timeout = 1,
-
-       /*
-        * HSI link layer flow-control thresholds.
-        * Threshold values for the HSI packet queue. Flow-control will be
-        * asserted when the number of packets exceeds q_high_mark. It will
-        * not be de-asserted before the number of packets drops below
-        * q_low_mark.
-        * Warning: A high threshold value might increase throughput but it
-        * will at the same time prevent channel prioritization and increase
-        * the risk of flooding the modem. The high threshold should be above
-        * the low.
-        */
-       .q_high_mark = 100,
-       .q_low_mark = 50,
-
-       /*
-        * HSI padding options.
-        * Warning: must be a base of 2 (& operation used) and can not be zero !
-        */
-       .head_align = 4,
-       .tail_align = 4,
-};
-
-#define ON 1
-#define OFF 0
-
-static LIST_HEAD(cfhsi_list);
-
-static void cfhsi_inactivity_tout(struct timer_list *t)
-{
-       struct cfhsi *cfhsi = from_timer(cfhsi, t, inactivity_timer);
-
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       /* Schedule power down work queue. */
-       if (!test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               queue_work(cfhsi->wq, &cfhsi->wake_down_work);
-}
-
-static void cfhsi_update_aggregation_stats(struct cfhsi *cfhsi,
-                                          const struct sk_buff *skb,
-                                          int direction)
-{
-       struct caif_payload_info *info;
-       int hpad, tpad, len;
-
-       info = (struct caif_payload_info *)&skb->cb;
-       hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-       tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-       len = skb->len + hpad + tpad;
-
-       if (direction > 0)
-               cfhsi->aggregation_len += len;
-       else if (direction < 0)
-               cfhsi->aggregation_len -= len;
-}
-
-static bool cfhsi_can_send_aggregate(struct cfhsi *cfhsi)
-{
-       int i;
-
-       if (cfhsi->cfg.aggregation_timeout == 0)
-               return true;
-
-       for (i = 0; i < CFHSI_PRIO_BEBK; ++i) {
-               if (cfhsi->qhead[i].qlen)
-                       return true;
-       }
-
-       /* TODO: Use aggregation_len instead */
-       if (cfhsi->qhead[CFHSI_PRIO_BEBK].qlen >= CFHSI_MAX_PKTS)
-               return true;
-
-       return false;
-}
-
-static struct sk_buff *cfhsi_dequeue(struct cfhsi *cfhsi)
-{
-       struct sk_buff *skb;
-       int i;
-
-       for (i = 0; i < CFHSI_PRIO_LAST; ++i) {
-               skb = skb_dequeue(&cfhsi->qhead[i]);
-               if (skb)
-                       break;
-       }
-
-       return skb;
-}
-
-static int cfhsi_tx_queue_len(struct cfhsi *cfhsi)
-{
-       int i, len = 0;
-       for (i = 0; i < CFHSI_PRIO_LAST; ++i)
-               len += skb_queue_len(&cfhsi->qhead[i]);
-       return len;
-}
-
-static void cfhsi_abort_tx(struct cfhsi *cfhsi)
-{
-       struct sk_buff *skb;
-
-       for (;;) {
-               spin_lock_bh(&cfhsi->lock);
-               skb = cfhsi_dequeue(cfhsi);
-               if (!skb)
-                       break;
-
-               cfhsi->ndev->stats.tx_errors++;
-               cfhsi->ndev->stats.tx_dropped++;
-               cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-               spin_unlock_bh(&cfhsi->lock);
-               kfree_skb(skb);
-       }
-       cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-       if (!test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               mod_timer(&cfhsi->inactivity_timer,
-                       jiffies + cfhsi->cfg.inactivity_timeout);
-       spin_unlock_bh(&cfhsi->lock);
-}
-
-static int cfhsi_flush_fifo(struct cfhsi *cfhsi)
-{
-       char buffer[32]; /* Any reasonable value */
-       size_t fifo_occupancy;
-       int ret;
-
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       do {
-               ret = cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-                               &fifo_occupancy);
-               if (ret) {
-                       netdev_warn(cfhsi->ndev,
-                               "%s: can't get FIFO occupancy: %d.\n",
-                               __func__, ret);
-                       break;
-               } else if (!fifo_occupancy)
-                       /* No more data, exitting normally */
-                       break;
-
-               fifo_occupancy = min(sizeof(buffer), fifo_occupancy);
-               set_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits);
-               ret = cfhsi->ops->cfhsi_rx(buffer, fifo_occupancy,
-                               cfhsi->ops);
-               if (ret) {
-                       clear_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits);
-                       netdev_warn(cfhsi->ndev,
-                               "%s: can't read data: %d.\n",
-                               __func__, ret);
-                       break;
-               }
-
-               ret = 5 * HZ;
-               ret = wait_event_interruptible_timeout(cfhsi->flush_fifo_wait,
-                        !test_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits), ret);
-
-               if (ret < 0) {
-                       netdev_warn(cfhsi->ndev,
-                               "%s: can't wait for flush complete: %d.\n",
-                               __func__, ret);
-                       break;
-               } else if (!ret) {
-                       ret = -ETIMEDOUT;
-                       netdev_warn(cfhsi->ndev,
-                               "%s: timeout waiting for flush complete.\n",
-                               __func__);
-                       break;
-               }
-       } while (1);
-
-       return ret;
-}
-
-static int cfhsi_tx_frm(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-       int nfrms = 0;
-       int pld_len = 0;
-       struct sk_buff *skb;
-       u8 *pfrm = desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ;
-
-       skb = cfhsi_dequeue(cfhsi);
-       if (!skb)
-               return 0;
-
-       /* Clear offset. */
-       desc->offset = 0;
-
-       /* Check if we can embed a CAIF frame. */
-       if (skb->len < CFHSI_MAX_EMB_FRM_SZ) {
-               struct caif_payload_info *info;
-               int hpad;
-               int tpad;
-
-               /* Calculate needed head alignment and tail alignment. */
-               info = (struct caif_payload_info *)&skb->cb;
-
-               hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-               tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-
-               /* Check if frame still fits with added alignment. */
-               if ((skb->len + hpad + tpad) <= CFHSI_MAX_EMB_FRM_SZ) {
-                       u8 *pemb = desc->emb_frm;
-                       desc->offset = CFHSI_DESC_SHORT_SZ;
-                       *pemb = (u8)(hpad - 1);
-                       pemb += hpad;
-
-                       /* Update network statistics. */
-                       spin_lock_bh(&cfhsi->lock);
-                       cfhsi->ndev->stats.tx_packets++;
-                       cfhsi->ndev->stats.tx_bytes += skb->len;
-                       cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-                       spin_unlock_bh(&cfhsi->lock);
-
-                       /* Copy in embedded CAIF frame. */
-                       skb_copy_bits(skb, 0, pemb, skb->len);
-
-                       /* Consume the SKB */
-                       consume_skb(skb);
-                       skb = NULL;
-               }
-       }
-
-       /* Create payload CAIF frames. */
-       while (nfrms < CFHSI_MAX_PKTS) {
-               struct caif_payload_info *info;
-               int hpad;
-               int tpad;
-
-               if (!skb)
-                       skb = cfhsi_dequeue(cfhsi);
-
-               if (!skb)
-                       break;
-
-               /* Calculate needed head alignment and tail alignment. */
-               info = (struct caif_payload_info *)&skb->cb;
-
-               hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-               tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-
-               /* Fill in CAIF frame length in descriptor. */
-               desc->cffrm_len[nfrms] = hpad + skb->len + tpad;
-
-               /* Fill head padding information. */
-               *pfrm = (u8)(hpad - 1);
-               pfrm += hpad;
-
-               /* Update network statistics. */
-               spin_lock_bh(&cfhsi->lock);
-               cfhsi->ndev->stats.tx_packets++;
-               cfhsi->ndev->stats.tx_bytes += skb->len;
-               cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-               spin_unlock_bh(&cfhsi->lock);
-
-               /* Copy in CAIF frame. */
-               skb_copy_bits(skb, 0, pfrm, skb->len);
-
-               /* Update payload length. */
-               pld_len += desc->cffrm_len[nfrms];
-
-               /* Update frame pointer. */
-               pfrm += skb->len + tpad;
-
-               /* Consume the SKB */
-               consume_skb(skb);
-               skb = NULL;
-
-               /* Update number of frames. */
-               nfrms++;
-       }
-
-       /* Unused length fields should be zero-filled (according to SPEC). */
-       while (nfrms < CFHSI_MAX_PKTS) {
-               desc->cffrm_len[nfrms] = 0x0000;
-               nfrms++;
-       }
-
-       /* Check if we can piggy-back another descriptor. */
-       if (cfhsi_can_send_aggregate(cfhsi))
-               desc->header |= CFHSI_PIGGY_DESC;
-       else
-               desc->header &= ~CFHSI_PIGGY_DESC;
-
-       return CFHSI_DESC_SZ + pld_len;
-}
-
-static void cfhsi_start_tx(struct cfhsi *cfhsi)
-{
-       struct cfhsi_desc *desc = (struct cfhsi_desc *)cfhsi->tx_buf;
-       int len, res;
-
-       netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       do {
-               /* Create HSI frame. */
-               len = cfhsi_tx_frm(desc, cfhsi);
-               if (!len) {
-                       spin_lock_bh(&cfhsi->lock);
-                       if (unlikely(cfhsi_tx_queue_len(cfhsi))) {
-                               spin_unlock_bh(&cfhsi->lock);
-                               res = -EAGAIN;
-                               continue;
-                       }
-                       cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-                       /* Start inactivity timer. */
-                       mod_timer(&cfhsi->inactivity_timer,
-                               jiffies + cfhsi->cfg.inactivity_timeout);
-                       spin_unlock_bh(&cfhsi->lock);
-                       break;
-               }
-
-               /* Set up new transfer. */
-               res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-               if (WARN_ON(res < 0))
-                       netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-                               __func__, res);
-       } while (res < 0);
-}
-
-static void cfhsi_tx_done(struct cfhsi *cfhsi)
-{
-       netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       /*
-        * Send flow on if flow off has been previously signalled
-        * and number of packets is below low water mark.
-        */
-       spin_lock_bh(&cfhsi->lock);
-       if (cfhsi->flow_off_sent &&
-                       cfhsi_tx_queue_len(cfhsi) <= cfhsi->cfg.q_low_mark &&
-                       cfhsi->cfdev.flowctrl) {
-
-               cfhsi->flow_off_sent = 0;
-               cfhsi->cfdev.flowctrl(cfhsi->ndev, ON);
-       }
-
-       if (cfhsi_can_send_aggregate(cfhsi)) {
-               spin_unlock_bh(&cfhsi->lock);
-               cfhsi_start_tx(cfhsi);
-       } else {
-               mod_timer(&cfhsi->aggregation_timer,
-                       jiffies + cfhsi->cfg.aggregation_timeout);
-               spin_unlock_bh(&cfhsi->lock);
-       }
-
-       return;
-}
-
-static void cfhsi_tx_done_cb(struct cfhsi_cb_ops *cb_ops)
-{
-       struct cfhsi *cfhsi;
-
-       cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-       cfhsi_tx_done(cfhsi);
-}
-
-static int cfhsi_rx_desc(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-       int xfer_sz = 0;
-       int nfrms = 0;
-       u16 *plen = NULL;
-       u8 *pfrm = NULL;
-
-       if ((desc->header & ~CFHSI_PIGGY_DESC) ||
-                       (desc->offset > CFHSI_MAX_EMB_FRM_SZ)) {
-               netdev_err(cfhsi->ndev, "%s: Invalid descriptor.\n",
-                       __func__);
-               return -EPROTO;
-       }
-
-       /* Check for embedded CAIF frame. */
-       if (desc->offset) {
-               struct sk_buff *skb;
-               int len = 0;
-               pfrm = ((u8 *)desc) + desc->offset;
-
-               /* Remove offset padding. */
-               pfrm += *pfrm + 1;
-
-               /* Read length of CAIF frame (little endian). */
-               len = *pfrm;
-               len |= ((*(pfrm+1)) << 8) & 0xFF00;
-               len += 2;       /* Add FCS fields. */
-
-               /* Sanity check length of CAIF frame. */
-               if (unlikely(len > CFHSI_MAX_CAIF_FRAME_SZ)) {
-                       netdev_err(cfhsi->ndev, "%s: Invalid length.\n",
-                               __func__);
-                       return -EPROTO;
-               }
-
-               /* Allocate SKB (OK even in IRQ context). */
-               skb = alloc_skb(len + 1, GFP_ATOMIC);
-               if (!skb) {
-                       netdev_err(cfhsi->ndev, "%s: Out of memory !\n",
-                               __func__);
-                       return -ENOMEM;
-               }
-               caif_assert(skb != NULL);
-
-               skb_put_data(skb, pfrm, len);
-
-               skb->protocol = htons(ETH_P_CAIF);
-               skb_reset_mac_header(skb);
-               skb->dev = cfhsi->ndev;
-
-               netif_rx_any_context(skb);
-
-               /* Update network statistics. */
-               cfhsi->ndev->stats.rx_packets++;
-               cfhsi->ndev->stats.rx_bytes += len;
-       }
-
-       /* Calculate transfer length. */
-       plen = desc->cffrm_len;
-       while (nfrms < CFHSI_MAX_PKTS && *plen) {
-               xfer_sz += *plen;
-               plen++;
-               nfrms++;
-       }
-
-       /* Check for piggy-backed descriptor. */
-       if (desc->header & CFHSI_PIGGY_DESC)
-               xfer_sz += CFHSI_DESC_SZ;
-
-       if ((xfer_sz % 4) || (xfer_sz > (CFHSI_BUF_SZ_RX - CFHSI_DESC_SZ))) {
-               netdev_err(cfhsi->ndev,
-                               "%s: Invalid payload len: %d, ignored.\n",
-                       __func__, xfer_sz);
-               return -EPROTO;
-       }
-       return xfer_sz;
-}
-
-static int cfhsi_rx_desc_len(struct cfhsi_desc *desc)
-{
-       int xfer_sz = 0;
-       int nfrms = 0;
-       u16 *plen;
-
-       if ((desc->header & ~CFHSI_PIGGY_DESC) ||
-                       (desc->offset > CFHSI_MAX_EMB_FRM_SZ)) {
-
-               pr_err("Invalid descriptor. %x %x\n", desc->header,
-                               desc->offset);
-               return -EPROTO;
-       }
-
-       /* Calculate transfer length. */
-       plen = desc->cffrm_len;
-       while (nfrms < CFHSI_MAX_PKTS && *plen) {
-               xfer_sz += *plen;
-               plen++;
-               nfrms++;
-       }
-
-       if (xfer_sz % 4) {
-               pr_err("Invalid payload len: %d, ignored.\n", xfer_sz);
-               return -EPROTO;
-       }
-       return xfer_sz;
-}
-
-static int cfhsi_rx_pld(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-       int rx_sz = 0;
-       int nfrms = 0;
-       u16 *plen = NULL;
-       u8 *pfrm = NULL;
-
-       /* Sanity check header and offset. */
-       if (WARN_ON((desc->header & ~CFHSI_PIGGY_DESC) ||
-                       (desc->offset > CFHSI_MAX_EMB_FRM_SZ))) {
-               netdev_err(cfhsi->ndev, "%s: Invalid descriptor.\n",
-                       __func__);
-               return -EPROTO;
-       }
-
-       /* Set frame pointer to start of payload. */
-       pfrm = desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ;
-       plen = desc->cffrm_len;
-
-       /* Skip already processed frames. */
-       while (nfrms < cfhsi->rx_state.nfrms) {
-               pfrm += *plen;
-               rx_sz += *plen;
-               plen++;
-               nfrms++;
-       }
-
-       /* Parse payload. */
-       while (nfrms < CFHSI_MAX_PKTS && *plen) {
-               struct sk_buff *skb;
-               u8 *pcffrm = NULL;
-               int len;
-
-               /* CAIF frame starts after head padding. */
-               pcffrm = pfrm + *pfrm + 1;
-
-               /* Read length of CAIF frame (little endian). */
-               len = *pcffrm;
-               len |= ((*(pcffrm + 1)) << 8) & 0xFF00;
-               len += 2;       /* Add FCS fields. */
-
-               /* Sanity check length of CAIF frames. */
-               if (unlikely(len > CFHSI_MAX_CAIF_FRAME_SZ)) {
-                       netdev_err(cfhsi->ndev, "%s: Invalid length.\n",
-                               __func__);
-                       return -EPROTO;
-               }
-
-               /* Allocate SKB (OK even in IRQ context). */
-               skb = alloc_skb(len + 1, GFP_ATOMIC);
-               if (!skb) {
-                       netdev_err(cfhsi->ndev, "%s: Out of memory !\n",
-                               __func__);
-                       cfhsi->rx_state.nfrms = nfrms;
-                       return -ENOMEM;
-               }
-               caif_assert(skb != NULL);
-
-               skb_put_data(skb, pcffrm, len);
-
-               skb->protocol = htons(ETH_P_CAIF);
-               skb_reset_mac_header(skb);
-               skb->dev = cfhsi->ndev;
-
-               netif_rx_any_context(skb);
-
-               /* Update network statistics. */
-               cfhsi->ndev->stats.rx_packets++;
-               cfhsi->ndev->stats.rx_bytes += len;
-
-               pfrm += *plen;
-               rx_sz += *plen;
-               plen++;
-               nfrms++;
-       }
-
-       return rx_sz;
-}
-
-static void cfhsi_rx_done(struct cfhsi *cfhsi)
-{
-       int res;
-       int desc_pld_len = 0, rx_len, rx_state;
-       struct cfhsi_desc *desc = NULL;
-       u8 *rx_ptr, *rx_buf;
-       struct cfhsi_desc *piggy_desc = NULL;
-
-       desc = (struct cfhsi_desc *)cfhsi->rx_buf;
-
-       netdev_dbg(cfhsi->ndev, "%s\n", __func__);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       /* Update inactivity timer if pending. */
-       spin_lock_bh(&cfhsi->lock);
-       mod_timer_pending(&cfhsi->inactivity_timer,
-                       jiffies + cfhsi->cfg.inactivity_timeout);
-       spin_unlock_bh(&cfhsi->lock);
-
-       if (cfhsi->rx_state.state == CFHSI_RX_STATE_DESC) {
-               desc_pld_len = cfhsi_rx_desc_len(desc);
-
-               if (desc_pld_len < 0)
-                       goto out_of_sync;
-
-               rx_buf = cfhsi->rx_buf;
-               rx_len = desc_pld_len;
-               if (desc_pld_len > 0 && (desc->header & CFHSI_PIGGY_DESC))
-                       rx_len += CFHSI_DESC_SZ;
-               if (desc_pld_len == 0)
-                       rx_buf = cfhsi->rx_flip_buf;
-       } else {
-               rx_buf = cfhsi->rx_flip_buf;
-
-               rx_len = CFHSI_DESC_SZ;
-               if (cfhsi->rx_state.pld_len > 0 &&
-                               (desc->header & CFHSI_PIGGY_DESC)) {
-
-                       piggy_desc = (struct cfhsi_desc *)
-                               (desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ +
-                                               cfhsi->rx_state.pld_len);
-
-                       cfhsi->rx_state.piggy_desc = true;
-
-                       /* Extract payload len from piggy-backed descriptor. */
-                       desc_pld_len = cfhsi_rx_desc_len(piggy_desc);
-                       if (desc_pld_len < 0)
-                               goto out_of_sync;
-
-                       if (desc_pld_len > 0) {
-                               rx_len = desc_pld_len;
-                               if (piggy_desc->header & CFHSI_PIGGY_DESC)
-                                       rx_len += CFHSI_DESC_SZ;
-                       }
-
-                       /*
-                        * Copy needed information from the piggy-backed
-                        * descriptor to the descriptor in the start.
-                        */
-                       memcpy(rx_buf, (u8 *)piggy_desc,
-                                       CFHSI_DESC_SHORT_SZ);
-               }
-       }
-
-       if (desc_pld_len) {
-               rx_state = CFHSI_RX_STATE_PAYLOAD;
-               rx_ptr = rx_buf + CFHSI_DESC_SZ;
-       } else {
-               rx_state = CFHSI_RX_STATE_DESC;
-               rx_ptr = rx_buf;
-               rx_len = CFHSI_DESC_SZ;
-       }
-
-       /* Initiate next read */
-       if (test_bit(CFHSI_AWAKE, &cfhsi->bits)) {
-               /* Set up new transfer. */
-               netdev_dbg(cfhsi->ndev, "%s: Start RX.\n",
-                               __func__);
-
-               res = cfhsi->ops->cfhsi_rx(rx_ptr, rx_len,
-                               cfhsi->ops);
-               if (WARN_ON(res < 0)) {
-                       netdev_err(cfhsi->ndev, "%s: RX error %d.\n",
-                               __func__, res);
-                       cfhsi->ndev->stats.rx_errors++;
-                       cfhsi->ndev->stats.rx_dropped++;
-               }
-       }
-
-       if (cfhsi->rx_state.state == CFHSI_RX_STATE_DESC) {
-               /* Extract payload from descriptor */
-               if (cfhsi_rx_desc(desc, cfhsi) < 0)
-                       goto out_of_sync;
-       } else {
-               /* Extract payload */
-               if (cfhsi_rx_pld(desc, cfhsi) < 0)
-                       goto out_of_sync;
-               if (piggy_desc) {
-                       /* Extract any payload in piggyback descriptor. */
-                       if (cfhsi_rx_desc(piggy_desc, cfhsi) < 0)
-                               goto out_of_sync;
-                       /* Mark no embedded frame after extracting it */
-                       piggy_desc->offset = 0;
-               }
-       }
-
-       /* Update state info */
-       memset(&cfhsi->rx_state, 0, sizeof(cfhsi->rx_state));
-       cfhsi->rx_state.state = rx_state;
-       cfhsi->rx_ptr = rx_ptr;
-       cfhsi->rx_len = rx_len;
-       cfhsi->rx_state.pld_len = desc_pld_len;
-       cfhsi->rx_state.piggy_desc = desc->header & CFHSI_PIGGY_DESC;
-
-       if (rx_buf != cfhsi->rx_buf)
-               swap(cfhsi->rx_buf, cfhsi->rx_flip_buf);
-       return;
-
-out_of_sync:
-       netdev_err(cfhsi->ndev, "%s: Out of sync.\n", __func__);
-       print_hex_dump_bytes("--> ", DUMP_PREFIX_NONE,
-                       cfhsi->rx_buf, CFHSI_DESC_SZ);
-       schedule_work(&cfhsi->out_of_sync_work);
-}
-
-static void cfhsi_rx_slowpath(struct timer_list *t)
-{
-       struct cfhsi *cfhsi = from_timer(cfhsi, t, rx_slowpath_timer);
-
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       cfhsi_rx_done(cfhsi);
-}
-
-static void cfhsi_rx_done_cb(struct cfhsi_cb_ops *cb_ops)
-{
-       struct cfhsi *cfhsi;
-
-       cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       if (test_and_clear_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits))
-               wake_up_interruptible(&cfhsi->flush_fifo_wait);
-       else
-               cfhsi_rx_done(cfhsi);
-}
-
-static void cfhsi_wake_up(struct work_struct *work)
-{
-       struct cfhsi *cfhsi = NULL;
-       int res;
-       int len;
-       long ret;
-
-       cfhsi = container_of(work, struct cfhsi, wake_up_work);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       if (unlikely(test_bit(CFHSI_AWAKE, &cfhsi->bits))) {
-               /* It happenes when wakeup is requested by
-                * both ends at the same time. */
-               clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-               clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-               return;
-       }
-
-       /* Activate wake line. */
-       cfhsi->ops->cfhsi_wake_up(cfhsi->ops);
-
-       netdev_dbg(cfhsi->ndev, "%s: Start waiting.\n",
-               __func__);
-
-       /* Wait for acknowledge. */
-       ret = CFHSI_WAKE_TOUT;
-       ret = wait_event_interruptible_timeout(cfhsi->wake_up_wait,
-                                       test_and_clear_bit(CFHSI_WAKE_UP_ACK,
-                                                       &cfhsi->bits), ret);
-       if (unlikely(ret < 0)) {
-               /* Interrupted by signal. */
-               netdev_err(cfhsi->ndev, "%s: Signalled: %ld.\n",
-                       __func__, ret);
-
-               clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-               cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-               return;
-       } else if (!ret) {
-               bool ca_wake = false;
-               size_t fifo_occupancy = 0;
-
-               /* Wakeup timeout */
-               netdev_dbg(cfhsi->ndev, "%s: Timeout.\n",
-                       __func__);
-
-               /* Check FIFO to check if modem has sent something. */
-               WARN_ON(cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-                                       &fifo_occupancy));
-
-               netdev_dbg(cfhsi->ndev, "%s: Bytes in FIFO: %u.\n",
-                               __func__, (unsigned) fifo_occupancy);
-
-               /* Check if we misssed the interrupt. */
-               WARN_ON(cfhsi->ops->cfhsi_get_peer_wake(cfhsi->ops,
-                                                       &ca_wake));
-
-               if (ca_wake) {
-                       netdev_err(cfhsi->ndev, "%s: CA Wake missed !.\n",
-                               __func__);
-
-                       /* Clear the CFHSI_WAKE_UP_ACK bit to prevent race. */
-                       clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-
-                       /* Continue execution. */
-                       goto wake_ack;
-               }
-
-               clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-               cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-               return;
-       }
-wake_ack:
-       netdev_dbg(cfhsi->ndev, "%s: Woken.\n",
-               __func__);
-
-       /* Clear power up bit. */
-       set_bit(CFHSI_AWAKE, &cfhsi->bits);
-       clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-
-       /* Resume read operation. */
-       netdev_dbg(cfhsi->ndev, "%s: Start RX.\n", __func__);
-       res = cfhsi->ops->cfhsi_rx(cfhsi->rx_ptr, cfhsi->rx_len, cfhsi->ops);
-
-       if (WARN_ON(res < 0))
-               netdev_err(cfhsi->ndev, "%s: RX err %d.\n", __func__, res);
-
-       /* Clear power up acknowledment. */
-       clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-
-       spin_lock_bh(&cfhsi->lock);
-
-       /* Resume transmit if queues are not empty. */
-       if (!cfhsi_tx_queue_len(cfhsi)) {
-               netdev_dbg(cfhsi->ndev, "%s: Peer wake, start timer.\n",
-                       __func__);
-               /* Start inactivity timer. */
-               mod_timer(&cfhsi->inactivity_timer,
-                               jiffies + cfhsi->cfg.inactivity_timeout);
-               spin_unlock_bh(&cfhsi->lock);
-               return;
-       }
-
-       netdev_dbg(cfhsi->ndev, "%s: Host wake.\n",
-               __func__);
-
-       spin_unlock_bh(&cfhsi->lock);
-
-       /* Create HSI frame. */
-       len = cfhsi_tx_frm((struct cfhsi_desc *)cfhsi->tx_buf, cfhsi);
-
-       if (likely(len > 0)) {
-               /* Set up new transfer. */
-               res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-               if (WARN_ON(res < 0)) {
-                       netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-                               __func__, res);
-                       cfhsi_abort_tx(cfhsi);
-               }
-       } else {
-               netdev_err(cfhsi->ndev,
-                               "%s: Failed to create HSI frame: %d.\n",
-                               __func__, len);
-       }
-}
-
-static void cfhsi_wake_down(struct work_struct *work)
-{
-       long ret;
-       struct cfhsi *cfhsi = NULL;
-       size_t fifo_occupancy = 0;
-       int retry = CFHSI_WAKE_TOUT;
-
-       cfhsi = container_of(work, struct cfhsi, wake_down_work);
-       netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       /* Deactivate wake line. */
-       cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-
-       /* Wait for acknowledge. */
-       ret = CFHSI_WAKE_TOUT;
-       ret = wait_event_interruptible_timeout(cfhsi->wake_down_wait,
-                                       test_and_clear_bit(CFHSI_WAKE_DOWN_ACK,
-                                                       &cfhsi->bits), ret);
-       if (ret < 0) {
-               /* Interrupted by signal. */
-               netdev_err(cfhsi->ndev, "%s: Signalled: %ld.\n",
-                       __func__, ret);
-               return;
-       } else if (!ret) {
-               bool ca_wake = true;
-
-               /* Timeout */
-               netdev_err(cfhsi->ndev, "%s: Timeout.\n", __func__);
-
-               /* Check if we misssed the interrupt. */
-               WARN_ON(cfhsi->ops->cfhsi_get_peer_wake(cfhsi->ops,
-                                                       &ca_wake));
-               if (!ca_wake)
-                       netdev_err(cfhsi->ndev, "%s: CA Wake missed !.\n",
-                               __func__);
-       }
-
-       /* Check FIFO occupancy. */
-       while (retry) {
-               WARN_ON(cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-                                                       &fifo_occupancy));
-
-               if (!fifo_occupancy)
-                       break;
-
-               set_current_state(TASK_INTERRUPTIBLE);
-               schedule_timeout(1);
-               retry--;
-       }
-
-       if (!retry)
-               netdev_err(cfhsi->ndev, "%s: FIFO Timeout.\n", __func__);
-
-       /* Clear AWAKE condition. */
-       clear_bit(CFHSI_AWAKE, &cfhsi->bits);
-
-       /* Cancel pending RX requests. */
-       cfhsi->ops->cfhsi_rx_cancel(cfhsi->ops);
-}
-
-static void cfhsi_out_of_sync(struct work_struct *work)
-{
-       struct cfhsi *cfhsi = NULL;
-
-       cfhsi = container_of(work, struct cfhsi, out_of_sync_work);
-
-       rtnl_lock();
-       dev_close(cfhsi->ndev);
-       rtnl_unlock();
-}
-
-static void cfhsi_wake_up_cb(struct cfhsi_cb_ops *cb_ops)
-{
-       struct cfhsi *cfhsi = NULL;
-
-       cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       set_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-       wake_up_interruptible(&cfhsi->wake_up_wait);
-
-       if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-               return;
-
-       /* Schedule wake up work queue if the peer initiates. */
-       if (!test_and_set_bit(CFHSI_WAKE_UP, &cfhsi->bits))
-               queue_work(cfhsi->wq, &cfhsi->wake_up_work);
-}
-
-static void cfhsi_wake_down_cb(struct cfhsi_cb_ops *cb_ops)
-{
-       struct cfhsi *cfhsi = NULL;
-
-       cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       /* Initiating low power is only permitted by the host (us). */
-       set_bit(CFHSI_WAKE_DOWN_ACK, &cfhsi->bits);
-       wake_up_interruptible(&cfhsi->wake_down_wait);
-}
-
-static void cfhsi_aggregation_tout(struct timer_list *t)
-{
-       struct cfhsi *cfhsi = from_timer(cfhsi, t, aggregation_timer);
-
-       netdev_dbg(cfhsi->ndev, "%s.\n",
-               __func__);
-
-       cfhsi_start_tx(cfhsi);
-}
-
-static netdev_tx_t cfhsi_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-       struct cfhsi *cfhsi = NULL;
-       int start_xfer = 0;
-       int timer_active;
-       int prio;
-
-       if (!dev)
-               return -EINVAL;
-
-       cfhsi = netdev_priv(dev);
-
-       switch (skb->priority) {
-       case TC_PRIO_BESTEFFORT:
-       case TC_PRIO_FILLER:
-       case TC_PRIO_BULK:
-               prio = CFHSI_PRIO_BEBK;
-               break;
-       case TC_PRIO_INTERACTIVE_BULK:
-               prio = CFHSI_PRIO_VI;
-               break;
-       case TC_PRIO_INTERACTIVE:
-               prio = CFHSI_PRIO_VO;
-               break;
-       case TC_PRIO_CONTROL:
-       default:
-               prio = CFHSI_PRIO_CTL;
-               break;
-       }
-
-       spin_lock_bh(&cfhsi->lock);
-
-       /* Update aggregation statistics  */
-       cfhsi_update_aggregation_stats(cfhsi, skb, 1);
-
-       /* Queue the SKB */
-       skb_queue_tail(&cfhsi->qhead[prio], skb);
-
-       /* Sanity check; xmit should not be called after unregister_netdev */
-       if (WARN_ON(test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))) {
-               spin_unlock_bh(&cfhsi->lock);
-               cfhsi_abort_tx(cfhsi);
-               return -EINVAL;
-       }
-
-       /* Send flow off if number of packets is above high water mark. */
-       if (!cfhsi->flow_off_sent &&
-               cfhsi_tx_queue_len(cfhsi) > cfhsi->cfg.q_high_mark &&
-               cfhsi->cfdev.flowctrl) {
-               cfhsi->flow_off_sent = 1;
-               cfhsi->cfdev.flowctrl(cfhsi->ndev, OFF);
-       }
-
-       if (cfhsi->tx_state == CFHSI_TX_STATE_IDLE) {
-               cfhsi->tx_state = CFHSI_TX_STATE_XFER;
-               start_xfer = 1;
-       }
-
-       if (!start_xfer) {
-               /* Send aggregate if it is possible */
-               bool aggregate_ready =
-                       cfhsi_can_send_aggregate(cfhsi) &&
-                       del_timer(&cfhsi->aggregation_timer) > 0;
-               spin_unlock_bh(&cfhsi->lock);
-               if (aggregate_ready)
-                       cfhsi_start_tx(cfhsi);
-               return NETDEV_TX_OK;
-       }
-
-       /* Delete inactivity timer if started. */
-       timer_active = del_timer_sync(&cfhsi->inactivity_timer);
-
-       spin_unlock_bh(&cfhsi->lock);
-
-       if (timer_active) {
-               struct cfhsi_desc *desc = (struct cfhsi_desc *)cfhsi->tx_buf;
-               int len;
-               int res;
-
-               /* Create HSI frame. */
-               len = cfhsi_tx_frm(desc, cfhsi);
-               WARN_ON(!len);
-
-               /* Set up new transfer. */
-               res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-               if (WARN_ON(res < 0)) {
-                       netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-                               __func__, res);
-                       cfhsi_abort_tx(cfhsi);
-               }
-       } else {
-               /* Schedule wake up work queue if the we initiate. */
-               if (!test_and_set_bit(CFHSI_WAKE_UP, &cfhsi->bits))
-                       queue_work(cfhsi->wq, &cfhsi->wake_up_work);
-       }
-
-       return NETDEV_TX_OK;
-}
-
-static const struct net_device_ops cfhsi_netdevops;
-
-static void cfhsi_setup(struct net_device *dev)
-{
-       int i;
-       struct cfhsi *cfhsi = netdev_priv(dev);
-       dev->features = 0;
-       dev->type = ARPHRD_CAIF;
-       dev->flags = IFF_POINTOPOINT | IFF_NOARP;
-       dev->mtu = CFHSI_MAX_CAIF_FRAME_SZ;
-       dev->priv_flags |= IFF_NO_QUEUE;
-       dev->needs_free_netdev = true;
-       dev->netdev_ops = &cfhsi_netdevops;
-       for (i = 0; i < CFHSI_PRIO_LAST; ++i)
-               skb_queue_head_init(&cfhsi->qhead[i]);
-       cfhsi->cfdev.link_select = CAIF_LINK_HIGH_BANDW;
-       cfhsi->cfdev.use_frag = false;
-       cfhsi->cfdev.use_stx = false;
-       cfhsi->cfdev.use_fcs = false;
-       cfhsi->ndev = dev;
-       cfhsi->cfg = hsi_default_config;
-}
-
-static int cfhsi_open(struct net_device *ndev)
-{
-       struct cfhsi *cfhsi = netdev_priv(ndev);
-       int res;
-
-       clear_bit(CFHSI_SHUTDOWN, &cfhsi->bits);
-
-       /* Initialize state vaiables. */
-       cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-       cfhsi->rx_state.state = CFHSI_RX_STATE_DESC;
-
-       /* Set flow info */
-       cfhsi->flow_off_sent = 0;
-
-       /*
-        * Allocate a TX buffer with the size of a HSI packet descriptors
-        * and the necessary room for CAIF payload frames.
-        */
-       cfhsi->tx_buf = kzalloc(CFHSI_BUF_SZ_TX, GFP_KERNEL);
-       if (!cfhsi->tx_buf) {
-               res = -ENODEV;
-               goto err_alloc_tx;
-       }
-
-       /*
-        * Allocate a RX buffer with the size of two HSI packet descriptors and
-        * the necessary room for CAIF payload frames.
-        */
-       cfhsi->rx_buf = kzalloc(CFHSI_BUF_SZ_RX, GFP_KERNEL);
-       if (!cfhsi->rx_buf) {
-               res = -ENODEV;
-               goto err_alloc_rx;
-       }
-
-       cfhsi->rx_flip_buf = kzalloc(CFHSI_BUF_SZ_RX, GFP_KERNEL);
-       if (!cfhsi->rx_flip_buf) {
-               res = -ENODEV;
-               goto err_alloc_rx_flip;
-       }
-
-       /* Initialize aggregation timeout */
-       cfhsi->cfg.aggregation_timeout = hsi_default_config.aggregation_timeout;
-
-       /* Initialize recieve vaiables. */
-       cfhsi->rx_ptr = cfhsi->rx_buf;
-       cfhsi->rx_len = CFHSI_DESC_SZ;
-
-       /* Initialize spin locks. */
-       spin_lock_init(&cfhsi->lock);
-
-       /* Set up the driver. */
-       cfhsi->cb_ops.tx_done_cb = cfhsi_tx_done_cb;
-       cfhsi->cb_ops.rx_done_cb = cfhsi_rx_done_cb;
-       cfhsi->cb_ops.wake_up_cb = cfhsi_wake_up_cb;
-       cfhsi->cb_ops.wake_down_cb = cfhsi_wake_down_cb;
-
-       /* Initialize the work queues. */
-       INIT_WORK(&cfhsi->wake_up_work, cfhsi_wake_up);
-       INIT_WORK(&cfhsi->wake_down_work, cfhsi_wake_down);
-       INIT_WORK(&cfhsi->out_of_sync_work, cfhsi_out_of_sync);
-
-       /* Clear all bit fields. */
-       clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-       clear_bit(CFHSI_WAKE_DOWN_ACK, &cfhsi->bits);
-       clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-       clear_bit(CFHSI_AWAKE, &cfhsi->bits);
-
-       /* Create work thread. */
-       cfhsi->wq = alloc_ordered_workqueue(cfhsi->ndev->name, WQ_MEM_RECLAIM);
-       if (!cfhsi->wq) {
-               netdev_err(cfhsi->ndev, "%s: Failed to create work queue.\n",
-                       __func__);
-               res = -ENODEV;
-               goto err_create_wq;
-       }
-
-       /* Initialize wait queues. */
-       init_waitqueue_head(&cfhsi->wake_up_wait);
-       init_waitqueue_head(&cfhsi->wake_down_wait);
-       init_waitqueue_head(&cfhsi->flush_fifo_wait);
-
-       /* Setup the inactivity timer. */
-       timer_setup(&cfhsi->inactivity_timer, cfhsi_inactivity_tout, 0);
-       /* Setup the slowpath RX timer. */
-       timer_setup(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath, 0);
-       /* Setup the aggregation timer. */
-       timer_setup(&cfhsi->aggregation_timer, cfhsi_aggregation_tout, 0);
-
-       /* Activate HSI interface. */
-       res = cfhsi->ops->cfhsi_up(cfhsi->ops);
-       if (res) {
-               netdev_err(cfhsi->ndev,
-                       "%s: can't activate HSI interface: %d.\n",
-                       __func__, res);
-               goto err_activate;
-       }
-
-       /* Flush FIFO */
-       res = cfhsi_flush_fifo(cfhsi);
-       if (res) {
-               netdev_err(cfhsi->ndev, "%s: Can't flush FIFO: %d.\n",
-                       __func__, res);
-               goto err_net_reg;
-       }
-       return res;
-
- err_net_reg:
-       cfhsi->ops->cfhsi_down(cfhsi->ops);
- err_activate:
-       destroy_workqueue(cfhsi->wq);
- err_create_wq:
-       kfree(cfhsi->rx_flip_buf);
- err_alloc_rx_flip:
-       kfree(cfhsi->rx_buf);
- err_alloc_rx:
-       kfree(cfhsi->tx_buf);
- err_alloc_tx:
-       return res;
-}
-
-static int cfhsi_close(struct net_device *ndev)
-{
-       struct cfhsi *cfhsi = netdev_priv(ndev);
-       u8 *tx_buf, *rx_buf, *flip_buf;
-
-       /* going to shutdown driver */
-       set_bit(CFHSI_SHUTDOWN, &cfhsi->bits);
-
-       /* Delete timers if pending */
-       del_timer_sync(&cfhsi->inactivity_timer);
-       del_timer_sync(&cfhsi->rx_slowpath_timer);
-       del_timer_sync(&cfhsi->aggregation_timer);
-
-       /* Cancel pending RX request (if any) */
-       cfhsi->ops->cfhsi_rx_cancel(cfhsi->ops);
-
-       /* Destroy workqueue */
-       destroy_workqueue(cfhsi->wq);
-
-       /* Store bufferes: will be freed later. */
-       tx_buf = cfhsi->tx_buf;
-       rx_buf = cfhsi->rx_buf;
-       flip_buf = cfhsi->rx_flip_buf;
-       /* Flush transmit queues. */
-       cfhsi_abort_tx(cfhsi);
-
-       /* Deactivate interface */
-       cfhsi->ops->cfhsi_down(cfhsi->ops);
-
-       /* Free buffers. */
-       kfree(tx_buf);
-       kfree(rx_buf);
-       kfree(flip_buf);
-       return 0;
-}
-
-static void cfhsi_uninit(struct net_device *dev)
-{
-       struct cfhsi *cfhsi = netdev_priv(dev);
-       ASSERT_RTNL();
-       symbol_put(cfhsi_get_device);
-       list_del(&cfhsi->list);
-}
-
-static const struct net_device_ops cfhsi_netdevops = {
-       .ndo_uninit = cfhsi_uninit,
-       .ndo_open = cfhsi_open,
-       .ndo_stop = cfhsi_close,
-       .ndo_start_xmit = cfhsi_xmit
-};
-
-static void cfhsi_netlink_parms(struct nlattr *data[], struct cfhsi *cfhsi)
-{
-       int i;
-
-       if (!data) {
-               pr_debug("no params data found\n");
-               return;
-       }
-
-       i = __IFLA_CAIF_HSI_INACTIVITY_TOUT;
-       /*
-        * Inactivity timeout in millisecs. Lowest possible value is 1,
-        * and highest possible is NEXT_TIMER_MAX_DELTA.
-        */
-       if (data[i]) {
-               u32 inactivity_timeout = nla_get_u32(data[i]);
-               /* Pre-calculate inactivity timeout. */
-               cfhsi->cfg.inactivity_timeout = inactivity_timeout * HZ / 1000;
-               if (cfhsi->cfg.inactivity_timeout == 0)
-                       cfhsi->cfg.inactivity_timeout = 1;
-               else if (cfhsi->cfg.inactivity_timeout > NEXT_TIMER_MAX_DELTA)
-                       cfhsi->cfg.inactivity_timeout = NEXT_TIMER_MAX_DELTA;
-       }
-
-       i = __IFLA_CAIF_HSI_AGGREGATION_TOUT;
-       if (data[i])
-               cfhsi->cfg.aggregation_timeout = nla_get_u32(data[i]);
-
-       i = __IFLA_CAIF_HSI_HEAD_ALIGN;
-       if (data[i])
-               cfhsi->cfg.head_align = nla_get_u32(data[i]);
-
-       i = __IFLA_CAIF_HSI_TAIL_ALIGN;
-       if (data[i])
-               cfhsi->cfg.tail_align = nla_get_u32(data[i]);
-
-       i = __IFLA_CAIF_HSI_QHIGH_WATERMARK;
-       if (data[i])
-               cfhsi->cfg.q_high_mark = nla_get_u32(data[i]);
-
-       i = __IFLA_CAIF_HSI_QLOW_WATERMARK;
-       if (data[i])
-               cfhsi->cfg.q_low_mark = nla_get_u32(data[i]);
-}
-
-static int caif_hsi_changelink(struct net_device *dev, struct nlattr *tb[],
-                              struct nlattr *data[],
-                              struct netlink_ext_ack *extack)
-{
-       cfhsi_netlink_parms(data, netdev_priv(dev));
-       netdev_state_change(dev);
-       return 0;
-}
-
-static const struct nla_policy caif_hsi_policy[__IFLA_CAIF_HSI_MAX + 1] = {
-       [__IFLA_CAIF_HSI_INACTIVITY_TOUT] = { .type = NLA_U32, .len = 4 },
-       [__IFLA_CAIF_HSI_AGGREGATION_TOUT] = { .type = NLA_U32, .len = 4 },
-       [__IFLA_CAIF_HSI_HEAD_ALIGN] = { .type = NLA_U32, .len = 4 },
-       [__IFLA_CAIF_HSI_TAIL_ALIGN] = { .type = NLA_U32, .len = 4 },
-       [__IFLA_CAIF_HSI_QHIGH_WATERMARK] = { .type = NLA_U32, .len = 4 },
-       [__IFLA_CAIF_HSI_QLOW_WATERMARK] = { .type = NLA_U32, .len = 4 },
-};
-
-static size_t caif_hsi_get_size(const struct net_device *dev)
-{
-       int i;
-       size_t s = 0;
-       for (i = __IFLA_CAIF_HSI_UNSPEC + 1; i < __IFLA_CAIF_HSI_MAX; i++)
-               s += nla_total_size(caif_hsi_policy[i].len);
-       return s;
-}
-
-static int caif_hsi_fill_info(struct sk_buff *skb, const struct net_device *dev)
-{
-       struct cfhsi *cfhsi = netdev_priv(dev);
-
-       if (nla_put_u32(skb, __IFLA_CAIF_HSI_INACTIVITY_TOUT,
-                       cfhsi->cfg.inactivity_timeout) ||
-           nla_put_u32(skb, __IFLA_CAIF_HSI_AGGREGATION_TOUT,
-                       cfhsi->cfg.aggregation_timeout) ||
-           nla_put_u32(skb, __IFLA_CAIF_HSI_HEAD_ALIGN,
-                       cfhsi->cfg.head_align) ||
-           nla_put_u32(skb, __IFLA_CAIF_HSI_TAIL_ALIGN,
-                       cfhsi->cfg.tail_align) ||
-           nla_put_u32(skb, __IFLA_CAIF_HSI_QHIGH_WATERMARK,
-                       cfhsi->cfg.q_high_mark) ||
-           nla_put_u32(skb, __IFLA_CAIF_HSI_QLOW_WATERMARK,
-                       cfhsi->cfg.q_low_mark))
-               return -EMSGSIZE;
-
-       return 0;
-}
-
-static int caif_hsi_newlink(struct net *src_net, struct net_device *dev,
-                           struct nlattr *tb[], struct nlattr *data[],
-                           struct netlink_ext_ack *extack)
-{
-       struct cfhsi *cfhsi = NULL;
-       struct cfhsi_ops *(*get_ops)(void);
-
-       ASSERT_RTNL();
-
-       cfhsi = netdev_priv(dev);
-       cfhsi_netlink_parms(data, cfhsi);
-
-       get_ops = symbol_get(cfhsi_get_ops);
-       if (!get_ops) {
-               pr_err("%s: failed to get the cfhsi_ops\n", __func__);
-               return -ENODEV;
-       }
-
-       /* Assign the HSI device. */
-       cfhsi->ops = (*get_ops)();
-       if (!cfhsi->ops) {
-               pr_err("%s: failed to get the cfhsi_ops\n", __func__);
-               goto err;
-       }
-
-       /* Assign the driver to this HSI device. */
-       cfhsi->ops->cb_ops = &cfhsi->cb_ops;
-       if (register_netdevice(dev)) {
-               pr_warn("%s: caif_hsi device registration failed\n", __func__);
-               goto err;
-       }
-       /* Add CAIF HSI device to list. */
-       list_add_tail(&cfhsi->list, &cfhsi_list);
-
-       return 0;
-err:
-       symbol_put(cfhsi_get_ops);
-       return -ENODEV;
-}
-
-static struct rtnl_link_ops caif_hsi_link_ops __read_mostly = {
-       .kind           = "cfhsi",
-       .priv_size      = sizeof(struct cfhsi),
-       .setup          = cfhsi_setup,
-       .maxtype        = __IFLA_CAIF_HSI_MAX,
-       .policy = caif_hsi_policy,
-       .newlink        = caif_hsi_newlink,
-       .changelink     = caif_hsi_changelink,
-       .get_size       = caif_hsi_get_size,
-       .fill_info      = caif_hsi_fill_info,
-};
-
-static void __exit cfhsi_exit_module(void)
-{
-       struct list_head *list_node;
-       struct list_head *n;
-       struct cfhsi *cfhsi;
-
-       rtnl_link_unregister(&caif_hsi_link_ops);
-
-       rtnl_lock();
-       list_for_each_safe(list_node, n, &cfhsi_list) {
-               cfhsi = list_entry(list_node, struct cfhsi, list);
-               unregister_netdevice(cfhsi->ndev);
-       }
-       rtnl_unlock();
-}
-
-static int __init cfhsi_init_module(void)
-{
-       return rtnl_link_register(&caif_hsi_link_ops);
-}
-
-module_init(cfhsi_init_module);
-module_exit(cfhsi_exit_module);
index a7e5ac6..1542bfb 100644 (file)
@@ -419,8 +419,10 @@ int ksz_switch_register(struct ksz_device *dev,
                                if (of_property_read_u32(port, "reg",
                                                         &port_num))
                                        continue;
-                               if (!(dev->port_mask & BIT(port_num)))
+                               if (!(dev->port_mask & BIT(port_num))) {
+                                       of_node_put(port);
                                        return -EINVAL;
+                               }
                                of_get_phy_mode(port,
                                                &dev->ports[port_num].interface);
                        }
index 961fa6b..beb4157 100644 (file)
@@ -3583,6 +3583,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
        .port_set_speed_duplex = mv88e6341_port_set_speed_duplex,
        .port_max_speed_mode = mv88e6341_port_max_speed_mode,
        .port_tag_remap = mv88e6095_port_tag_remap,
+       .port_set_policy = mv88e6352_port_set_policy,
        .port_set_frame_mode = mv88e6351_port_set_frame_mode,
        .port_set_ucast_flood = mv88e6352_port_set_ucast_flood,
        .port_set_mcast_flood = mv88e6352_port_set_mcast_flood,
@@ -3596,7 +3597,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
        .port_set_cmode = mv88e6341_port_set_cmode,
        .port_setup_message_port = mv88e6xxx_setup_message_port,
        .stats_snapshot = mv88e6390_g1_stats_snapshot,
-       .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
+       .stats_set_histogram = mv88e6390_g1_stats_set_histogram,
        .stats_get_sset_count = mv88e6320_stats_get_sset_count,
        .stats_get_strings = mv88e6320_stats_get_strings,
        .stats_get_stats = mv88e6390_stats_get_stats,
@@ -3606,6 +3607,9 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
        .mgmt_rsvd2cpu =  mv88e6390_g1_mgmt_rsvd2cpu,
        .pot_clear = mv88e6xxx_g2_pot_clear,
        .reset = mv88e6352_g1_reset,
+       .rmu_disable = mv88e6390_g1_rmu_disable,
+       .atu_get_hash = mv88e6165_g1_atu_get_hash,
+       .atu_set_hash = mv88e6165_g1_atu_set_hash,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
        .serdes_power = mv88e6390_serdes_power,
@@ -3619,6 +3623,11 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
        .serdes_irq_enable = mv88e6390_serdes_irq_enable,
        .serdes_irq_status = mv88e6390_serdes_irq_status,
        .gpio_ops = &mv88e6352_gpio_ops,
+       .serdes_get_sset_count = mv88e6390_serdes_get_sset_count,
+       .serdes_get_strings = mv88e6390_serdes_get_strings,
+       .serdes_get_stats = mv88e6390_serdes_get_stats,
+       .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
+       .serdes_get_regs = mv88e6390_serdes_get_regs,
        .phylink_validate = mv88e6341_phylink_validate,
 };
 
@@ -4383,6 +4392,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
        .port_set_speed_duplex = mv88e6341_port_set_speed_duplex,
        .port_max_speed_mode = mv88e6341_port_max_speed_mode,
        .port_tag_remap = mv88e6095_port_tag_remap,
+       .port_set_policy = mv88e6352_port_set_policy,
        .port_set_frame_mode = mv88e6351_port_set_frame_mode,
        .port_set_ucast_flood = mv88e6352_port_set_ucast_flood,
        .port_set_mcast_flood = mv88e6352_port_set_mcast_flood,
@@ -4396,7 +4406,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
        .port_set_cmode = mv88e6341_port_set_cmode,
        .port_setup_message_port = mv88e6xxx_setup_message_port,
        .stats_snapshot = mv88e6390_g1_stats_snapshot,
-       .stats_set_histogram = mv88e6095_g1_stats_set_histogram,
+       .stats_set_histogram = mv88e6390_g1_stats_set_histogram,
        .stats_get_sset_count = mv88e6320_stats_get_sset_count,
        .stats_get_strings = mv88e6320_stats_get_strings,
        .stats_get_stats = mv88e6390_stats_get_stats,
@@ -4406,6 +4416,9 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
        .mgmt_rsvd2cpu =  mv88e6390_g1_mgmt_rsvd2cpu,
        .pot_clear = mv88e6xxx_g2_pot_clear,
        .reset = mv88e6352_g1_reset,
+       .rmu_disable = mv88e6390_g1_rmu_disable,
+       .atu_get_hash = mv88e6165_g1_atu_get_hash,
+       .atu_set_hash = mv88e6165_g1_atu_set_hash,
        .vtu_getnext = mv88e6352_g1_vtu_getnext,
        .vtu_loadpurge = mv88e6352_g1_vtu_loadpurge,
        .serdes_power = mv88e6390_serdes_power,
@@ -4421,6 +4434,11 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
        .gpio_ops = &mv88e6352_gpio_ops,
        .avb_ops = &mv88e6390_avb_ops,
        .ptp_ops = &mv88e6352_ptp_ops,
+       .serdes_get_sset_count = mv88e6390_serdes_get_sset_count,
+       .serdes_get_strings = mv88e6390_serdes_get_strings,
+       .serdes_get_stats = mv88e6390_serdes_get_stats,
+       .serdes_get_regs_len = mv88e6390_serdes_get_regs_len,
+       .serdes_get_regs = mv88e6390_serdes_get_regs,
        .phylink_validate = mv88e6341_phylink_validate,
 };
 
index e4fbef8..b1d46dd 100644 (file)
@@ -722,7 +722,7 @@ static struct mv88e6390_serdes_hw_stat mv88e6390_serdes_hw_stats[] = {
 
 int mv88e6390_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port)
 {
-       if (mv88e6390_serdes_get_lane(chip, port) < 0)
+       if (mv88e6xxx_serdes_get_lane(chip, port) < 0)
                return 0;
 
        return ARRAY_SIZE(mv88e6390_serdes_hw_stats);
@@ -734,7 +734,7 @@ int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip,
        struct mv88e6390_serdes_hw_stat *stat;
        int i;
 
-       if (mv88e6390_serdes_get_lane(chip, port) < 0)
+       if (mv88e6xxx_serdes_get_lane(chip, port) < 0)
                return 0;
 
        for (i = 0; i < ARRAY_SIZE(mv88e6390_serdes_hw_stats); i++) {
@@ -770,7 +770,7 @@ int mv88e6390_serdes_get_stats(struct mv88e6xxx_chip *chip, int port,
        int lane;
        int i;
 
-       lane = mv88e6390_serdes_get_lane(chip, port);
+       lane = mv88e6xxx_serdes_get_lane(chip, port);
        if (lane < 0)
                return 0;
 
index 4f05456..ced8c9c 100644 (file)
@@ -122,14 +122,12 @@ static int sja1105_init_mac_settings(struct sja1105_private *priv)
 
        for (i = 0; i < ds->num_ports; i++) {
                mac[i] = default_mac;
-               if (i == dsa_upstream_port(priv->ds, i)) {
-                       /* STP doesn't get called for CPU port, so we need to
-                        * set the I/O parameters statically.
-                        */
-                       mac[i].dyn_learn = true;
-                       mac[i].ingress = true;
-                       mac[i].egress = true;
-               }
+
+               /* Let sja1105_bridge_stp_state_set() keep address learning
+                * enabled for the CPU port.
+                */
+               if (dsa_is_cpu_port(ds, i))
+                       priv->learn_ena |= BIT(i);
        }
 
        return 0;
index 7dff203..f19370c 100644 (file)
@@ -594,6 +594,11 @@ int atl1c_phy_init(struct atl1c_hw *hw)
        int ret_val;
        u16 mii_bmcr_data = BMCR_RESET;
 
+       if (hw->nic_type == athr_mt) {
+               hw->phy_configured = true;
+               return 0;
+       }
+
        if ((atl1c_read_phy_reg(hw, MII_PHYSID1, &hw->phy_id1) != 0) ||
                (atl1c_read_phy_reg(hw, MII_PHYSID2, &hw->phy_id2) != 0)) {
                dev_err(&pdev->dev, "Error get phy ID\n");
index 41f7f07..db74241 100644 (file)
@@ -1640,7 +1640,8 @@ static void bcmgenet_power_up(struct bcmgenet_priv *priv,
 
        switch (mode) {
        case GENET_POWER_PASSIVE:
-               reg &= ~(EXT_PWR_DOWN_DLL | EXT_PWR_DOWN_BIAS);
+               reg &= ~(EXT_PWR_DOWN_DLL | EXT_PWR_DOWN_BIAS |
+                        EXT_ENERGY_DET_MASK);
                if (GENET_IS_V5(priv)) {
                        reg &= ~(EXT_PWR_DOWN_PHY_EN |
                                 EXT_PWR_DOWN_PHY_RD |
@@ -3237,15 +3238,21 @@ static void bcmgenet_get_hw_addr(struct bcmgenet_priv *priv,
 /* Returns a reusable dma control register value */
 static u32 bcmgenet_dma_disable(struct bcmgenet_priv *priv)
 {
+       unsigned int i;
        u32 reg;
        u32 dma_ctrl;
 
        /* disable DMA */
        dma_ctrl = 1 << (DESC_INDEX + DMA_RING_BUF_EN_SHIFT) | DMA_EN;
+       for (i = 0; i < priv->hw_params->tx_queues; i++)
+               dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));
        reg = bcmgenet_tdma_readl(priv, DMA_CTRL);
        reg &= ~dma_ctrl;
        bcmgenet_tdma_writel(priv, reg, DMA_CTRL);
 
+       dma_ctrl = 1 << (DESC_INDEX + DMA_RING_BUF_EN_SHIFT) | DMA_EN;
+       for (i = 0; i < priv->hw_params->rx_queues; i++)
+               dma_ctrl |= (1 << (i + DMA_RING_BUF_EN_SHIFT));
        reg = bcmgenet_rdma_readl(priv, DMA_CTRL);
        reg &= ~dma_ctrl;
        bcmgenet_rdma_writel(priv, reg, DMA_CTRL);
@@ -3292,7 +3299,6 @@ static int bcmgenet_open(struct net_device *dev)
 {
        struct bcmgenet_priv *priv = netdev_priv(dev);
        unsigned long dma_ctrl;
-       u32 reg;
        int ret;
 
        netif_dbg(priv, ifup, dev, "bcmgenet_open\n");
@@ -3318,12 +3324,6 @@ static int bcmgenet_open(struct net_device *dev)
 
        bcmgenet_set_hw_addr(priv, dev->dev_addr);
 
-       if (priv->internal_phy) {
-               reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
-               reg |= EXT_ENERGY_DET_MASK;
-               bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-       }
-
        /* Disable RX/TX DMA and flush TX queues */
        dma_ctrl = bcmgenet_dma_disable(priv);
 
@@ -4139,7 +4139,6 @@ static int bcmgenet_resume(struct device *d)
        struct bcmgenet_priv *priv = netdev_priv(dev);
        struct bcmgenet_rxnfc_rule *rule;
        unsigned long dma_ctrl;
-       u32 reg;
        int ret;
 
        if (!netif_running(dev))
@@ -4176,12 +4175,6 @@ static int bcmgenet_resume(struct device *d)
                if (rule->state != BCMGENET_RXNFC_STATE_UNUSED)
                        bcmgenet_hfb_create_rxnfc_filter(priv, rule);
 
-       if (priv->internal_phy) {
-               reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
-               reg |= EXT_ENERGY_DET_MASK;
-               bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-       }
-
        /* Disable RX/TX DMA and flush TX queues */
        dma_ctrl = bcmgenet_dma_disable(priv);
 
index facde82..e31a5a3 100644 (file)
@@ -186,12 +186,6 @@ int bcmgenet_wol_power_down_cfg(struct bcmgenet_priv *priv,
        reg |= CMD_RX_EN;
        bcmgenet_umac_writel(priv, reg, UMAC_CMD);
 
-       if (priv->hw_params->flags & GENET_HAS_EXT) {
-               reg = bcmgenet_ext_readl(priv, EXT_EXT_PWR_MGMT);
-               reg &= ~EXT_ENERGY_DET_MASK;
-               bcmgenet_ext_writel(priv, reg, EXT_EXT_PWR_MGMT);
-       }
-
        reg = UMAC_IRQ_MPD_R;
        if (hfb_enable)
                reg |=  UMAC_IRQ_HFB_SM | UMAC_IRQ_HFB_MM;
index 9a2b166..dbf9a0e 100644 (file)
@@ -2643,6 +2643,9 @@ static void detach_ulds(struct adapter *adap)
 {
        unsigned int i;
 
+       if (!is_uld(adap))
+               return;
+
        mutex_lock(&uld_mutex);
        list_del(&adap->list_node);
 
@@ -7141,10 +7144,13 @@ static void remove_one(struct pci_dev *pdev)
                 */
                destroy_workqueue(adapter->workq);
 
-               if (is_uld(adapter)) {
-                       detach_ulds(adapter);
-                       t4_uld_clean_up(adapter);
-               }
+               detach_ulds(adapter);
+
+               for_each_port(adapter, i)
+                       if (adapter->port[i]->reg_state == NETREG_REGISTERED)
+                               unregister_netdev(adapter->port[i]);
+
+               t4_uld_clean_up(adapter);
 
                adap_free_hma_mem(adapter);
 
@@ -7152,10 +7158,6 @@ static void remove_one(struct pci_dev *pdev)
 
                cxgb4_free_mps_ref_entries(adapter);
 
-               for_each_port(adapter, i)
-                       if (adapter->port[i]->reg_state == NETREG_REGISTERED)
-                               unregister_netdev(adapter->port[i]);
-
                debugfs_remove_recursive(adapter->debugfs_root);
 
                if (!is_t4(adapter->params.chip))
index 743af9e..17faac7 100644 (file)
@@ -581,6 +581,9 @@ void t4_uld_clean_up(struct adapter *adap)
 {
        unsigned int i;
 
+       if (!is_uld(adap))
+               return;
+
        mutex_lock(&uld_mutex);
        for (i = 0; i < CXGB4_ULD_MAX; i++) {
                if (!adap->uld[i].handle)
index 867e87a..099a2bc 100644 (file)
@@ -1469,7 +1469,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        err = pci_enable_device(pdev);
        if (err)
-               return -ENXIO;
+               return err;
 
        err = pci_request_regions(pdev, "gvnic-cfg");
        if (err)
@@ -1477,19 +1477,12 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        pci_set_master(pdev);
 
-       err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+       err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
        if (err) {
                dev_err(&pdev->dev, "Failed to set dma mask: err=%d\n", err);
                goto abort_with_pci_region;
        }
 
-       err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-       if (err) {
-               dev_err(&pdev->dev,
-                       "Failed to set consistent dma mask: err=%d\n", err);
-               goto abort_with_pci_region;
-       }
-
        reg_bar = pci_iomap(pdev, GVE_REGISTER_BAR, 0);
        if (!reg_bar) {
                dev_err(&pdev->dev, "Failed to map pci bar!\n");
@@ -1512,6 +1505,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues);
        if (!dev) {
                dev_err(&pdev->dev, "could not allocate netdev\n");
+               err = -ENOMEM;
                goto abort_with_db_bar;
        }
        SET_NETDEV_DEV(dev, &pdev->dev);
@@ -1565,7 +1559,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
        err = register_netdev(dev);
        if (err)
-               goto abort_with_wq;
+               goto abort_with_gve_init;
 
        dev_info(&pdev->dev, "GVE version %s\n", gve_version_str);
        dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format);
@@ -1573,6 +1567,9 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
        queue_work(priv->gve_wq, &priv->service_task);
        return 0;
 
+abort_with_gve_init:
+       gve_teardown_priv_resources(priv);
+
 abort_with_wq:
        destroy_workqueue(priv->gve_wq);
 
@@ -1590,7 +1587,7 @@ abort_with_pci_region:
 
 abort_with_enabled:
        pci_disable_device(pdev);
-       return -ENXIO;
+       return err;
 }
 
 static void gve_remove(struct pci_dev *pdev)
index 77bb822..8500621 100644 (file)
@@ -566,13 +566,6 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx,
                return 0;
        }
 
-       /* Prefetch the payload header. */
-       prefetch((char *)buf_state->addr + buf_state->page_info.page_offset);
-#if L1_CACHE_BYTES < 128
-       prefetch((char *)buf_state->addr + buf_state->page_info.page_offset +
-                L1_CACHE_BYTES);
-#endif
-
        if (eop && buf_len <= priv->rx_copybreak) {
                rx->skb_head = gve_rx_copy(priv->dev, napi,
                                           &buf_state->page_info, buf_len, 0);
index 374a75d..ed77191 100644 (file)
@@ -2420,9 +2420,10 @@ out:
 
 static void __ibmvnic_reset(struct work_struct *work)
 {
-       struct ibmvnic_rwi *rwi;
        struct ibmvnic_adapter *adapter;
        bool saved_state = false;
+       struct ibmvnic_rwi *tmprwi;
+       struct ibmvnic_rwi *rwi;
        unsigned long flags;
        u32 reset_state;
        int rc = 0;
@@ -2489,7 +2490,7 @@ static void __ibmvnic_reset(struct work_struct *work)
                } else {
                        rc = do_reset(adapter, rwi, reset_state);
                }
-               kfree(rwi);
+               tmprwi = rwi;
                adapter->last_reset_time = jiffies;
 
                if (rc)
@@ -2497,8 +2498,23 @@ static void __ibmvnic_reset(struct work_struct *work)
 
                rwi = get_next_rwi(adapter);
 
+               /*
+                * If there is another reset queued, free the previous rwi
+                * and process the new reset even if previous reset failed
+                * (the previous reset could have failed because of a fail
+                * over for instance, so process the fail over).
+                *
+                * If there are no resets queued and the previous reset failed,
+                * the adapter would be in an undefined state. So retry the
+                * previous reset as a hard reset.
+                */
+               if (rwi)
+                       kfree(tmprwi);
+               else if (rc)
+                       rwi = tmprwi;
+
                if (rwi && (rwi->reset_reason == VNIC_RESET_FAILOVER ||
-                           rwi->reset_reason == VNIC_RESET_MOBILITY))
+                           rwi->reset_reason == VNIC_RESET_MOBILITY || rc))
                        adapter->force_reset_recovery = true;
        }
 
index d150dad..757a54c 100644 (file)
@@ -7664,6 +7664,7 @@ err_flashmap:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_etherdev:
+       pci_disable_pcie_error_reporting(pdev);
        pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
index dbcae92..adfa276 100644 (file)
@@ -2227,6 +2227,7 @@ err_sw_init:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_netdev:
+       pci_disable_pcie_error_reporting(pdev);
        pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
index e612c24..44bafed 100644 (file)
@@ -3798,6 +3798,7 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 err_ioremap:
        free_netdev(netdev);
 err_alloc_etherdev:
+       pci_disable_pcie_error_reporting(pdev);
        pci_release_regions(pdev);
 err_pci_reg:
 err_dma:
index 7e6435d..171a7a6 100644 (file)
@@ -931,6 +931,7 @@ static void igb_configure_msix(struct igb_adapter *adapter)
  **/
 static int igb_request_msix(struct igb_adapter *adapter)
 {
+       unsigned int num_q_vectors = adapter->num_q_vectors;
        struct net_device *netdev = adapter->netdev;
        int i, err = 0, vector = 0, free_vector = 0;
 
@@ -939,7 +940,13 @@ static int igb_request_msix(struct igb_adapter *adapter)
        if (err)
                goto err_out;
 
-       for (i = 0; i < adapter->num_q_vectors; i++) {
+       if (num_q_vectors > MAX_Q_VECTORS) {
+               num_q_vectors = MAX_Q_VECTORS;
+               dev_warn(&adapter->pdev->dev,
+                        "The number of queue vectors (%d) is higher than max allowed (%d)\n",
+                        adapter->num_q_vectors, MAX_Q_VECTORS);
+       }
+       for (i = 0; i < num_q_vectors; i++) {
                struct igb_q_vector *q_vector = adapter->q_vector[i];
 
                vector++;
@@ -1678,14 +1685,15 @@ static bool is_any_txtime_enabled(struct igb_adapter *adapter)
  **/
 static void igb_config_tx_modes(struct igb_adapter *adapter, int queue)
 {
-       struct igb_ring *ring = adapter->tx_ring[queue];
        struct net_device *netdev = adapter->netdev;
        struct e1000_hw *hw = &adapter->hw;
+       struct igb_ring *ring;
        u32 tqavcc, tqavctrl;
        u16 value;
 
        WARN_ON(hw->mac.type != e1000_i210);
        WARN_ON(queue < 0 || queue > 1);
+       ring = adapter->tx_ring[queue];
 
        /* If any of the Qav features is enabled, configure queues as SR and
         * with HIGH PRIO. If none is, then configure them with LOW PRIO and
@@ -3615,6 +3623,7 @@ err_sw_init:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_etherdev:
+       pci_disable_pcie_error_reporting(pdev);
        pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
@@ -4835,6 +4844,8 @@ static void igb_clean_tx_ring(struct igb_ring *tx_ring)
                                               DMA_TO_DEVICE);
                }
 
+               tx_buffer->next_to_watch = NULL;
+
                /* move us one more past the eop_desc for start of next pkt */
                tx_buffer++;
                i++;
index 9e0bbb2..5901ed9 100644 (file)
@@ -578,7 +578,7 @@ static inline s32 igc_read_phy_reg(struct igc_hw *hw, u32 offset, u16 *data)
        if (hw->phy.ops.read_reg)
                return hw->phy.ops.read_reg(hw, offset, data);
 
-       return 0;
+       return -EOPNOTSUPP;
 }
 
 void igc_reinit_locked(struct igc_adapter *);
index 9532309..e29aadb 100644 (file)
@@ -232,6 +232,8 @@ static void igc_clean_tx_ring(struct igc_ring *tx_ring)
                                igc_unmap_tx_buffer(tx_ring->dev, tx_buffer);
                }
 
+               tx_buffer->next_to_watch = NULL;
+
                /* move us one more past the eop_desc for start of next pkt */
                tx_buffer++;
                i++;
@@ -6054,6 +6056,7 @@ err_sw_init:
 err_ioremap:
        free_netdev(netdev);
 err_alloc_etherdev:
+       pci_disable_pcie_error_reporting(pdev);
        pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
index ffff69e..913253f 100644 (file)
@@ -11067,6 +11067,7 @@ err_ioremap:
        disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
        free_netdev(netdev);
 err_alloc_etherdev:
+       pci_disable_pcie_error_reporting(pdev);
        pci_release_mem_regions(pdev);
 err_pci_reg:
 err_dma:
index caaea2c..e3e4676 100644 (file)
@@ -211,7 +211,7 @@ struct xfrm_state *ixgbevf_ipsec_find_rx_state(struct ixgbevf_ipsec *ipsec,
 static int ixgbevf_ipsec_parse_proto_keys(struct xfrm_state *xs,
                                          u32 *mykey, u32 *mysalt)
 {
-       struct net_device *dev = xs->xso.dev;
+       struct net_device *dev = xs->xso.real_dev;
        unsigned char *key_data;
        char *alg_name = NULL;
        int key_len;
@@ -260,12 +260,15 @@ static int ixgbevf_ipsec_parse_proto_keys(struct xfrm_state *xs,
  **/
 static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs)
 {
-       struct net_device *dev = xs->xso.dev;
-       struct ixgbevf_adapter *adapter = netdev_priv(dev);
-       struct ixgbevf_ipsec *ipsec = adapter->ipsec;
+       struct net_device *dev = xs->xso.real_dev;
+       struct ixgbevf_adapter *adapter;
+       struct ixgbevf_ipsec *ipsec;
        u16 sa_idx;
        int ret;
 
+       adapter = netdev_priv(dev);
+       ipsec = adapter->ipsec;
+
        if (xs->id.proto != IPPROTO_ESP && xs->id.proto != IPPROTO_AH) {
                netdev_err(dev, "Unsupported protocol 0x%04x for IPsec offload\n",
                           xs->id.proto);
@@ -383,11 +386,14 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs)
  **/
 static void ixgbevf_ipsec_del_sa(struct xfrm_state *xs)
 {
-       struct net_device *dev = xs->xso.dev;
-       struct ixgbevf_adapter *adapter = netdev_priv(dev);
-       struct ixgbevf_ipsec *ipsec = adapter->ipsec;
+       struct net_device *dev = xs->xso.real_dev;
+       struct ixgbevf_adapter *adapter;
+       struct ixgbevf_ipsec *ipsec;
        u16 sa_idx;
 
+       adapter = netdev_priv(dev);
+       ipsec = adapter->ipsec;
+
        if (xs->xso.flags & XFRM_OFFLOAD_INBOUND) {
                sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_RX_INDEX;
 
index 361bc4f..76a7777 100644 (file)
@@ -2299,19 +2299,19 @@ mvneta_swbm_add_rx_fragment(struct mvneta_port *pp,
                skb_frag_off_set(frag, pp->rx_offset_correction);
                skb_frag_size_set(frag, data_len);
                __skb_frag_set_page(frag, page);
-
-               /* last fragment */
-               if (len == *size) {
-                       struct skb_shared_info *sinfo;
-
-                       sinfo = xdp_get_shared_info_from_buff(xdp);
-                       sinfo->nr_frags = xdp_sinfo->nr_frags;
-                       memcpy(sinfo->frags, xdp_sinfo->frags,
-                              sinfo->nr_frags * sizeof(skb_frag_t));
-               }
        } else {
                page_pool_put_full_page(rxq->page_pool, page, true);
        }
+
+       /* last fragment */
+       if (len == *size) {
+               struct skb_shared_info *sinfo;
+
+               sinfo = xdp_get_shared_info_from_buff(xdp);
+               sinfo->nr_frags = xdp_sinfo->nr_frags;
+               memcpy(sinfo->frags, xdp_sinfo->frags,
+                      sinfo->nr_frags * sizeof(skb_frag_t));
+       }
        *size -= len;
 }
 
index fac6474..9169849 100644 (file)
@@ -86,6 +86,22 @@ bool is_lmac_valid(struct cgx *cgx, int lmac_id)
        return test_bit(lmac_id, &cgx->lmac_bmap);
 }
 
+/* Return the sequential (0-based) position of lmac_id among the LMACs set
+ * in cgx->lmac_bmap, or the number of set LMACs if lmac_id is not set.
+ */
+static int get_sequence_id_of_lmac(struct cgx *cgx, int lmac_id)
+{
+       int bit, seq = 0;
+
+       for_each_set_bit(bit, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
+               if (bit == lmac_id)
+                       return seq;
+               seq++;
+       }
+
+       return seq;
+}
+
 struct mac_ops *get_mac_ops(void *cgxd)
 {
        if (!cgxd)
@@ -211,37 +227,257 @@ static u64 mac2u64 (u8 *mac_addr)
        return mac;
 }
 
+/* Unpack the low ETH_ALEN bytes of cfg into mac_addr[]; mac_addr[0]
+ * receives the most significant of those bytes.
+ */
+static void cfg2mac(u64 cfg, u8 *mac_addr)
+{
+       int pos;
+
+       for (pos = 0; pos < ETH_ALEN; pos++)
+               mac_addr[ETH_ALEN - 1 - pos] = (cfg >> (8 * pos)) & 0xFF;
+}
+
 int cgx_lmac_addr_set(u8 cgx_id, u8 lmac_id, u8 *mac_addr)
 {
        struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
        struct mac_ops *mac_ops;
+       int index, id;
        u64 cfg;
 
+       /* access mac_ops to know csr_offset */
        mac_ops = cgx_dev->mac_ops;
+
        /* copy 6bytes from macaddr */
        /* memcpy(&cfg, mac_addr, 6); */
 
        cfg = mac2u64 (mac_addr);
 
-       cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (lmac_id * 0x8)),
+       id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+       index = id * lmac->mac_to_index_bmap.max;
+
+       cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)),
                  cfg | CGX_DMAC_CAM_ADDR_ENABLE | ((u64)lmac_id << 49));
 
        cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
-       cfg |= CGX_DMAC_CTL0_CAM_ENABLE;
+       cfg |= (CGX_DMAC_CTL0_CAM_ENABLE | CGX_DMAC_BCAST_MODE |
+               CGX_DMAC_MCAST_MODE);
        cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
 
        return 0;
 }
 
+/* Read the CGXX_CMRX_RX_DMAC_CTL0 register of the given LMAC.
+ *
+ * Returns 0 when cgxd is NULL or lmac_id is not set in the CGX's LMAC
+ * bitmap, otherwise the raw register value.
+ */
+u64 cgx_read_dmac_ctrl(void *cgxd, int lmac_id)
+{
+       struct mac_ops *mac_ops;
+       struct cgx *cgx = cgxd;
+
+       if (!cgx || !is_lmac_valid(cgx, lmac_id))
+               return 0;
+
+       /* mac_ops is referenced by the CGXX_CMRX_RX_DMAC_CTL0 macro (csr offset) */
+       mac_ops = cgx->mac_ops;
+
+       return cgx_read(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+}
+
+/* Read the DMAC CAM entry at CGXX_CMRX_RX_DMAC_CAM0 + index * 8.
+ * Returns 0 when cgxd is NULL; index is not range-checked here.
+ */
+u64 cgx_read_dmac_entry(void *cgxd, int index)
+{
+       struct cgx *cgx = cgxd;
+       struct mac_ops *mac_ops;
+
+       if (!cgx)
+               return 0;
+
+       /* mac_ops is referenced by the CGXX_CMRX_RX_DMAC_CAM0 macro */
+       mac_ops = cgx->mac_ops;
+
+       return cgx_read(cgx, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 8)));
+}
+
+/* Install mac_addr as an additional DMAC filter entry for an LMAC.
+ *
+ * A free slot is taken from lmac->mac_to_index_bmap, the address is
+ * written to the matching CAM entry and CGXX_CMRX_RX_DMAC_CTL0 is updated.
+ *
+ * Returns the per-LMAC slot index on success, -ENODEV if the LMAC lookup
+ * fails, or the negative value from rvu_alloc_rsrc() if no slot is free.
+ */
+int cgx_lmac_addr_add(u8 cgx_id, u8 lmac_id, u8 *mac_addr)
+{
+       struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+       struct mac_ops *mac_ops;
+       int index, idx;
+       u64 cfg = 0;
+       int id;
+
+       if (!lmac)
+               return -ENODEV;
+
+       /* mac_ops is referenced by the CGXX_CMRX_* register macros */
+       mac_ops = cgx_dev->mac_ops;
+       /* Get available index where entry is to be installed */
+       idx = rvu_alloc_rsrc(&lmac->mac_to_index_bmap);
+       if (idx < 0)
+               return idx;
+
+       id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+       /* Absolute CAM index: this LMAC's block of entries plus the slot */
+       index = id * lmac->mac_to_index_bmap.max + idx;
+
+       /* CAM entry: address, enable bit, and lmac_id starting at bit 49 */
+       cfg = mac2u64 (mac_addr);
+       cfg |= CGX_DMAC_CAM_ADDR_ENABLE;
+       cfg |= ((u64)lmac_id << 49);
+       cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), cfg);
+
+       cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+       cfg |= (CGX_DMAC_BCAST_MODE | CGX_DMAC_CAM_ACCEPT);
+
+       if (is_multicast_ether_addr(mac_addr)) {
+               /* Clear both multicast mode bits (2:1), then select
+                * CGX_DMAC_MCAST_MODE_CAM and count this multicast filter.
+                */
+               cfg &= ~GENMASK_ULL(2, 1);
+               cfg |= CGX_DMAC_MCAST_MODE_CAM;
+               lmac->mcast_filters_count++;
+       } else if (!lmac->mcast_filters_count) {
+               /* No multicast filters installed: set CGX_DMAC_MCAST_MODE */
+               cfg |= CGX_DMAC_MCAST_MODE;
+       }
+
+       cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
+
+       return idx;
+}
+
+/* Reset an LMAC's default DMAC entry and its DMAC control register.
+ *
+ * Marks slot 0 as in use in lmac->mac_to_index_bmap, writes 0 to the
+ * LMAC's first CAM entry, and in CGXX_CMRX_RX_DMAC_CTL0 clears
+ * CGX_DMAC_CAM_ACCEPT while setting the broadcast and multicast mode bits.
+ * Other slots of the LMAC are not touched by this function.
+ *
+ * Returns 0 on success, -ENODEV if the LMAC lookup fails.
+ */
+int cgx_lmac_addr_reset(u8 cgx_id, u8 lmac_id)
+{
+       struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+       struct mac_ops *mac_ops;
+       u8 index = 0, id;
+       u64 cfg;
+
+       if (!lmac)
+               return -ENODEV;
+
+       /* mac_ops is referenced by the CGXX_CMRX_* register macros */
+       mac_ops = cgx_dev->mac_ops;
+       /* Restore index 0 to its default init value as done during
+        * cgx_lmac_init
+        */
+       set_bit(0, lmac->mac_to_index_bmap.bmap);
+
+       id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+       /* index is 0 here, so this addresses the LMAC's first CAM entry */
+       index = id * lmac->mac_to_index_bmap.max + index;
+       cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), 0);
+
+       /* Reset CGXX_CMRX_RX_DMAC_CTL0 register to default state */
+       cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+       cfg &= ~CGX_DMAC_CAM_ACCEPT;
+       cfg |= (CGX_DMAC_BCAST_MODE | CGX_DMAC_MCAST_MODE);
+       cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
+
+       return 0;
+}
+
+/* Allows caller to change macaddress associated with index
+ * in dmac filter table including index 0 reserved for
+ * interface mac address
+ *
+ * Only the address bits (CGX_RX_DMAC_ADR_MASK) of the CAM entry are
+ * replaced; the remaining bits of the entry are written back as read.
+ *
+ * Returns 0 on success, -ENODEV if the LMAC lookup fails, or -EINVAL if
+ * index is out of range or not currently set in mac_to_index_bmap.
+ */
+int cgx_lmac_addr_update(u8 cgx_id, u8 lmac_id, u8 *mac_addr, u8 index)
+{
+       struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct mac_ops *mac_ops;
+       struct lmac *lmac;
+       u64 cfg;
+       int id;
+
+       lmac = lmac_pdata(lmac_id, cgx_dev);
+       if (!lmac)
+               return -ENODEV;
+
+       /* mac_ops is referenced by the CGXX_CMRX_* register macros */
+       mac_ops = cgx_dev->mac_ops;
+       /* Validate the index */
+       if (index >= lmac->mac_to_index_bmap.max)
+               return -EINVAL;
+
+       /* ensure index is already set */
+       if (!test_bit(index, lmac->mac_to_index_bmap.bmap))
+               return -EINVAL;
+
+       id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+       /* Convert the per-LMAC slot into an absolute CAM index */
+       index = id * lmac->mac_to_index_bmap.max + index;
+
+       /* Read-modify-write: swap in the new address, keep the other bits */
+       cfg = cgx_read(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)));
+       cfg &= ~CGX_RX_DMAC_ADR_MASK;
+       cfg |= mac2u64 (mac_addr);
+
+       cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), cfg);
+       return 0;
+}
+
+/* Remove the DMAC filter entry in slot index of an LMAC.
+ *
+ * Slot 0 is reserved and is left untouched (the call still returns 0).
+ * For any other slot the bitmap entry is released, the multicast filter
+ * count is adjusted if the stored address was multicast, and the CAM
+ * entry is cleared.
+ *
+ * Returns 0 on success, -ENODEV if the LMAC lookup fails, or -EINVAL if
+ * index is out of range.
+ * NOTE(review): unlike cgx_lmac_addr_update(), there is no check that the
+ * slot is currently allocated before it is freed - confirm callers only
+ * pass allocated indices.
+ */
+int cgx_lmac_addr_del(u8 cgx_id, u8 lmac_id, u8 index)
+{
+       struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+       struct mac_ops *mac_ops;
+       u8 mac[ETH_ALEN];
+       u64 cfg;
+       int id;
+
+       if (!lmac)
+               return -ENODEV;
+
+       /* mac_ops is referenced by the CGXX_CMRX_* register macros */
+       mac_ops = cgx_dev->mac_ops;
+       /* Validate the index */
+       if (index >= lmac->mac_to_index_bmap.max)
+               return -EINVAL;
+
+       /* Skip deletion for reserved index i.e. index 0 */
+       if (index == 0)
+               return 0;
+
+       /* Release the per-LMAC slot before index is turned into a CAM index */
+       rvu_free_rsrc(&lmac->mac_to_index_bmap, index);
+
+       id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+       index = id * lmac->mac_to_index_bmap.max + index;
+
+       /* Read MAC address to check whether it is ucast or mcast */
+       cfg = cgx_read(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)));
+
+       cfg2mac(cfg, mac);
+       if (is_multicast_ether_addr(mac))
+               lmac->mcast_filters_count--;
+
+       /* With no multicast filters left, clear the multicast mode bits (2:1)
+        * and set CGX_DMAC_MCAST_MODE again.
+        */
+       if (!lmac->mcast_filters_count) {
+               cfg = cgx_read(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
+               cfg &= ~GENMASK_ULL(2, 1);
+               cfg |= CGX_DMAC_MCAST_MODE;
+               cgx_write(cgx_dev, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
+       }
+
+       /* Finally clear the CAM entry itself */
+       cgx_write(cgx_dev, 0, (CGXX_CMRX_RX_DMAC_CAM0 + (index * 0x8)), 0);
+
+       return 0;
+}
+
+/* Return the size of the LMAC's DMAC slot bitmap (mac_to_index_bmap.max),
+ * or 0 when the LMAC lookup fails.
+ */
+int cgx_lmac_addr_max_entries_get(u8 cgx_id, u8 lmac_id)
+{
+       struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
+
+       if (!lmac)
+               return 0;
+
+       return lmac->mac_to_index_bmap.max;
+}
+
 u64 cgx_lmac_addr_get(u8 cgx_id, u8 lmac_id)
 {
        struct cgx *cgx_dev = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx_dev);
        struct mac_ops *mac_ops;
+       int index;
        u64 cfg;
+       int id;
 
        mac_ops = cgx_dev->mac_ops;
 
-       cfg = cgx_read(cgx_dev, 0, CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8);
+       id = get_sequence_id_of_lmac(cgx_dev, lmac_id);
+
+       index = id * lmac->mac_to_index_bmap.max;
+
+       cfg = cgx_read(cgx_dev, 0, CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8);
        return cfg & CGX_RX_DMAC_ADR_MASK;
 }
 
@@ -297,35 +533,51 @@ int cgx_lmac_internal_loopback(void *cgxd, int lmac_id, bool enable)
 void cgx_lmac_promisc_config(int cgx_id, int lmac_id, bool enable)
 {
        struct cgx *cgx = cgx_get_pdata(cgx_id);
+       struct lmac *lmac = lmac_pdata(lmac_id, cgx);
+       u16 max_dmac = lmac->mac_to_index_bmap.max;
        struct mac_ops *mac_ops;
+       int index, i;
        u64 cfg = 0;
+       int id;
 
        if (!cgx)
                return;
 
+       id = get_sequence_id_of_lmac(cgx, lmac_id);
+
        mac_ops = cgx->mac_ops;
        if (enable) {
                /* Enable promiscuous mode on LMAC */
                cfg = cgx_read(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
-               cfg &= ~(CGX_DMAC_CAM_ACCEPT | CGX_DMAC_MCAST_MODE);
-               cfg |= CGX_DMAC_BCAST_MODE;
+               cfg &= ~CGX_DMAC_CAM_ACCEPT;
+               cfg |= (CGX_DMAC_BCAST_MODE | CGX_DMAC_MCAST_MODE);
                cgx_write(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
 
-               cfg = cgx_read(cgx, 0,
-                              (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8));
-               cfg &= ~CGX_DMAC_CAM_ADDR_ENABLE;
-               cgx_write(cgx, 0,
-                         (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8), cfg);
+               for (i = 0; i < max_dmac; i++) {
+                       index = id * max_dmac + i;
+                       cfg = cgx_read(cgx, 0,
+                                      (CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8));
+                       cfg &= ~CGX_DMAC_CAM_ADDR_ENABLE;
+                       cgx_write(cgx, 0,
+                                 (CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8), cfg);
+               }
        } else {
                /* Disable promiscuous mode */
                cfg = cgx_read(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0);
                cfg |= CGX_DMAC_CAM_ACCEPT | CGX_DMAC_MCAST_MODE;
                cgx_write(cgx, lmac_id, CGXX_CMRX_RX_DMAC_CTL0, cfg);
-               cfg = cgx_read(cgx, 0,
-                              (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8));
-               cfg |= CGX_DMAC_CAM_ADDR_ENABLE;
-               cgx_write(cgx, 0,
-                         (CGXX_CMRX_RX_DMAC_CAM0 + lmac_id * 0x8), cfg);
+               for (i = 0; i < max_dmac; i++) {
+                       index = id * max_dmac + i;
+                       cfg = cgx_read(cgx, 0,
+                                      (CGXX_CMRX_RX_DMAC_CAM0 + index * 0x8));
+                       if ((cfg & CGX_RX_DMAC_ADR_MASK) != 0) {
+                               cfg |= CGX_DMAC_CAM_ADDR_ENABLE;
+                               cgx_write(cgx, 0,
+                                         (CGXX_CMRX_RX_DMAC_CAM0 +
+                                          index * 0x8),
+                                         cfg);
+                       }
+               }
        }
 }
 
@@ -1234,6 +1486,15 @@ static int cgx_lmac_init(struct cgx *cgx)
                }
 
                lmac->cgx = cgx;
+               lmac->mac_to_index_bmap.max =
+                               MAX_DMAC_ENTRIES_PER_CGX / cgx->lmac_count;
+               err = rvu_alloc_bitmap(&lmac->mac_to_index_bmap);
+               if (err)
+                       return err;
+
+               /* Reserve first entry for default MAC address */
+               set_bit(0, lmac->mac_to_index_bmap.bmap);
+
                init_waitqueue_head(&lmac->wq_cmd_cmplt);
                mutex_init(&lmac->cmd_lock);
                spin_lock_init(&lmac->event_cb_lock);
@@ -1274,6 +1535,7 @@ static int cgx_lmac_exit(struct cgx *cgx)
                        continue;
                cgx->mac_ops->mac_pause_frm_config(cgx, lmac->lmac_id, false);
                cgx_configure_interrupt(cgx, lmac, lmac->lmac_id, true);
+               kfree(lmac->mac_to_index_bmap.bmap);
                kfree(lmac->name);
                kfree(lmac);
        }
index 1252126..237ba2b 100644 (file)
@@ -23,6 +23,7 @@
 
 #define CGX_ID_MASK                    0x7
 #define MAX_LMAC_PER_CGX               4
+#define MAX_DMAC_ENTRIES_PER_CGX       32
 #define CGX_FIFO_LEN                   65536 /* 64K for both Rx & Tx */
 #define CGX_OFFSET(x)                  ((x) * MAX_LMAC_PER_CGX)
 
 #define CGXX_CMRX_RX_DMAC_CTL0         (0x1F8 + mac_ops->csr_offset)
 #define CGX_DMAC_CTL0_CAM_ENABLE       BIT_ULL(3)
 #define CGX_DMAC_CAM_ACCEPT            BIT_ULL(3)
+#define CGX_DMAC_MCAST_MODE_CAM                BIT_ULL(2)
 #define CGX_DMAC_MCAST_MODE            BIT_ULL(1)
 #define CGX_DMAC_BCAST_MODE            BIT_ULL(0)
 #define CGXX_CMRX_RX_DMAC_CAM0         (0x200 + mac_ops->csr_offset)
 #define CGX_DMAC_CAM_ADDR_ENABLE       BIT_ULL(48)
+#define CGX_DMAC_CAM_ENTRY_LMACID      GENMASK_ULL(50, 49)
 #define CGXX_CMRX_RX_DMAC_CAM1         0x400
 #define CGX_RX_DMAC_ADR_MASK           GENMASK_ULL(47, 0)
 #define CGXX_CMRX_TX_STAT0             0x700
@@ -139,7 +142,11 @@ int cgx_get_rx_stats(void *cgxd, int lmac_id, int idx, u64 *rx_stat);
 int cgx_lmac_rx_tx_enable(void *cgxd, int lmac_id, bool enable);
 int cgx_lmac_tx_enable(void *cgxd, int lmac_id, bool enable);
 int cgx_lmac_addr_set(u8 cgx_id, u8 lmac_id, u8 *mac_addr);
+int cgx_lmac_addr_reset(u8 cgx_id, u8 lmac_id);
 u64 cgx_lmac_addr_get(u8 cgx_id, u8 lmac_id);
+int cgx_lmac_addr_add(u8 cgx_id, u8 lmac_id, u8 *mac_addr);
+int cgx_lmac_addr_del(u8 cgx_id, u8 lmac_id, u8 index);
+int cgx_lmac_addr_max_entries_get(u8 cgx_id, u8 lmac_id);
 void cgx_lmac_promisc_config(int cgx_id, int lmac_id, bool enable);
 void cgx_lmac_enadis_rx_pause_fwding(void *cgxd, int lmac_id, bool enable);
 int cgx_lmac_internal_loopback(void *cgxd, int lmac_id, bool enable);
@@ -165,4 +172,7 @@ u8 cgx_get_lmacid(void *cgxd, u8 lmac_index);
 unsigned long cgx_get_lmac_bmap(void *cgxd);
 void cgx_lmac_write(int cgx_id, int lmac_id, u64 offset, u64 val);
 u64 cgx_lmac_read(int cgx_id, int lmac_id, u64 offset);
+int cgx_lmac_addr_update(u8 cgx_id, u8 lmac_id, u8 *mac_addr, u8 index);
+u64 cgx_read_dmac_ctrl(void *cgxd, int lmac_id);
+u64 cgx_read_dmac_entry(void *cgxd, int index);
 #endif /* CGX_H */
index 45706fd..a8b7b1c 100644 (file)
 #include "rvu.h"
 #include "cgx.h"
 /**
- * struct lmac
+ * struct lmac - per lmac locks and properties
  * @wq_cmd_cmplt:      waitq to keep the process blocked until cmd completion
  * @cmd_lock:          Lock to serialize the command interface
  * @resp:              command response
  * @link_info:         link related information
+ * @mac_to_index_bmap: Mac address to CGX table index mapping
  * @event_cb:          callback for linkchange events
  * @event_cb_lock:     lock for serializing callback with unregister
- * @cmd_pend:          flag set before new command is started
- *                     flag cleared after command response is received
  * @cgx:               parent cgx port
+ * @mcast_filters_count:  Number of multicast filters installed
  * @lmac_id:           lmac port id
+ * @cmd_pend:          flag set before new command is started
+ *                     flag cleared after command response is received
  * @name:              lmac port name
  */
 struct lmac {
@@ -29,12 +31,14 @@ struct lmac {
        struct mutex cmd_lock;
        u64 resp;
        struct cgx_link_user_info link_info;
+       struct rsrc_bmap mac_to_index_bmap;
        struct cgx_event_cb event_cb;
        /* lock for serializing callback with unregister */
        spinlock_t event_cb_lock;
-       bool cmd_pend;
        struct cgx *cgx;
+       u8 mcast_filters_count;
        u8 lmac_id;
+       bool cmd_pend;
        char *name;
 };
 
index 770d862..f5ec39d 100644 (file)
@@ -134,6 +134,8 @@ M(MSIX_OFFSET,              0x005, msix_offset, msg_req, msix_offset_rsp)   \
 M(VF_FLR,              0x006, vf_flr, msg_req, msg_rsp)                \
 M(PTP_OP,              0x007, ptp_op, ptp_req, ptp_rsp)                \
 M(GET_HW_CAP,          0x008, get_hw_cap, msg_req, get_hw_cap_rsp)     \
+M(LMTST_TBL_SETUP,     0x00a, lmtst_tbl_setup, lmtst_tbl_setup_req,    \
+                               msg_rsp)                                \
 M(SET_VF_PERM,         0x00b, set_vf_perm, set_vf_perm, msg_rsp)       \
 /* CGX mbox IDs (range 0x200 - 0x3FF) */                               \
 M(CGX_START_RXTX,      0x200, cgx_start_rxtx, msg_req, msg_rsp)        \
@@ -163,7 +165,15 @@ M(CGX_SET_LINK_MODE,       0x214, cgx_set_link_mode, cgx_set_link_mode_req,\
 M(CGX_FEATURES_GET,    0x215, cgx_features_get, msg_req,               \
                               cgx_features_info_msg)                   \
 M(RPM_STATS,           0x216, rpm_stats, msg_req, rpm_stats_rsp)       \
- /* NPA mbox IDs (range 0x400 - 0x5FF) */                              \
+M(CGX_MAC_ADDR_ADD,    0x217, cgx_mac_addr_add, cgx_mac_addr_add_req,    \
+                              cgx_mac_addr_add_rsp)            \
+M(CGX_MAC_ADDR_DEL,    0x218, cgx_mac_addr_del, cgx_mac_addr_del_req,    \
+                              msg_rsp)         \
+M(CGX_MAC_MAX_ENTRIES_GET, 0x219, cgx_mac_max_entries_get, msg_req,    \
+                                 cgx_max_dmac_entries_get_rsp)         \
+M(CGX_MAC_ADDR_RESET,  0x21A, cgx_mac_addr_reset, msg_req, msg_rsp)    \
+M(CGX_MAC_ADDR_UPDATE, 0x21B, cgx_mac_addr_update, cgx_mac_addr_update_req, \
+                              msg_rsp)                                 \
 /* NPA mbox IDs (range 0x400 - 0x5FF) */                               \
 M(NPA_LF_ALLOC,                0x400, npa_lf_alloc,                            \
                                npa_lf_alloc_req, npa_lf_alloc_rsp)     \
@@ -401,6 +411,38 @@ struct cgx_mac_addr_set_or_get {
        u8 mac_addr[ETH_ALEN];
 };
 
+/* Request message of the CGX_MAC_ADDR_ADD mailbox operation: asks for a
+ * DMAC filter entry to be added to the CGX interface.
+ */
+struct cgx_mac_addr_add_req {
+       struct mbox_msghdr hdr;
+       u8 mac_addr[ETH_ALEN];  /* MAC address to install as a filter */
+};
+
+/* Response message of the CGX_MAC_ADDR_ADD mailbox operation (adding a
+ * DMAC filter entry to the CGX interface).
+ */
+struct cgx_mac_addr_add_rsp {
+       struct mbox_msghdr hdr;
+       /* presumably the slot index of the installed entry, as returned by
+        * cgx_lmac_addr_add() - TODO confirm against the mbox handler
+        */
+       u8 index;
+};
+
+/* Request message of the CGX_MAC_ADDR_DEL mailbox operation: asks for a
+ * DMAC filter entry to be deleted from the CGX interface.
+ */
+struct cgx_mac_addr_del_req {
+       struct mbox_msghdr hdr;
+       /* index of the entry to delete; presumably the value reported in
+        * struct cgx_mac_addr_add_rsp - TODO confirm against the handler
+        */
+       u8 index;
+};
+
+/* Response message of the CGX_MAC_MAX_ENTRIES_GET mailbox operation:
+ * reports the maximum number of supported DMAC filter entries.
+ */
+struct cgx_max_dmac_entries_get_rsp {
+       struct mbox_msghdr hdr;
+       u8 max_dmac_filters;    /* maximum supported DMAC filter entries */
+};
+
 struct cgx_link_user_info {
        uint64_t link_up:1;
        uint64_t full_duplex:1;
@@ -499,6 +541,12 @@ struct cgx_set_link_mode_rsp {
        int status;
 };
 
+struct cgx_mac_addr_update_req {
+       struct mbox_msghdr hdr;
+       u8 mac_addr[ETH_ALEN];
+       u8 index;
+};
+
 #define RVU_LMAC_FEAT_FC               BIT_ULL(0) /* pause frames */
 #define RVU_LMAC_FEAT_PTP              BIT_ULL(1) /* precision time protocol */
 #define RVU_MAC_VERSION                        BIT_ULL(2)
@@ -1278,6 +1326,14 @@ struct set_vf_perm  {
        u64     flags;
 };
 
+struct lmtst_tbl_setup_req {
+       struct mbox_msghdr hdr;
+       u16 base_pcifunc;
+       u8  use_local_lmt_region;
+       u64 lmt_iova;
+       u64 rsvd[4];
+};
+
 /* CPT mailbox error codes
  * Range 901 - 1000.
  */
index 0b09294..10cddf1 100644 (file)
@@ -2333,6 +2333,7 @@ static void __rvu_flr_handler(struct rvu *rvu, u16 pcifunc)
        rvu_blklf_teardown(rvu, pcifunc, BLKADDR_SSOW);
        rvu_blklf_teardown(rvu, pcifunc, BLKADDR_SSO);
        rvu_blklf_teardown(rvu, pcifunc, BLKADDR_NPA);
+       rvu_reset_lmt_map_tbl(rvu, pcifunc);
        rvu_detach_rsrcs(rvu, NULL, pcifunc);
        mutex_unlock(&rvu->flr_lock);
 }
index 9e5d9ba..10e58a5 100644 (file)
@@ -243,6 +243,7 @@ struct rvu_pfvf {
        u8      nix_blkaddr; /* BLKADDR_NIX0/1 assigned to this PF */
        u8      nix_rx_intf; /* NIX0_RX/NIX1_RX interface to NPC */
        u8      nix_tx_intf; /* NIX0_TX/NIX1_TX interface to NPC */
+       u64     lmt_base_addr; /* Preserving the pcifunc's lmtst base addr */
        unsigned long flags;
 };
 
@@ -656,6 +657,8 @@ void rvu_cgx_enadis_rx_bp(struct rvu *rvu, int pf, bool enable);
 int rvu_cgx_start_stop_io(struct rvu *rvu, u16 pcifunc, bool start);
 int rvu_cgx_nix_cuml_stats(struct rvu *rvu, void *cgxd, int lmac_id, int index,
                           int rxtxflag, u64 *stat);
+void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc);
+
 /* NPA APIs */
 int rvu_npa_init(struct rvu *rvu);
 void rvu_npa_freemem(struct rvu *rvu);
@@ -741,6 +744,7 @@ void npc_read_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam,
 bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature);
 u32  rvu_cgx_get_fifolen(struct rvu *rvu);
 void *rvu_first_cgx_pdata(struct rvu *rvu);
+int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id);
 
 int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf,
                             int type);
@@ -754,6 +758,9 @@ int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot);
 int rvu_set_channels_base(struct rvu *rvu);
 void rvu_program_channels(struct rvu *rvu);
 
+/* CN10K RVU - LMT*/
+void rvu_reset_lmt_map_tbl(struct rvu *rvu, u16 pcifunc);
+
 #ifdef CONFIG_DEBUG_FS
 void rvu_dbg_init(struct rvu *rvu);
 void rvu_dbg_exit(struct rvu *rvu);
index 6e2bf4f..6cc8fbb 100644 (file)
@@ -63,7 +63,7 @@ static u16 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id)
        return rvu->cgxlmac2pf_map[CGX_OFFSET(cgx_id) + lmac_id];
 }
 
-static int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id)
+int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id)
 {
        unsigned long pfmap;
 
@@ -454,6 +454,31 @@ int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start)
        return 0;
 }
 
+void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc)
+{
+       int pf = rvu_get_pf(pcifunc);
+       int i = 0, lmac_count = 0;
+       u8 max_dmac_filters;
+       u8 cgx_id, lmac_id;
+       void *cgx_dev;
+
+       if (!is_cgx_config_permitted(rvu, pcifunc))
+               return;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       cgx_dev = cgx_get_pdata(cgx_id);
+       lmac_count = cgx_get_lmac_cnt(cgx_dev);
+       max_dmac_filters = MAX_DMAC_ENTRIES_PER_CGX / lmac_count;
+
+       for (i = 0; i < max_dmac_filters; i++)
+               cgx_lmac_addr_del(cgx_id, lmac_id, i);
+
+       /* As cgx_lmac_addr_del does not clear entry for index 0
+        * so it needs to be done explicitly
+        */
+       cgx_lmac_addr_reset(cgx_id, lmac_id);
+}
+
 int rvu_mbox_handler_cgx_start_rxtx(struct rvu *rvu, struct msg_req *req,
                                    struct msg_rsp *rsp)
 {
@@ -557,6 +582,63 @@ int rvu_mbox_handler_cgx_mac_addr_set(struct rvu *rvu,
        return 0;
 }
 
+int rvu_mbox_handler_cgx_mac_addr_add(struct rvu *rvu,
+                                     struct cgx_mac_addr_add_req *req,
+                                     struct cgx_mac_addr_add_rsp *rsp)
+{
+       int pf = rvu_get_pf(req->hdr.pcifunc);
+       u8 cgx_id, lmac_id;
+       int rc = 0;
+
+       if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+               return -EPERM;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       rc = cgx_lmac_addr_add(cgx_id, lmac_id, req->mac_addr);
+       if (rc >= 0) {
+               rsp->index = rc;
+               return 0;
+       }
+
+       return rc;
+}
+
+int rvu_mbox_handler_cgx_mac_addr_del(struct rvu *rvu,
+                                     struct cgx_mac_addr_del_req *req,
+                                     struct msg_rsp *rsp)
+{
+       int pf = rvu_get_pf(req->hdr.pcifunc);
+       u8 cgx_id, lmac_id;
+
+       if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+               return -EPERM;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       return cgx_lmac_addr_del(cgx_id, lmac_id, req->index);
+}
+
+int rvu_mbox_handler_cgx_mac_max_entries_get(struct rvu *rvu,
+                                            struct msg_req *req,
+                                            struct cgx_max_dmac_entries_get_rsp
+                                            *rsp)
+{
+       int pf = rvu_get_pf(req->hdr.pcifunc);
+       u8 cgx_id, lmac_id;
+
+       /* If msg is received from PFs(which are not mapped to CGX LMACs)
+        * or VF then no entries are allocated for DMAC filters at CGX level.
+        * So returning zero.
+        */
+       if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc)) {
+               rsp->max_dmac_filters = 0;
+               return 0;
+       }
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       rsp->max_dmac_filters = cgx_lmac_addr_max_entries_get(cgx_id, lmac_id);
+       return 0;
+}
+
 int rvu_mbox_handler_cgx_mac_addr_get(struct rvu *rvu,
                                      struct cgx_mac_addr_set_or_get *req,
                                      struct cgx_mac_addr_set_or_get *rsp)
@@ -953,3 +1035,30 @@ int rvu_mbox_handler_cgx_set_link_mode(struct rvu *rvu,
        rsp->status = cgx_set_link_mode(cgxd, req->args, cgx_idx, lmac);
        return 0;
 }
+
+int rvu_mbox_handler_cgx_mac_addr_reset(struct rvu *rvu, struct msg_req *req,
+                                       struct msg_rsp *rsp)
+{
+       int pf = rvu_get_pf(req->hdr.pcifunc);
+       u8 cgx_id, lmac_id;
+
+       if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+               return -EPERM;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       return cgx_lmac_addr_reset(cgx_id, lmac_id);
+}
+
+int rvu_mbox_handler_cgx_mac_addr_update(struct rvu *rvu,
+                                        struct cgx_mac_addr_update_req *req,
+                                        struct msg_rsp *rsp)
+{
+       int pf = rvu_get_pf(req->hdr.pcifunc);
+       u8 cgx_id, lmac_id;
+
+       if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
+               return -EPERM;
+
+       rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+       return cgx_lmac_addr_update(cgx_id, lmac_id, req->mac_addr, req->index);
+}
index 7d9e71c..8d48b64 100644 (file)
 #include "cgx.h"
 #include "rvu_reg.h"
 
+/* RVU LMTST */
+#define LMT_TBL_OP_READ                0
+#define LMT_TBL_OP_WRITE       1
+#define LMT_MAP_TABLE_SIZE     (128 * 1024)
+#define LMT_MAPTBL_ENTRY_SIZE  16
+
+/* Function to perform operations (read/write) on lmtst map table */
+static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val,
+                              int lmt_tbl_op)
+{
+       void __iomem *lmt_map_base;
+       u64 tbl_base;
+
+       tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE);
+
+       lmt_map_base = ioremap_wc(tbl_base, LMT_MAP_TABLE_SIZE);
+       if (!lmt_map_base) {
+               dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n");
+               return -ENOMEM;
+       }
+
+       if (lmt_tbl_op == LMT_TBL_OP_READ) {
+               *val = readq(lmt_map_base + index);
+       } else {
+               writeq((*val), (lmt_map_base + index));
+               /* Flushing the AP interceptor cache to make APR_LMT_MAP_ENTRY_S
+                * changes effective. Write 1 for flush and read is being used as a
+                * barrier and sets up a data dependency. Write to 0 after a write
+                * to 1 to complete the flush.
+                */
+               rvu_write64(rvu, BLKADDR_APR, APR_AF_LMT_CTL, BIT_ULL(0));
+               rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CTL);
+               rvu_write64(rvu, BLKADDR_APR, APR_AF_LMT_CTL, 0x00);
+       }
+
+       iounmap(lmt_map_base);
+       return 0;
+}
+
+static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc)
+{
+       return ((rvu_get_pf(pcifunc) * rvu->hw->total_vfs) +
+               (pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE;
+}
+
+static int rvu_get_lmtaddr(struct rvu *rvu, u16 pcifunc,
+                          u64 iova, u64 *lmt_addr)
+{
+       u64 pa, val, pf;
+       int err;
+
+       if (!iova) {
+               dev_err(rvu->dev, "%s Requested Null address for transulation\n", __func__);
+               return -EINVAL;
+       }
+
+       rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_REQ, iova);
+       pf = rvu_get_pf(pcifunc) & 0x1F;
+       val = BIT_ULL(63) | BIT_ULL(14) | BIT_ULL(13) | pf << 8 |
+             ((pcifunc & RVU_PFVF_FUNC_MASK) & 0xFF);
+       rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TXN_REQ, val);
+
+       err = rvu_poll_reg(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_RSP_STS, BIT_ULL(0), false);
+       if (err) {
+               dev_err(rvu->dev, "%s LMTLINE iova transulation failed\n", __func__);
+               return err;
+       }
+       val = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_RSP_STS);
+       if (val & ~0x1ULL) {
+               dev_err(rvu->dev, "%s LMTLINE iova transulation failed err:%llx\n", __func__, val);
+               return -EIO;
+       }
+       /* PA[51:12] = RVU_AF_SMMU_TLN_FLIT1[60:21]
+        * PA[11:0] = IOVA[11:0]
+        */
+       pa = rvu_read64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TLN_FLIT1) >> 21;
+       pa &= GENMASK_ULL(39, 0);
+       *lmt_addr = (pa << 12) | (iova  & 0xFFF);
+
+       return 0;
+}
+
+static int rvu_update_lmtaddr(struct rvu *rvu, u16 pcifunc, u64 lmt_addr)
+{
+       struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+       u32 tbl_idx;
+       int err = 0;
+       u64 val;
+
+       /* Read the current lmt addr of pcifunc */
+       tbl_idx = rvu_get_lmtst_tbl_index(rvu, pcifunc);
+       err = lmtst_map_table_ops(rvu, tbl_idx, &val, LMT_TBL_OP_READ);
+       if (err) {
+               dev_err(rvu->dev,
+                       "Failed to read LMT map table: index 0x%x err %d\n",
+                       tbl_idx, err);
+               return err;
+       }
+
+       /* Storing the secondary's lmt base address as this needs to be
+        * reverted in FLR. Also making sure this default value doesn't
+        * get overwritten on multiple calls to this mailbox.
+        */
+       if (!pfvf->lmt_base_addr)
+               pfvf->lmt_base_addr = val;
+
+       /* Update the LMT table with new addr */
+       err = lmtst_map_table_ops(rvu, tbl_idx, &lmt_addr, LMT_TBL_OP_WRITE);
+       if (err) {
+               dev_err(rvu->dev,
+                       "Failed to update LMT map table: index 0x%x err %d\n",
+                       tbl_idx, err);
+               return err;
+       }
+       return 0;
+}
+
+int rvu_mbox_handler_lmtst_tbl_setup(struct rvu *rvu,
+                                    struct lmtst_tbl_setup_req *req,
+                                    struct msg_rsp *rsp)
+{
+       u64 lmt_addr, val;
+       u32 pri_tbl_idx;
+       int err = 0;
+
+       /* Check if PF_FUNC wants to use its own local memory as LMTLINE
+        * region, if so, convert that IOVA to physical address and
+        * populate LMT table with that address
+        */
+       if (req->use_local_lmt_region) {
+               err = rvu_get_lmtaddr(rvu, req->hdr.pcifunc,
+                                     req->lmt_iova, &lmt_addr);
+               if (err < 0)
+                       return err;
+
+               /* Update the lmt addr for this PFFUNC in the LMT table */
+               err = rvu_update_lmtaddr(rvu, req->hdr.pcifunc, lmt_addr);
+               if (err)
+                       return err;
+       }
+
+       /* Reconfiguring lmtst map table in lmt region shared mode i.e. make
+        * multiple PF_FUNCs to share an LMTLINE region, so primary/base
+        * pcifunc (which is passed as an argument to mailbox) is the one
+        * whose lmt base address will be shared among other secondary
+        * pcifunc (will be the one who is calling this mailbox).
+        */
+       if (req->base_pcifunc) {
+               /* Calculating the LMT table index equivalent to primary
+                * pcifunc.
+                */
+               pri_tbl_idx = rvu_get_lmtst_tbl_index(rvu, req->base_pcifunc);
+
+               /* Read the base lmt addr of the primary pcifunc */
+               err = lmtst_map_table_ops(rvu, pri_tbl_idx, &val,
+                                         LMT_TBL_OP_READ);
+               if (err) {
+                       dev_err(rvu->dev,
+                               "Failed to read LMT map table: index 0x%x err %d\n",
+                               pri_tbl_idx, err);
+                       return err;
+               }
+
+               /* Update the base lmt addr of secondary with primary's base
+                * lmt addr.
+                */
+               err = rvu_update_lmtaddr(rvu, req->hdr.pcifunc, val);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/* Resetting the lmtst map table to original base addresses */
+void rvu_reset_lmt_map_tbl(struct rvu *rvu, u16 pcifunc)
+{
+       struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
+       u32 tbl_idx;
+       int err;
+
+       if (is_rvu_otx2(rvu))
+               return;
+
+       if (pfvf->lmt_base_addr) {
+               /* This corresponds to lmt map table index */
+               tbl_idx = rvu_get_lmtst_tbl_index(rvu, pcifunc);
+               /* Reverting back original lmt base addr for respective
+                * pcifunc.
+                */
+               err = lmtst_map_table_ops(rvu, tbl_idx, &pfvf->lmt_base_addr,
+                                         LMT_TBL_OP_WRITE);
+               if (err)
+                       dev_err(rvu->dev,
+                               "Failed to update LMT map table: index 0x%x err %d\n",
+                               tbl_idx, err);
+               pfvf->lmt_base_addr = 0;
+       }
+}
+
 int rvu_set_channels_base(struct rvu *rvu)
 {
        struct rvu_hwinfo *hw = rvu->hw;
index 3cc3c6f..370d4ca 100644 (file)
@@ -1971,10 +1971,9 @@ static int cgx_print_stats(struct seq_file *s, int lmac_id)
        return err;
 }
 
-static int rvu_dbg_cgx_stat_display(struct seq_file *filp, void *unused)
+static int rvu_dbg_derive_lmacid(struct seq_file *filp, int *lmac_id)
 {
        struct dentry *current_dir;
-       int err, lmac_id;
        char *buf;
 
        current_dir = filp->file->f_path.dentry->d_parent;
@@ -1982,17 +1981,87 @@ static int rvu_dbg_cgx_stat_display(struct seq_file *filp, void *unused)
        if (!buf)
                return -EINVAL;
 
-       err = kstrtoint(buf + 1, 10, &lmac_id);
-       if (!err) {
-               err = cgx_print_stats(filp, lmac_id);
-               if (err)
-                       return err;
-       }
+       return kstrtoint(buf + 1, 10, lmac_id);
+}
+
+static int rvu_dbg_cgx_stat_display(struct seq_file *filp, void *unused)
+{
+       int lmac_id, err;
+
+       err = rvu_dbg_derive_lmacid(filp, &lmac_id);
+       if (!err)
+               return cgx_print_stats(filp, lmac_id);
+
        return err;
 }
 
 RVU_DEBUG_SEQ_FOPS(cgx_stat, cgx_stat_display, NULL);
 
+static int cgx_print_dmac_flt(struct seq_file *s, int lmac_id)
+{
+       struct pci_dev *pdev = NULL;
+       void *cgxd = s->private;
+       char *bcast, *mcast;
+       u16 index, domain;
+       u8 dmac[ETH_ALEN];
+       struct rvu *rvu;
+       u64 cfg, mac;
+       int pf;
+
+       rvu = pci_get_drvdata(pci_get_device(PCI_VENDOR_ID_CAVIUM,
+                                            PCI_DEVID_OCTEONTX2_RVU_AF, NULL));
+       if (!rvu)
+               return -ENODEV;
+
+       pf = cgxlmac_to_pf(rvu, cgx_get_cgxid(cgxd), lmac_id);
+       domain = 2;
+
+       pdev = pci_get_domain_bus_and_slot(domain, pf + 1, 0);
+       if (!pdev)
+               return 0;
+
+       cfg = cgx_read_dmac_ctrl(cgxd, lmac_id);
+       bcast = cfg & CGX_DMAC_BCAST_MODE ? "ACCEPT" : "REJECT";
+       mcast = cfg & CGX_DMAC_MCAST_MODE ? "ACCEPT" : "REJECT";
+
+       seq_puts(s,
+                "PCI dev       RVUPF   BROADCAST  MULTICAST  FILTER-MODE\n");
+       seq_printf(s, "%s  PF%d  %9s  %9s",
+                  dev_name(&pdev->dev), pf, bcast, mcast);
+       if (cfg & CGX_DMAC_CAM_ACCEPT)
+               seq_printf(s, "%12s\n\n", "UNICAST");
+       else
+               seq_printf(s, "%16s\n\n", "PROMISCUOUS");
+
+       seq_puts(s, "\nDMAC-INDEX  ADDRESS\n");
+
+       for (index = 0 ; index < 32 ; index++) {
+               cfg = cgx_read_dmac_entry(cgxd, index);
+               /* Display enabled dmac entries associated with current lmac */
+               if (lmac_id == FIELD_GET(CGX_DMAC_CAM_ENTRY_LMACID, cfg) &&
+                   FIELD_GET(CGX_DMAC_CAM_ADDR_ENABLE, cfg)) {
+                       mac = FIELD_GET(CGX_RX_DMAC_ADR_MASK, cfg);
+                       u64_to_ether_addr(mac, dmac);
+                       seq_printf(s, "%7d     %pM\n", index, dmac);
+               }
+       }
+
+       return 0;
+}
+
+static int rvu_dbg_cgx_dmac_flt_display(struct seq_file *filp, void *unused)
+{
+       int err, lmac_id;
+
+       err = rvu_dbg_derive_lmacid(filp, &lmac_id);
+       if (!err)
+               return cgx_print_dmac_flt(filp, lmac_id);
+
+       return err;
+}
+
+RVU_DEBUG_SEQ_FOPS(cgx_dmac_flt, cgx_dmac_flt_display, NULL);
+
 static void rvu_dbg_cgx_init(struct rvu *rvu)
 {
        struct mac_ops *mac_ops;
@@ -2029,6 +2098,9 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
 
                        debugfs_create_file("stats", 0600, rvu->rvu_dbg.lmac,
                                            cgx, &rvu_dbg_cgx_stat_fops);
+                       debugfs_create_file("mac_filter", 0600,
+                                           rvu->rvu_dbg.lmac, cgx,
+                                           &rvu_dbg_cgx_dmac_flt_fops);
                }
        }
 }
index d6f8210..aeae377 100644 (file)
@@ -346,6 +346,9 @@ static void nix_interface_deinit(struct rvu *rvu, u16 pcifunc, u8 nixlf)
 
        /* Free and disable any MCAM entries used by this NIX LF */
        rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf);
+
+       /* Disable DMAC filters used */
+       rvu_cgx_disable_dmac_entries(rvu, pcifunc);
 }
 
 int rvu_mbox_handler_nix_bp_disable(struct rvu *rvu,
index 76837d5..8b01ef6 100644 (file)
 #define RVU_AF_PFX_VF_BAR4_ADDR             (0x5400 | (a) << 4)
 #define RVU_AF_PFX_VF_BAR4_CFG              (0x5600 | (a) << 4)
 #define RVU_AF_PFX_LMTLINE_ADDR             (0x5800 | (a) << 4)
+#define RVU_AF_SMMU_ADDR_REQ               (0x6000)
+#define RVU_AF_SMMU_TXN_REQ                (0x6008)
+#define RVU_AF_SMMU_ADDR_RSP_STS           (0x6010)
+#define RVU_AF_SMMU_ADDR_TLN               (0x6018)
+#define RVU_AF_SMMU_TLN_FLIT1              (0x6030)
 
 /* Admin function's privileged PF/VF registers */
 #define RVU_PRIV_CONST                      (0x8000000)
 #define LBK_LINK_CFG_ID_MASK           GENMASK_ULL(11, 6)
 #define LBK_LINK_CFG_BASE_MASK         GENMASK_ULL(5, 0)
 
+/* APR */
+#define        APR_AF_LMT_CFG                  (0x000ull)
+#define        APR_AF_LMT_MAP_BASE             (0x008ull)
+#define        APR_AF_LMT_CTL                  (0x010ull)
+
 #endif /* RVU_REG_H */
index 14aa8e3..5bbe672 100644 (file)
@@ -35,7 +35,8 @@ enum rvu_block_addr_e {
        BLKADDR_NDC_NPA0        = 0xeULL,
        BLKADDR_NDC_NIX1_RX     = 0x10ULL,
        BLKADDR_NDC_NIX1_TX     = 0x11ULL,
-       BLK_COUNT               = 0x12ULL,
+       BLKADDR_APR             = 0x16ULL,
+       BLK_COUNT               = 0x17ULL,
 };
 
 /* RVU Block Type Enumeration */
index 457c947..3254b02 100644 (file)
@@ -7,7 +7,7 @@ obj-$(CONFIG_OCTEONTX2_PF) += rvu_nicpf.o
 obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o
 
 rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
-                    otx2_ptp.o otx2_flows.o otx2_tc.o cn10k.o
+               otx2_ptp.o otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o
 rvu_nicvf-y := otx2_vf.o
 
 ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af
index 1b08896..184de94 100644 (file)
@@ -22,69 +22,52 @@ static struct dev_hw_ops cn10k_hw_ops = {
        .refill_pool_ptrs = cn10k_refill_pool_ptrs,
 };
 
-int cn10k_pf_lmtst_init(struct otx2_nic *pf)
+int cn10k_lmtst_init(struct otx2_nic *pfvf)
 {
-       int size, num_lines;
-       u64 base;
 
-       if (!test_bit(CN10K_LMTST, &pf->hw.cap_flag)) {
-               pf->hw_ops = &otx2_hw_ops;
+       struct lmtst_tbl_setup_req *req;
+       int qcount, err;
+
+       if (!test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) {
+               pfvf->hw_ops = &otx2_hw_ops;
                return 0;
        }
 
-       pf->hw_ops = &cn10k_hw_ops;
-       base = pci_resource_start(pf->pdev, PCI_MBOX_BAR_NUM) +
-                      (MBOX_SIZE * (pf->total_vfs + 1));
-
-       size = pci_resource_len(pf->pdev, PCI_MBOX_BAR_NUM) -
-              (MBOX_SIZE * (pf->total_vfs + 1));
-
-       pf->hw.lmt_base = ioremap(base, size);
+       pfvf->hw_ops = &cn10k_hw_ops;
+       qcount = pfvf->hw.max_queues;
+       /* LMTST lines allocation
+        * qcount = num_online_cpus();
+        * NPA = TX + RX + XDP.
+        * NIX = TX * 32 (For Burst SQE flush).
+        */
+       pfvf->tot_lmt_lines = (qcount * 3) + (qcount * 32);
+       pfvf->npa_lmt_lines = qcount * 3;
+       pfvf->nix_lmt_size =  LMT_BURST_SIZE * LMT_LINE_SIZE;
 
-       if (!pf->hw.lmt_base) {
-               dev_err(pf->dev, "Unable to map PF LMTST region\n");
+       mutex_lock(&pfvf->mbox.lock);
+       req = otx2_mbox_alloc_msg_lmtst_tbl_setup(&pfvf->mbox);
+       if (!req) {
+               mutex_unlock(&pfvf->mbox.lock);
                return -ENOMEM;
        }
 
-       /* FIXME: Get the num of LMTST lines from LMT table */
-       pf->tot_lmt_lines = size / LMT_LINE_SIZE;
-       num_lines = (pf->tot_lmt_lines - NIX_LMTID_BASE) /
-                           pf->hw.tx_queues;
-       /* Number of LMT lines per SQ queues */
-       pf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
-
-       pf->nix_lmt_size = pf->nix_lmt_lines * LMT_LINE_SIZE;
-       return 0;
-}
+       req->use_local_lmt_region = true;
 
-int cn10k_vf_lmtst_init(struct otx2_nic *vf)
-{
-       int size, num_lines;
-
-       if (!test_bit(CN10K_LMTST, &vf->hw.cap_flag)) {
-               vf->hw_ops = &otx2_hw_ops;
-               return 0;
+       err = qmem_alloc(pfvf->dev, &pfvf->dync_lmt, pfvf->tot_lmt_lines,
+                        LMT_LINE_SIZE);
+       if (err) {
+               mutex_unlock(&pfvf->mbox.lock);
+               return err;
        }
+       pfvf->hw.lmt_base = (u64 *)pfvf->dync_lmt->base;
+       req->lmt_iova = (u64)pfvf->dync_lmt->iova;
 
-       vf->hw_ops = &cn10k_hw_ops;
-       size = pci_resource_len(vf->pdev, PCI_MBOX_BAR_NUM);
-       vf->hw.lmt_base = ioremap_wc(pci_resource_start(vf->pdev,
-                                                       PCI_MBOX_BAR_NUM),
-                                    size);
-       if (!vf->hw.lmt_base) {
-               dev_err(vf->dev, "Unable to map VF LMTST region\n");
-               return -ENOMEM;
-       }
+       err = otx2_sync_mbox_msg(&pfvf->mbox);
+       mutex_unlock(&pfvf->mbox.lock);
 
-       vf->tot_lmt_lines = size / LMT_LINE_SIZE;
-       /* LMTST lines per SQ */
-       num_lines = (vf->tot_lmt_lines - NIX_LMTID_BASE) /
-                           vf->hw.tx_queues;
-       vf->nix_lmt_lines = num_lines > 32 ? 32 : num_lines;
-       vf->nix_lmt_size = vf->nix_lmt_lines * LMT_LINE_SIZE;
        return 0;
 }
-EXPORT_SYMBOL(cn10k_vf_lmtst_init);
+EXPORT_SYMBOL(cn10k_lmtst_init);
 
 int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
 {
@@ -93,9 +76,11 @@ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
        struct otx2_snd_queue *sq;
 
        sq = &pfvf->qset.sq[qidx];
-       sq->lmt_addr = (__force u64 *)((u64)pfvf->hw.nix_lmt_base +
+       sq->lmt_addr = (u64 *)((u64)pfvf->hw.nix_lmt_base +
                               (qidx * pfvf->nix_lmt_size));
 
+       sq->lmt_id = pfvf->npa_lmt_lines + (qidx * LMT_BURST_SIZE);
+
        /* Get memory to put this msg */
        aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox);
        if (!aq)
@@ -158,15 +143,13 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq)
 
 void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx)
 {
-       struct otx2_nic *pfvf = dev;
-       int lmt_id = NIX_LMTID_BASE + (qidx * pfvf->nix_lmt_lines);
        u64 val = 0, tar_addr = 0;
 
        /* FIXME: val[0:10] LMT_ID.
         * [12:15] no of LMTST - 1 in the burst.
         * [19:63] data size of each LMTST in the burst except first.
         */
-       val = (lmt_id & 0x7FF);
+       val = (sq->lmt_id & 0x7FF);
        /* Target address for LMTST flush tells HW how many 128bit
         * words are present.
         * tar_addr[6:4] size of first LMTST - 1 in units of 128b.
index 71292a4..1a1ae33 100644 (file)
@@ -12,8 +12,7 @@
 void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq);
 void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx);
 int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
-int cn10k_pf_lmtst_init(struct otx2_nic *pf);
-int cn10k_vf_lmtst_init(struct otx2_nic *vf);
+int cn10k_lmtst_init(struct otx2_nic *pfvf);
 int cn10k_free_all_ipolicers(struct otx2_nic *pfvf);
 int cn10k_alloc_matchall_ipolicer(struct otx2_nic *pfvf);
 int cn10k_free_matchall_ipolicer(struct otx2_nic *pfvf);
index cf7875d..7cccd80 100644 (file)
@@ -210,6 +210,9 @@ int otx2_set_mac_address(struct net_device *netdev, void *p)
                /* update dmac field in vlan offload rule */
                if (pfvf->flags & OTX2_FLAG_RX_VLAN_SUPPORT)
                        otx2_install_rxvlan_offload_flow(pfvf);
+               /* update dmac address in ntuple and DMAC filter list */
+               if (pfvf->flags & OTX2_FLAG_DMACFLTR_SUPPORT)
+                       otx2_dmacflt_update_pfmac_flow(pfvf);
        } else {
                return -EPERM;
        }
index 234b330..8fd58cd 100644 (file)
@@ -218,8 +218,8 @@ struct otx2_hw {
        unsigned long           cap_flag;
 
 #define LMT_LINE_SIZE          128
-#define NIX_LMTID_BASE         72 /* RX + TX + XDP */
-       void __iomem            *lmt_base;
+#define LMT_BURST_SIZE         32 /* 32 LMTST lines for burst SQE flush */
+       u64                     *lmt_base;
        u64                     *npa_lmt_base;
        u64                     *nix_lmt_base;
 };
@@ -288,6 +288,9 @@ struct otx2_flow_config {
        u16                     tc_flower_offset;
        u16                     ntuple_max_flows;
        u16                     tc_max_flows;
+       u8                      dmacflt_max_flows;
+       u8                      *bmap_to_dmacindex;
+       unsigned long           dmacflt_bmap;
        struct list_head        flow_list;
 };
 
@@ -329,6 +332,7 @@ struct otx2_nic {
 #define OTX2_FLAG_TC_FLOWER_SUPPORT            BIT_ULL(11)
 #define OTX2_FLAG_TC_MATCHALL_EGRESS_ENABLED   BIT_ULL(12)
 #define OTX2_FLAG_TC_MATCHALL_INGRESS_ENABLED  BIT_ULL(13)
+#define OTX2_FLAG_DMACFLTR_SUPPORT             BIT_ULL(14)
        u64                     flags;
 
        struct otx2_qset        qset;
@@ -363,8 +367,9 @@ struct otx2_nic {
        /* Block address of NIX either BLKADDR_NIX0 or BLKADDR_NIX1 */
        int                     nix_blkaddr;
        /* LMTST Lines info */
+       struct qmem             *dync_lmt;
        u16                     tot_lmt_lines;
-       u16                     nix_lmt_lines;
+       u16                     npa_lmt_lines;
        u32                     nix_lmt_size;
 
        struct otx2_ptp         *ptp;
@@ -833,4 +838,11 @@ int otx2_init_tc(struct otx2_nic *nic);
 void otx2_shutdown_tc(struct otx2_nic *nic);
 int otx2_setup_tc(struct net_device *netdev, enum tc_setup_type type,
                  void *type_data);
+/* CGX/RPM DMAC filters support */
+int otx2_dmacflt_get_max_cnt(struct otx2_nic *pf);
+int otx2_dmacflt_add(struct otx2_nic *pf, const u8 *mac, u8 bit_pos);
+int otx2_dmacflt_remove(struct otx2_nic *pf, const u8 *mac, u8 bit_pos);
+int otx2_dmacflt_update(struct otx2_nic *pf, u8 *mac, u8 bit_pos);
+void otx2_dmacflt_reinstall_flows(struct otx2_nic *pf);
+void otx2_dmacflt_update_pfmac_flow(struct otx2_nic *pfvf);
 #endif /* OTX2_COMMON_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_dmac_flt.c
new file mode 100644 (file)
index 0000000..383a6b5
--- /dev/null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell OcteonTx2 RVU Physical Function ethernet driver
+ *
+ * Copyright (C) 2021 Marvell.
+ */
+
+#include "otx2_common.h"
+
+/* Send a cgx_mac_addr_add mailbox request for @mac and, on success,
+ * store the index carried in the response into @dmac_index.
+ *
+ * Returns 0 on success, -ENOMEM when no mailbox message could be
+ * allocated, the result of otx2_sync_mbox_msg() when it fails, or the
+ * error encoded in the response pointer.
+ */
+static int otx2_dmacflt_do_add(struct otx2_nic *pf, const u8 *mac,
+                              u8 *dmac_index)
+{
+       struct cgx_mac_addr_add_req *req;
+       struct cgx_mac_addr_add_rsp *rsp;
+       int err;
+
+       mutex_lock(&pf->mbox.lock);
+
+       req = otx2_mbox_alloc_msg_cgx_mac_addr_add(&pf->mbox);
+       if (!req) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       ether_addr_copy(req->mac_addr, mac);
+       err = otx2_sync_mbox_msg(&pf->mbox);
+       if (err)
+               goto out;
+
+       rsp = (struct cgx_mac_addr_add_rsp *)
+                otx2_mbox_get_rsp(&pf->mbox.mbox, 0, &req->hdr);
+       /* The response lookup may yield an error pointer; do not
+        * dereference it in that case.
+        */
+       if (IS_ERR(rsp)) {
+               err = PTR_ERR(rsp);
+               goto out;
+       }
+
+       *dmac_index = rsp->index;
+
+out:
+       mutex_unlock(&pf->mbox.lock);
+       return err;
+}
+
+/* Send a cgx_mac_addr_set mailbox request carrying the netdev's current
+ * dev_addr.
+ *
+ * Returns -ENOMEM when the message cannot be allocated, otherwise the
+ * result of otx2_sync_mbox_msg().
+ */
+static int otx2_dmacflt_add_pfmac(struct otx2_nic *pf)
+{
+       struct cgx_mac_addr_set_or_get *msg;
+       int rc = -ENOMEM;
+
+       mutex_lock(&pf->mbox.lock);
+
+       msg = otx2_mbox_alloc_msg_cgx_mac_addr_set(&pf->mbox);
+       if (msg) {
+               ether_addr_copy(msg->mac_addr, pf->netdev->dev_addr);
+               rc = otx2_sync_mbox_msg(&pf->mbox);
+       }
+
+       mutex_unlock(&pf->mbox.lock);
+       return rc;
+}
+
+/* Install @mac as a DMAC filter for bitmap position @bit_pos.
+ *
+ * The interface's own address is programmed through
+ * otx2_dmacflt_add_pfmac(); every other address goes through
+ * otx2_dmacflt_do_add().
+ */
+int otx2_dmacflt_add(struct otx2_nic *pf, const u8 *mac, u8 bit_pos)
+{
+       /* Slot recording the dmacindex returned by the CGX/RPM driver;
+        * later macaddr update/remove requests use it.
+        */
+       u8 *slot = &pf->flow_cfg->bmap_to_dmacindex[bit_pos];
+
+       if (!ether_addr_equal(mac, pf->netdev->dev_addr))
+               return otx2_dmacflt_do_add(pf, mac, slot);
+
+       return otx2_dmacflt_add_pfmac(pf);
+}
+
+/* Send a cgx_mac_addr_del mailbox request for the entry at @dmac_index.
+ * @mac is accepted for symmetry with the add path but is not used here.
+ *
+ * Returns -ENOMEM when the message cannot be allocated, otherwise the
+ * result of otx2_sync_mbox_msg().
+ */
+static int otx2_dmacflt_do_remove(struct otx2_nic *pfvf, const u8 *mac,
+                                 u8 dmac_index)
+{
+       struct cgx_mac_addr_del_req *msg;
+       int rc = -ENOMEM;
+
+       mutex_lock(&pfvf->mbox.lock);
+
+       msg = otx2_mbox_alloc_msg_cgx_mac_addr_del(&pfvf->mbox);
+       if (msg) {
+               msg->index = dmac_index;
+               rc = otx2_sync_mbox_msg(&pfvf->mbox);
+       }
+
+       mutex_unlock(&pfvf->mbox.lock);
+       return rc;
+}
+
+/* Send a cgx_mac_addr_reset mailbox request.
+ *
+ * Returns -ENOMEM when the message cannot be allocated, otherwise the
+ * result of otx2_sync_mbox_msg().
+ */
+static int otx2_dmacflt_remove_pfmac(struct otx2_nic *pf)
+{
+       struct msg_req *msg;
+       int rc = -ENOMEM;
+
+       mutex_lock(&pf->mbox.lock);
+
+       msg = otx2_mbox_alloc_msg_cgx_mac_addr_reset(&pf->mbox);
+       if (msg)
+               rc = otx2_sync_mbox_msg(&pf->mbox);
+
+       mutex_unlock(&pf->mbox.lock);
+       return rc;
+}
+
+/* Remove the DMAC filter for @mac at bitmap position @bit_pos.
+ *
+ * The interface's own address is handled by otx2_dmacflt_remove_pfmac();
+ * any other address is deleted via the dmacindex recorded for @bit_pos.
+ */
+int otx2_dmacflt_remove(struct otx2_nic *pf, const u8 *mac,
+                       u8 bit_pos)
+{
+       u8 hw_index = pf->flow_cfg->bmap_to_dmacindex[bit_pos];
+
+       if (!ether_addr_equal(mac, pf->netdev->dev_addr))
+               return otx2_dmacflt_do_remove(pf, mac, hw_index);
+
+       return otx2_dmacflt_remove_pfmac(pf);
+}
+
+/* CGX/RPM blocks support a maximum of 32 unicast entries.
+ * In a typical configuration the MAC block is associated
+ * with 4 LMACs, so each LMAC has 8 DMAC entries.
+ *
+ * Query the limit with a cgx_mac_max_entries_get mailbox request and
+ * store it in pf->flow_cfg->dmacflt_max_flows.
+ *
+ * Returns 0 on success, -ENOMEM when no mailbox message could be
+ * allocated, the result of otx2_sync_mbox_msg() when it fails, or the
+ * error encoded in the response pointer. dmacflt_max_flows is left
+ * untouched on failure.
+ */
+int otx2_dmacflt_get_max_cnt(struct otx2_nic *pf)
+{
+       struct cgx_max_dmac_entries_get_rsp *rsp;
+       struct msg_req *msg;
+       int err;
+
+       mutex_lock(&pf->mbox.lock);
+
+       msg = otx2_mbox_alloc_msg_cgx_mac_max_entries_get(&pf->mbox);
+       if (!msg) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = otx2_sync_mbox_msg(&pf->mbox);
+       if (err)
+               goto out;
+
+       rsp = (struct cgx_max_dmac_entries_get_rsp *)
+                    otx2_mbox_get_rsp(&pf->mbox.mbox, 0, &msg->hdr);
+       /* The response lookup may yield an error pointer; do not
+        * dereference it in that case.
+        */
+       if (IS_ERR(rsp)) {
+               err = PTR_ERR(rsp);
+               goto out;
+       }
+
+       pf->flow_cfg->dmacflt_max_flows = rsp->max_dmac_filters;
+
+out:
+       mutex_unlock(&pf->mbox.lock);
+       return err;
+}
+
+/* Send a cgx_mac_addr_update mailbox request that rewrites the entry
+ * recorded for bitmap position @bit_pos with @mac.
+ *
+ * Returns -ENOMEM when the message cannot be allocated, otherwise the
+ * result of otx2_sync_mbox_msg().
+ */
+int otx2_dmacflt_update(struct otx2_nic *pf, u8 *mac, u8 bit_pos)
+{
+       struct cgx_mac_addr_update_req *msg;
+       int err = -ENOMEM;
+
+       mutex_lock(&pf->mbox.lock);
+
+       msg = otx2_mbox_alloc_msg_cgx_mac_addr_update(&pf->mbox);
+       if (msg) {
+               ether_addr_copy(msg->mac_addr, mac);
+               msg->index = pf->flow_cfg->bmap_to_dmacindex[bit_pos];
+               err = otx2_sync_mbox_msg(&pf->mbox);
+       }
+
+       mutex_unlock(&pf->mbox.lock);
+       return err;
+}
index 8c97106..4d9de52 100644 (file)
@@ -18,6 +18,12 @@ struct otx2_flow {
        bool is_vf;
        u8 rss_ctx_id;
        int vf;
+       bool dmac_filter;
+};
+
+/* Request selector passed to otx2_update_rem_pfmac() */
+enum dmac_req {
+       /* Rewrite the entry-0 dmac filter with the netdev's current dev_addr */
+       DMAC_ADDR_UPDATE,
+       /* Remove the entry-0 dmac filter and drop its flow from the list */
+       DMAC_ADDR_DEL
 };
 
 static void otx2_clear_ntuple_flow_info(struct otx2_nic *pfvf, struct otx2_flow_config *flow_cfg)
@@ -219,6 +225,22 @@ int otx2_mcam_flow_init(struct otx2_nic *pf)
        if (!pf->mac_table)
                return -ENOMEM;
 
+       otx2_dmacflt_get_max_cnt(pf);
+
+       /* DMAC filters are not allocated */
+       if (!pf->flow_cfg->dmacflt_max_flows)
+               return 0;
+
+       pf->flow_cfg->bmap_to_dmacindex =
+                       devm_kzalloc(pf->dev, sizeof(u8) *
+                                    pf->flow_cfg->dmacflt_max_flows,
+                                    GFP_KERNEL);
+
+       if (!pf->flow_cfg->bmap_to_dmacindex)
+               return -ENOMEM;
+
+       pf->flags |= OTX2_FLAG_DMACFLTR_SUPPORT;
+
        return 0;
 }
 
@@ -280,6 +302,12 @@ int otx2_add_macfilter(struct net_device *netdev, const u8 *mac)
 {
        struct otx2_nic *pf = netdev_priv(netdev);
 
+       if (bitmap_weight(&pf->flow_cfg->dmacflt_bmap,
+                         pf->flow_cfg->dmacflt_max_flows))
+               netdev_warn(netdev,
+                           "Add %pM to CGX/RPM DMAC filters list as well\n",
+                           mac);
+
        return otx2_do_add_macfilter(pf, mac);
 }
 
@@ -351,12 +379,22 @@ static void otx2_add_flow_to_list(struct otx2_nic *pfvf, struct otx2_flow *flow)
        list_add(&flow->list, head);
 }
 
+/* Upper bound (exclusive) on flow locations currently accepted.
+ *
+ * The DMAC filter range is appended to the ntuple range once the ntuple
+ * range is fully used or at least one DMAC filter bit is set.
+ */
+static int otx2_get_maxflows(struct otx2_flow_config *flow_cfg)
+{
+       int limit = flow_cfg->ntuple_max_flows;
+
+       if (flow_cfg->nr_flows == flow_cfg->ntuple_max_flows ||
+           bitmap_weight(&flow_cfg->dmacflt_bmap,
+                         flow_cfg->dmacflt_max_flows))
+               limit += flow_cfg->dmacflt_max_flows;
+
+       return limit;
+}
+
 int otx2_get_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
                  u32 location)
 {
        struct otx2_flow *iter;
 
-       if (location >= pfvf->flow_cfg->ntuple_max_flows)
+       if (location >= otx2_get_maxflows(pfvf->flow_cfg))
                return -EINVAL;
 
        list_for_each_entry(iter, &pfvf->flow_cfg->flow_list, list) {
@@ -378,7 +416,7 @@ int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
        int idx = 0;
        int err = 0;
 
-       nfc->data = pfvf->flow_cfg->ntuple_max_flows;
+       nfc->data = otx2_get_maxflows(pfvf->flow_cfg);
        while ((!err || err == -ENOENT) && idx < rule_cnt) {
                err = otx2_get_flow(pfvf, nfc, location);
                if (!err)
@@ -760,6 +798,32 @@ int otx2_prepare_flow_request(struct ethtool_rx_flow_spec *fsp,
        return 0;
 }
 
+/* Decide whether the ethtool flow spec @fsp should be handled as a
+ * CGX/RPM DMAC filter.
+ *
+ * Returns true only when DMAC filter support is flagged on @pfvf, the
+ * flow type (ignoring the EXT/MAC_EXT/RSS modifiers) is ETHER_FLOW, the
+ * action is neither a drop nor directed at a VF, the destination MAC
+ * mask is all zeroes and the destination MAC itself is a valid address.
+ */
+static bool otx2_is_flow_rule_dmacfilter(struct otx2_nic *pfvf,
+                                        struct ethtool_rx_flow_spec *fsp)
+{
+       struct ethhdr *eth_mask = &fsp->m_u.ether_spec;
+       struct ethhdr *eth_hdr = &fsp->h_u.ether_spec;
+       u64 ring_cookie = fsp->ring_cookie;
+       u32 flow_type;
+
+       if (!(pfvf->flags & OTX2_FLAG_DMACFLTR_SUPPORT))
+               return false;
+
+       flow_type = fsp->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS);
+
+       /* CGX/RPM block dmac filtering is configured for white listing,
+        * so only actions other than DROP qualify
+        */
+       if (flow_type != ETHER_FLOW || ring_cookie == RX_CLS_FLOW_DISC ||
+           ethtool_get_flow_spec_ring_vf(ring_cookie))
+               return false;
+
+       return is_zero_ether_addr(eth_mask->h_dest) &&
+              is_valid_ether_addr(eth_hdr->h_dest);
+}
+
 static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
 {
        u64 ring_cookie = flow->flow_spec.ring_cookie;
@@ -818,14 +882,46 @@ static int otx2_add_flow_msg(struct otx2_nic *pfvf, struct otx2_flow *flow)
        return err;
 }
 
+/* Create the companion flow that carries the interface's own MAC address
+ * in DMAC filter entry 0.
+ *
+ * The new flow clones @flow's spec, is placed at location
+ * ntuple_max_flows, has its destination MAC replaced by the netdev's
+ * dev_addr, and is linked into the flow list with bit 0 of dmacflt_bmap
+ * set.
+ *
+ * Returns 0 on success or -ENOMEM when the flow cannot be allocated.
+ */
+static int otx2_add_flow_with_pfmac(struct otx2_nic *pfvf,
+                                   struct otx2_flow *flow)
+{
+       struct otx2_flow_config *cfg = pfvf->flow_cfg;
+       struct otx2_flow *pfmac_flow;
+       u8 *dmac;
+
+       pfmac_flow = kzalloc(sizeof(*pfmac_flow), GFP_KERNEL);
+       if (!pfmac_flow)
+               return -ENOMEM;
+
+       pfmac_flow->entry = 0;
+       pfmac_flow->dmac_filter = true;
+       pfmac_flow->location = cfg->ntuple_max_flows;
+       memcpy(&pfmac_flow->flow_spec, &flow->flow_spec,
+              sizeof(struct ethtool_rx_flow_spec));
+       pfmac_flow->flow_spec.location = pfmac_flow->location;
+
+       /* Overwrite the cloned destination with the PF mac address */
+       dmac = pfmac_flow->flow_spec.h_u.ether_spec.h_dest;
+       ether_addr_copy(dmac, pfvf->netdev->dev_addr);
+
+       /* Install DMAC filter with PF mac address.
+        * NOTE(review): the result of otx2_dmacflt_add() is not checked,
+        * so the flow is recorded even if the request fails - confirm
+        * this is intended.
+        */
+       otx2_dmacflt_add(pfvf, dmac, 0);
+
+       otx2_add_flow_to_list(pfvf, pfmac_flow);
+       cfg->nr_flows++;
+       set_bit(0, &cfg->dmacflt_bmap);
+
+       return 0;
+}
+
 int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
 {
        struct otx2_flow_config *flow_cfg = pfvf->flow_cfg;
        struct ethtool_rx_flow_spec *fsp = &nfc->fs;
        struct otx2_flow *flow;
+       struct ethhdr *eth_hdr;
        bool new = false;
+       int err = 0;
        u32 ring;
-       int err;
 
        ring = ethtool_get_flow_spec_ring(fsp->ring_cookie);
        if (!(pfvf->flags & OTX2_FLAG_NTUPLE_SUPPORT))
@@ -834,16 +930,15 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
        if (ring >= pfvf->hw.rx_queues && fsp->ring_cookie != RX_CLS_FLOW_DISC)
                return -EINVAL;
 
-       if (fsp->location >= flow_cfg->ntuple_max_flows)
+       if (fsp->location >= otx2_get_maxflows(flow_cfg))
                return -EINVAL;
 
        flow = otx2_find_flow(pfvf, fsp->location);
        if (!flow) {
-               flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+               flow = kzalloc(sizeof(*flow), GFP_KERNEL);
                if (!flow)
                        return -ENOMEM;
                flow->location = fsp->location;
-               flow->entry = flow_cfg->flow_ent[flow->location];
                new = true;
        }
        /* struct copy */
@@ -852,7 +947,54 @@ int otx2_add_flow(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc)
        if (fsp->flow_type & FLOW_RSS)
                flow->rss_ctx_id = nfc->rss_context;
 
-       err = otx2_add_flow_msg(pfvf, flow);
+       if (otx2_is_flow_rule_dmacfilter(pfvf, &flow->flow_spec)) {
+               eth_hdr = &flow->flow_spec.h_u.ether_spec;
+
+               /* Sync dmac filter table with updated fields */
+               if (flow->dmac_filter)
+                       return otx2_dmacflt_update(pfvf, eth_hdr->h_dest,
+                                                  flow->entry);
+
+               if (bitmap_full(&flow_cfg->dmacflt_bmap,
+                               flow_cfg->dmacflt_max_flows)) {
+                       netdev_warn(pfvf->netdev,
+                                   "Can't insert the rule %d as max allowed dmac filters are %d\n",
+                                   flow->location +
+                                   flow_cfg->dmacflt_max_flows,
+                                   flow_cfg->dmacflt_max_flows);
+                       err = -EINVAL;
+                       if (new)
+                               kfree(flow);
+                       return err;
+               }
+
+               /* Install PF mac address to DMAC filter list */
+               if (!test_bit(0, &flow_cfg->dmacflt_bmap))
+                       otx2_add_flow_with_pfmac(pfvf, flow);
+
+               flow->dmac_filter = true;
+               flow->entry = find_first_zero_bit(&flow_cfg->dmacflt_bmap,
+                                                 flow_cfg->dmacflt_max_flows);
+               fsp->location = flow_cfg->ntuple_max_flows + flow->entry;
+               flow->flow_spec.location = fsp->location;
+               flow->location = fsp->location;
+
+               set_bit(flow->entry, &flow_cfg->dmacflt_bmap);
+               otx2_dmacflt_add(pfvf, eth_hdr->h_dest, flow->entry);
+
+       } else {
+               if (flow->location >= pfvf->flow_cfg->ntuple_max_flows) {
+                       netdev_warn(pfvf->netdev,
+                                   "Can't insert non dmac ntuple rule at %d, allowed range %d-0\n",
+                                   flow->location,
+                                   flow_cfg->ntuple_max_flows - 1);
+                       err = -EINVAL;
+               } else {
+                       flow->entry = flow_cfg->flow_ent[flow->location];
+                       err = otx2_add_flow_msg(pfvf, flow);
+               }
+       }
+
        if (err) {
                if (new)
                        kfree(flow);
@@ -890,20 +1032,70 @@ static int otx2_remove_flow_msg(struct otx2_nic *pfvf, u16 entry, bool all)
        return err;
 }
 
+/* Update or delete the dmac filter flow that occupies entry 0.
+ *
+ * With @req == DMAC_ADDR_DEL the filter is removed, bit 0 of
+ * dmacflt_bmap is cleared and the flow is unlinked and freed. For any
+ * other @req the flow's destination MAC is overwritten with the netdev's
+ * current dev_addr and the filter is updated to match. Nothing happens
+ * when no such flow is on the list.
+ */
+static void otx2_update_rem_pfmac(struct otx2_nic *pfvf, int req)
+{
+       struct otx2_flow *pfmac_flow = NULL;
+       struct otx2_flow *cur;
+       u8 *dmac;
+
+       /* Locate the first dmac filter flow sitting in entry 0 */
+       list_for_each_entry(cur, &pfvf->flow_cfg->flow_list, list) {
+               if (cur->dmac_filter && cur->entry == 0) {
+                       pfmac_flow = cur;
+                       break;
+               }
+       }
+
+       if (!pfmac_flow)
+               return;
+
+       dmac = pfmac_flow->flow_spec.h_u.ether_spec.h_dest;
+
+       if (req != DMAC_ADDR_DEL) {
+               ether_addr_copy(dmac, pfvf->netdev->dev_addr);
+               otx2_dmacflt_update(pfvf, dmac, 0);
+               return;
+       }
+
+       otx2_dmacflt_remove(pfvf, dmac, 0);
+       clear_bit(0, &pfvf->flow_cfg->dmacflt_bmap);
+
+       list_del(&pfmac_flow->list);
+       kfree(pfmac_flow);
+       pfvf->flow_cfg->nr_flows--;
+}
+
 int otx2_remove_flow(struct otx2_nic *pfvf, u32 location)
 {
        struct otx2_flow_config *flow_cfg = pfvf->flow_cfg;
        struct otx2_flow *flow;
        int err;
 
-       if (location >= flow_cfg->ntuple_max_flows)
+       if (location >= otx2_get_maxflows(flow_cfg))
                return -EINVAL;
 
        flow = otx2_find_flow(pfvf, location);
        if (!flow)
                return -ENOENT;
 
-       err = otx2_remove_flow_msg(pfvf, flow->entry, false);
+       if (flow->dmac_filter) {
+               struct ethhdr *eth_hdr = &flow->flow_spec.h_u.ether_spec;
+
+               /* user not allowed to remove dmac filter with interface mac */
+               if (ether_addr_equal(pfvf->netdev->dev_addr, eth_hdr->h_dest))
+                       return -EPERM;
+
+               err = otx2_dmacflt_remove(pfvf, eth_hdr->h_dest,
+                                         flow->entry);
+               clear_bit(flow->entry, &flow_cfg->dmacflt_bmap);
+               /* If all dmac filters are removed delete macfilter with
+                * interface mac address and configure CGX/RPM block in
+                * promiscuous mode
+                */
+               if (bitmap_weight(&flow_cfg->dmacflt_bmap,
+                                 flow_cfg->dmacflt_max_flows) == 1)
+                       otx2_update_rem_pfmac(pfvf, DMAC_ADDR_DEL);
+       } else {
+               err = otx2_remove_flow_msg(pfvf, flow->entry, false);
+       }
+
        if (err)
                return err;
 
@@ -1100,3 +1292,22 @@ int otx2_enable_rxvlan(struct otx2_nic *pf, bool enable)
        mutex_unlock(&pf->mbox.lock);
        return rsp_hdr->rc;
 }
+
+/* Walk the flow list and re-issue otx2_dmacflt_add() for every flow
+ * marked as a dmac filter, using its stored destination MAC and entry.
+ * Results of the individual add requests are not checked.
+ */
+void otx2_dmacflt_reinstall_flows(struct otx2_nic *pf)
+{
+       struct otx2_flow *flow;
+
+       list_for_each_entry(flow, &pf->flow_cfg->flow_list, list) {
+               if (!flow->dmac_filter)
+                       continue;
+
+               otx2_dmacflt_add(pf, flow->flow_spec.h_u.ether_spec.h_dest,
+                                flow->entry);
+       }
+}
+
+/* Refresh the entry-0 dmac filter flow with the netdev's current
+ * dev_addr by issuing a DMAC_ADDR_UPDATE request to
+ * otx2_update_rem_pfmac().
+ */
+void otx2_dmacflt_update_pfmac_flow(struct otx2_nic *pfvf)
+{
+       otx2_update_rem_pfmac(pfvf, DMAC_ADDR_UPDATE);
+}
index 59912f7..f300b80 100644 (file)
@@ -1110,6 +1110,11 @@ static int otx2_cgx_config_loopback(struct otx2_nic *pf, bool enable)
        struct msg_req *msg;
        int err;
 
+       if (enable && bitmap_weight(&pf->flow_cfg->dmacflt_bmap,
+                                   pf->flow_cfg->dmacflt_max_flows))
+               netdev_warn(pf->netdev,
+                           "CGX/RPM internal loopback might not work as DMAC filters are active\n");
+
        mutex_lock(&pf->mbox.lock);
        if (enable)
                msg = otx2_mbox_alloc_msg_cgx_intlbk_enable(&pf->mbox);
@@ -1533,10 +1538,10 @@ int otx2_open(struct net_device *netdev)
 
        if (test_bit(CN10K_LMTST, &pf->hw.cap_flag)) {
                /* Reserve LMT lines for NPA AURA batch free */
-               pf->hw.npa_lmt_base = (__force u64 *)pf->hw.lmt_base;
+               pf->hw.npa_lmt_base = pf->hw.lmt_base;
                /* Reserve LMT lines for NIX TX */
-               pf->hw.nix_lmt_base = (__force u64 *)((u64)pf->hw.npa_lmt_base +
-                                     (NIX_LMTID_BASE * LMT_LINE_SIZE));
+               pf->hw.nix_lmt_base = (u64 *)((u64)pf->hw.npa_lmt_base +
+                                     (pf->npa_lmt_lines * LMT_LINE_SIZE));
        }
 
        err = otx2_init_hw_resources(pf);
@@ -1644,6 +1649,10 @@ int otx2_open(struct net_device *netdev)
        /* Restore pause frame settings */
        otx2_config_pause_frm(pf);
 
+       /* Install DMAC Filters */
+       if (pf->flags & OTX2_FLAG_DMACFLTR_SUPPORT)
+               otx2_dmacflt_reinstall_flows(pf);
+
        err = otx2_rxtx_enable(pf, true);
        if (err)
                goto err_tx_stop_queues;
@@ -2526,7 +2535,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (err)
                goto err_detach_rsrc;
 
-       err = cn10k_pf_lmtst_init(pf);
+       err = cn10k_lmtst_init(pf);
        if (err)
                goto err_detach_rsrc;
 
@@ -2630,8 +2639,8 @@ err_del_mcam_entries:
 err_ptp_destroy:
        otx2_ptp_destroy(pf);
 err_detach_rsrc:
-       if (hw->lmt_base)
-               iounmap(hw->lmt_base);
+       if (test_bit(CN10K_LMTST, &pf->hw.cap_flag))
+               qmem_free(pf->dev, pf->dync_lmt);
        otx2_detach_resources(&pf->mbox);
 err_disable_mbox_intr:
        otx2_disable_mbox_intr(pf);
@@ -2772,9 +2781,8 @@ static void otx2_remove(struct pci_dev *pdev)
        otx2_mcam_flow_del(pf);
        otx2_shutdown_tc(pf);
        otx2_detach_resources(&pf->mbox);
-       if (pf->hw.lmt_base)
-               iounmap(pf->hw.lmt_base);
-
+       if (test_bit(CN10K_LMTST, &pf->hw.cap_flag))
+               qmem_free(pf->dev, pf->dync_lmt);
        otx2_disable_mbox_intr(pf);
        otx2_pfaf_mbox_destroy(pf);
        pci_free_irq_vectors(pf->pdev);
index 905fc02..972b202 100644 (file)
@@ -288,7 +288,7 @@ static int otx2_tc_parse_actions(struct otx2_nic *nic,
        struct otx2_nic *priv;
        u32 burst, mark = 0;
        u8 nr_police = 0;
-       bool pps;
+       bool pps = false;
        u64 rate;
        int i;
 
index 52486c1..2f144e2 100644 (file)
@@ -83,6 +83,7 @@ struct otx2_snd_queue {
        u16                     num_sqbs;
        u16                     sqe_thresh;
        u8                      sqe_per_sqb;
+       u32                     lmt_id;
        u64                      io_addr;
        u64                     *aura_fc_addr;
        u64                     *lmt_addr;
index 13a908f..a8bee5a 100644 (file)
@@ -609,7 +609,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
        if (err)
                goto err_detach_rsrc;
 
-       err = cn10k_vf_lmtst_init(vf);
+       err = cn10k_lmtst_init(vf);
        if (err)
                goto err_detach_rsrc;
 
@@ -667,8 +667,8 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 err_unreg_netdev:
        unregister_netdev(netdev);
 err_detach_rsrc:
-       if (hw->lmt_base)
-               iounmap(hw->lmt_base);
+       if (test_bit(CN10K_LMTST, &vf->hw.cap_flag))
+               qmem_free(vf->dev, vf->dync_lmt);
        otx2_detach_resources(&vf->mbox);
 err_disable_mbox_intr:
        otx2vf_disable_mbox_intr(vf);
@@ -700,10 +700,8 @@ static void otx2vf_remove(struct pci_dev *pdev)
                destroy_workqueue(vf->otx2_wq);
        otx2vf_disable_mbox_intr(vf);
        otx2_detach_resources(&vf->mbox);
-
-       if (vf->hw.lmt_base)
-               iounmap(vf->hw.lmt_base);
-
+       if (test_bit(CN10K_LMTST, &vf->hw.cap_flag))
+               qmem_free(vf->dev, vf->dync_lmt);
        otx2vf_vfaf_mbox_destroy(vf);
        pci_free_irq_vectors(vf->pdev);
        pci_set_drvdata(pdev, NULL);
index a80419d..ac403d4 100644 (file)
@@ -2,6 +2,7 @@ config SPARX5_SWITCH
        tristate "Sparx5 switch driver"
        depends on NET_SWITCHDEV
        depends on HAS_IOMEM
+       depends on OF
        select PHYLINK
        select PHY_SPARX5_SERDES
        select RESET_CONTROLLER
index 5249b64..49def69 100644 (file)
@@ -540,10 +540,8 @@ static int moxart_mac_probe(struct platform_device *pdev)
        SET_NETDEV_DEV(ndev, &pdev->dev);
 
        ret = register_netdev(ndev);
-       if (ret) {
-               free_netdev(ndev);
+       if (ret)
                goto init_fail;
-       }
 
        netdev_dbg(ndev, "%s: IRQ=%d address=%pM\n",
                   __func__, ndev->irq, ndev->dev_addr);
index 3e89e34..e9d260d 100644 (file)
@@ -1298,6 +1298,7 @@ static int ocelot_netdevice_lag_leave(struct net_device *dev,
 }
 
 static int ocelot_netdevice_changeupper(struct net_device *dev,
+                                       struct net_device *brport_dev,
                                        struct netdev_notifier_changeupper_info *info)
 {
        struct netlink_ext_ack *extack;
@@ -1307,11 +1308,11 @@ static int ocelot_netdevice_changeupper(struct net_device *dev,
 
        if (netif_is_bridge_master(info->upper_dev)) {
                if (info->linking)
-                       err = ocelot_netdevice_bridge_join(dev, dev,
+                       err = ocelot_netdevice_bridge_join(dev, brport_dev,
                                                           info->upper_dev,
                                                           extack);
                else
-                       err = ocelot_netdevice_bridge_leave(dev, dev,
+                       err = ocelot_netdevice_bridge_leave(dev, brport_dev,
                                                            info->upper_dev);
        }
        if (netif_is_lag_master(info->upper_dev)) {
@@ -1346,7 +1347,7 @@ ocelot_netdevice_lag_changeupper(struct net_device *dev,
                if (ocelot_port->bond != dev)
                        return NOTIFY_OK;
 
-               err = ocelot_netdevice_changeupper(lower, info);
+               err = ocelot_netdevice_changeupper(lower, dev, info);
                if (err)
                        return notifier_from_errno(err);
        }
@@ -1385,7 +1386,7 @@ static int ocelot_netdevice_event(struct notifier_block *unused,
                struct netdev_notifier_changeupper_info *info = ptr;
 
                if (ocelot_netdevice_dev_check(dev))
-                       return ocelot_netdevice_changeupper(dev, info);
+                       return ocelot_netdevice_changeupper(dev, dev, info);
 
                if (netif_is_lag_master(dev))
                        return ocelot_netdevice_lag_changeupper(dev, info);
index 273d529..062bb2d 100644 (file)
@@ -1141,20 +1141,7 @@ int nfp_fl_ct_del_flow(struct nfp_fl_ct_map_entry *ct_map_ent)
                nfp_fl_ct_clean_flow_entry(ct_entry);
                kfree(ct_map_ent);
 
-               /* If this is the last pre_ct_rule it means that it is
-                * very likely that the nft table will be cleaned up next,
-                * as this happens on the removal of the last act_ct flow.
-                * However we cannot deregister the callback on the removal
-                * of the last nft flow as this runs into a deadlock situation.
-                * So deregister the callback on removal of the last pre_ct flow
-                * and remove any remaining nft flow entries. We also cannot
-                * save this state and delete the callback later since the
-                * nft table would already have been freed at that time.
-                */
                if (!zt->pre_ct_count) {
-                       nf_flow_table_offload_del_cb(zt->nft,
-                                                    nfp_fl_ct_handle_nft_flow,
-                                                    zt);
                        zt->nft = NULL;
                        nfp_fl_ct_clean_nft_entries(zt);
                }
@@ -1172,6 +1159,7 @@ int nfp_fl_ct_del_flow(struct nfp_fl_ct_map_entry *ct_map_ent)
                                       nfp_ct_map_params);
                nfp_fl_ct_clean_flow_entry(ct_map_ent->ct_entry);
                kfree(ct_map_ent);
+               break;
        default:
                break;
        }
index 8543bf3..ad655f0 100644 (file)
@@ -735,12 +735,13 @@ static int emac_remove(struct platform_device *pdev)
 
        put_device(&adpt->phydev->mdio.dev);
        mdiobus_unregister(adpt->mii_bus);
-       free_netdev(netdev);
 
        if (adpt->phy.digital)
                iounmap(adpt->phy.digital);
        iounmap(adpt->phy.base);
 
+       free_netdev(netdev);
+
        return 0;
 }
 
index a3ca406..e5b0d79 100644 (file)
@@ -152,6 +152,7 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
         * maximum size.
         */
        tx_per_ev = EFX_MAX_EVQ_SIZE / EFX_TXQ_MAX_ENT(efx);
+       tx_per_ev = min(tx_per_ev, EFX_MAX_TXQ_PER_CHANNEL);
        n_xdp_tx = num_possible_cpus();
        n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, tx_per_ev);
 
@@ -169,6 +170,8 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
                netif_err(efx, drv, efx->net_dev,
                          "Insufficient resources for %d XDP event queues (%d other channels, max %d)\n",
                          n_xdp_ev, n_channels, max_channels);
+               netif_err(efx, drv, efx->net_dev,
+                         "XDP_TX and XDP_REDIRECT will not work on this interface");
                efx->n_xdp_channels = 0;
                efx->xdp_tx_per_channel = 0;
                efx->xdp_tx_queue_count = 0;
@@ -176,12 +179,14 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
                netif_err(efx, drv, efx->net_dev,
                          "Insufficient resources for %d XDP TX queues (%d other channels, max VIs %d)\n",
                          n_xdp_tx, n_channels, efx->max_vis);
+               netif_err(efx, drv, efx->net_dev,
+                         "XDP_TX and XDP_REDIRECT will not work on this interface");
                efx->n_xdp_channels = 0;
                efx->xdp_tx_per_channel = 0;
                efx->xdp_tx_queue_count = 0;
        } else {
                efx->n_xdp_channels = n_xdp_ev;
-               efx->xdp_tx_per_channel = EFX_MAX_TXQ_PER_CHANNEL;
+               efx->xdp_tx_per_channel = tx_per_ev;
                efx->xdp_tx_queue_count = n_xdp_tx;
                n_channels += n_xdp_ev;
                netif_dbg(efx, drv, efx->net_dev,
@@ -891,18 +896,20 @@ int efx_set_channels(struct efx_nic *efx)
                        if (efx_channel_is_xdp_tx(channel)) {
                                efx_for_each_channel_tx_queue(tx_queue, channel) {
                                        tx_queue->queue = next_queue++;
-                                       netif_dbg(efx, drv, efx->net_dev, "Channel %u TXQ %u is XDP %u, HW %u\n",
-                                                 channel->channel, tx_queue->label,
-                                                 xdp_queue_number, tx_queue->queue);
+
                                        /* We may have a few left-over XDP TX
                                         * queues owing to xdp_tx_queue_count
                                         * not dividing evenly by EFX_MAX_TXQ_PER_CHANNEL.
                                         * We still allocate and probe those
                                         * TXQs, but never use them.
                                         */
-                                       if (xdp_queue_number < efx->xdp_tx_queue_count)
+                                       if (xdp_queue_number < efx->xdp_tx_queue_count) {
+                                               netif_dbg(efx, drv, efx->net_dev, "Channel %u TXQ %u is XDP %u, HW %u\n",
+                                                         channel->channel, tx_queue->label,
+                                                         xdp_queue_number, tx_queue->queue);
                                                efx->xdp_tx_queues[xdp_queue_number] = tx_queue;
-                                       xdp_queue_number++;
+                                               xdp_queue_number++;
+                                       }
                                }
                        } else {
                                efx_for_each_channel_tx_queue(tx_queue, channel) {
@@ -914,8 +921,7 @@ int efx_set_channels(struct efx_nic *efx)
                        }
                }
        }
-       if (xdp_queue_number)
-               efx->xdp_tx_queue_count = xdp_queue_number;
+       WARN_ON(xdp_queue_number != efx->xdp_tx_queue_count);
 
        rc = netif_set_real_num_tx_queues(efx->net_dev, efx->n_tx_channels);
        if (rc)
index e108b0d..4c9a37d 100644 (file)
@@ -49,9 +49,9 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
 {
        struct plat_stmmacenet_data *plat;
        struct stmmac_resources res;
-       bool mdio = false;
-       int ret, i;
        struct device_node *np;
+       int ret, i, phy_mode;
+       bool mdio = false;
 
        np = dev_of_node(&pdev->dev);
 
@@ -108,10 +108,11 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id
        if (plat->bus_id < 0)
                plat->bus_id = pci_dev_id(pdev);
 
-       plat->phy_interface = device_get_phy_mode(&pdev->dev);
-       if (plat->phy_interface < 0)
+       phy_mode = device_get_phy_mode(&pdev->dev);
+       if (phy_mode < 0)
                dev_err(&pdev->dev, "phy_mode not found\n");
 
+       plat->phy_interface = phy_mode;
        plat->interface = PHY_INTERFACE_MODE_GMII;
 
        pci_set_master(pdev);
index e735134..fcdb1d2 100644 (file)
@@ -349,6 +349,9 @@ void stmmac_enable_rx_queue(struct stmmac_priv *priv, u32 queue);
 void stmmac_disable_tx_queue(struct stmmac_priv *priv, u32 queue);
 void stmmac_enable_tx_queue(struct stmmac_priv *priv, u32 queue);
 int stmmac_xsk_wakeup(struct net_device *dev, u32 queue, u32 flags);
+struct timespec64 stmmac_calc_tas_basetime(ktime_t old_base_time,
+                                          ktime_t current_time,
+                                          u64 cycle_time);
 
 #if IS_ENABLED(CONFIG_STMMAC_SELFTESTS)
 void stmmac_selftest_run(struct net_device *dev,
index 8d9d6ec..7b8404a 100644 (file)
@@ -7171,6 +7171,7 @@ int stmmac_suspend(struct device *dev)
                                     priv->plat->rx_queues_to_use, false);
 
                stmmac_fpe_handshake(priv, false);
+               stmmac_fpe_stop_wq(priv);
        }
 
        priv->speed = SPEED_UNKNOWN;
index 072eff8..5ca7108 100644 (file)
@@ -397,6 +397,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
        struct device_node *np = pdev->dev.of_node;
        struct plat_stmmacenet_data *plat;
        struct stmmac_dma_cfg *dma_cfg;
+       int phy_mode;
        void *ret;
        int rc;
 
@@ -412,10 +413,11 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac)
                eth_zero_addr(mac);
        }
 
-       plat->phy_interface = device_get_phy_mode(&pdev->dev);
-       if (plat->phy_interface < 0)
-               return ERR_PTR(plat->phy_interface);
+       phy_mode = device_get_phy_mode(&pdev->dev);
+       if (phy_mode < 0)
+               return ERR_PTR(phy_mode);
 
+       plat->phy_interface = phy_mode;
        plat->interface = stmmac_of_get_mac_mode(np);
        if (plat->interface < 0)
                plat->interface = plat->phy_interface;
index 4e86cdf..580cc03 100644 (file)
@@ -62,7 +62,8 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
        u32 sec, nsec;
        u32 quotient, reminder;
        int neg_adj = 0;
-       bool xmac;
+       bool xmac, est_rst = false;
+       int ret;
 
        xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
 
@@ -75,10 +76,48 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
        sec = quotient;
        nsec = reminder;
 
+       /* If EST is enabled, disable it before adjusting the PTP time. */
+       if (priv->plat->est && priv->plat->est->enable) {
+               est_rst = true;
+               mutex_lock(&priv->plat->est->lock);
+               priv->plat->est->enable = false;
+               stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
+                                    priv->plat->clk_ptp_rate);
+               mutex_unlock(&priv->plat->est->lock);
+       }
+
        spin_lock_irqsave(&priv->ptp_lock, flags);
        stmmac_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj, xmac);
        spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
+       /* Recalculate basetime and reconfigure EST after adjusting PTP time. */
+       if (est_rst) {
+               struct timespec64 current_time, time;
+               ktime_t current_time_ns, basetime;
+               u64 cycle_time;
+
+               mutex_lock(&priv->plat->est->lock);
+               priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
+               current_time_ns = timespec64_to_ktime(current_time);
+               time.tv_nsec = priv->plat->est->btr_reserve[0];
+               time.tv_sec = priv->plat->est->btr_reserve[1];
+               basetime = timespec64_to_ktime(time);
+               cycle_time = priv->plat->est->ctr[1] * NSEC_PER_SEC +
+                            priv->plat->est->ctr[0];
+               time = stmmac_calc_tas_basetime(basetime,
+                                               current_time_ns,
+                                               cycle_time);
+
+               priv->plat->est->btr[0] = (u32)time.tv_nsec;
+               priv->plat->est->btr[1] = (u32)time.tv_sec;
+               priv->plat->est->enable = true;
+               ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
+                                          priv->plat->clk_ptp_rate);
+               mutex_unlock(&priv->plat->est->lock);
+               if (ret)
+                       netdev_err(priv->dev, "failed to configure EST\n");
+       }
+
        return 0;
 }
 
index 92dab60..4f3b643 100644 (file)
@@ -711,12 +711,35 @@ static int tc_setup_cls(struct stmmac_priv *priv,
        return ret;
 }
 
+struct timespec64 stmmac_calc_tas_basetime(ktime_t old_base_time,
+                                          ktime_t current_time,
+                                          u64 cycle_time)
+{
+       struct timespec64 time;
+
+       if (ktime_after(old_base_time, current_time)) {
+               time = ktime_to_timespec64(old_base_time);
+       } else {
+               s64 n;
+               ktime_t base_time;
+
+               n = div64_s64(ktime_sub_ns(current_time, old_base_time),
+                             cycle_time);
+               base_time = ktime_add_ns(old_base_time,
+                                        (n + 1) * cycle_time);
+
+               time = ktime_to_timespec64(base_time);
+       }
+
+       return time;
+}
+
 static int tc_setup_taprio(struct stmmac_priv *priv,
                           struct tc_taprio_qopt_offload *qopt)
 {
        u32 size, wid = priv->dma_cap.estwid, dep = priv->dma_cap.estdep;
        struct plat_stmmacenet_data *plat = priv->plat;
-       struct timespec64 time, current_time;
+       struct timespec64 time, current_time, qopt_time;
        ktime_t current_time_ns;
        bool fpe = false;
        int i, ret = 0;
@@ -773,14 +796,18 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
                                         GFP_KERNEL);
                if (!plat->est)
                        return -ENOMEM;
+
+               mutex_init(&priv->plat->est->lock);
        } else {
                memset(plat->est, 0, sizeof(*plat->est));
        }
 
        size = qopt->num_entries;
 
+       mutex_lock(&priv->plat->est->lock);
        priv->plat->est->gcl_size = size;
        priv->plat->est->enable = qopt->enable;
+       mutex_unlock(&priv->plat->est->lock);
 
        for (i = 0; i < size; i++) {
                s64 delta_ns = qopt->entries[i].interval;
@@ -811,32 +838,28 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
                priv->plat->est->gcl[i] = delta_ns | (gates << wid);
        }
 
+       mutex_lock(&priv->plat->est->lock);
        /* Adjust for real system time */
        priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
        current_time_ns = timespec64_to_ktime(current_time);
-       if (ktime_after(qopt->base_time, current_time_ns)) {
-               time = ktime_to_timespec64(qopt->base_time);
-       } else {
-               ktime_t base_time;
-               s64 n;
-
-               n = div64_s64(ktime_sub_ns(current_time_ns, qopt->base_time),
-                             qopt->cycle_time);
-               base_time = ktime_add_ns(qopt->base_time,
-                                        (n + 1) * qopt->cycle_time);
-
-               time = ktime_to_timespec64(base_time);
-       }
+       time = stmmac_calc_tas_basetime(qopt->base_time, current_time_ns,
+                                       qopt->cycle_time);
 
        priv->plat->est->btr[0] = (u32)time.tv_nsec;
        priv->plat->est->btr[1] = (u32)time.tv_sec;
 
+       qopt_time = ktime_to_timespec64(qopt->base_time);
+       priv->plat->est->btr_reserve[0] = (u32)qopt_time.tv_nsec;
+       priv->plat->est->btr_reserve[1] = (u32)qopt_time.tv_sec;
+
        ctr = qopt->cycle_time;
        priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
        priv->plat->est->ctr[1] = (u32)ctr;
 
-       if (fpe && !priv->dma_cap.fpesel)
+       if (fpe && !priv->dma_cap.fpesel) {
+               mutex_unlock(&priv->plat->est->lock);
                return -EOPNOTSUPP;
+       }
 
        /* Actual FPE register configuration will be done after FPE handshake
         * is success.
@@ -845,6 +868,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 
        ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
                                   priv->plat->clk_ptp_rate);
+       mutex_unlock(&priv->plat->est->lock);
        if (ret) {
                netdev_err(priv->dev, "failed to configure EST\n");
                goto disable;
@@ -860,9 +884,11 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
        return 0;
 
 disable:
+       mutex_lock(&priv->plat->est->lock);
        priv->plat->est->enable = false;
        stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
                             priv->plat->clk_ptp_rate);
+       mutex_unlock(&priv->plat->est->lock);
 
        priv->plat->fpe_cfg->enable = false;
        stmmac_fpe_configure(priv, priv->ioaddr,
index 0b2ce4b..e0cb713 100644 (file)
@@ -313,9 +313,8 @@ static void tlan_remove_one(struct pci_dev *pdev)
        pci_release_regions(pdev);
 #endif
 
-       free_netdev(dev);
-
        cancel_work_sync(&priv->tlan_tqueue);
+       free_netdev(dev);
 }
 
 static void tlan_start(struct net_device *dev)
index 14f0705..0de2c45 100644 (file)
@@ -1504,9 +1504,8 @@ err_out_resource:
        release_mem_region(start, len);
 
 err_out_kfree:
-       free_netdev(dev);
-
        pr_err("%s: initialization failure, aborting!\n", fp->name);
+       free_netdev(dev);
        return ret;
 }
 
index 3811f1b..b80ed2f 100644 (file)
@@ -85,7 +85,7 @@ static int nsim_ipsec_parse_proto_keys(struct xfrm_state *xs,
                                       u32 *mykey, u32 *mysalt)
 {
        const char aes_gcm_name[] = "rfc4106(gcm(aes))";
-       struct net_device *dev = xs->xso.dev;
+       struct net_device *dev = xs->xso.real_dev;
        unsigned char *key_data;
        char *alg_name = NULL;
        int key_len;
@@ -134,7 +134,7 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs)
        u16 sa_idx;
        int ret;
 
-       dev = xs->xso.dev;
+       dev = xs->xso.real_dev;
        ns = netdev_priv(dev);
        ipsec = &ns->ipsec;
 
@@ -194,7 +194,7 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs)
 
 static void nsim_ipsec_del_sa(struct xfrm_state *xs)
 {
-       struct netdevsim *ns = netdev_priv(xs->xso.dev);
+       struct netdevsim *ns = netdev_priv(xs->xso.real_dev);
        struct nsim_ipsec *ipsec = &ns->ipsec;
        u16 sa_idx;
 
@@ -211,7 +211,7 @@ static void nsim_ipsec_del_sa(struct xfrm_state *xs)
 
 static bool nsim_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 {
-       struct netdevsim *ns = netdev_priv(xs->xso.dev);
+       struct netdevsim *ns = netdev_priv(xs->xso.real_dev);
        struct nsim_ipsec *ipsec = &ns->ipsec;
 
        ipsec->ok++;
index bbbc6ac..53a4334 100644 (file)
@@ -78,6 +78,11 @@ enum {
        /* Temperature read register (88E2110 only) */
        MV_PCS_TEMP             = 0x8042,
 
+       /* Number of ports on the device */
+       MV_PCS_PORT_INFO        = 0xd00d,
+       MV_PCS_PORT_INFO_NPORTS_MASK    = 0x0380,
+       MV_PCS_PORT_INFO_NPORTS_SHIFT   = 7,
+
        /* These registers appear at 0x800X and 0xa00X - the 0xa00X control
         * registers appear to set themselves to the 0x800X when AN is
         * restarted, but status registers appear readable from either.
@@ -966,6 +971,30 @@ static const struct mv3310_chip mv2111_type = {
 #endif
 };
 
+static int mv3310_get_number_of_ports(struct phy_device *phydev)
+{
+       int ret;
+
+       ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_PCS_PORT_INFO);
+       if (ret < 0)
+               return ret;
+
+       ret &= MV_PCS_PORT_INFO_NPORTS_MASK;
+       ret >>= MV_PCS_PORT_INFO_NPORTS_SHIFT;
+
+       return ret + 1;
+}
+
+static int mv3310_match_phy_device(struct phy_device *phydev)
+{
+       return mv3310_get_number_of_ports(phydev) == 1;
+}
+
+static int mv3340_match_phy_device(struct phy_device *phydev)
+{
+       return mv3310_get_number_of_ports(phydev) == 4;
+}
+
 static int mv211x_match_phy_device(struct phy_device *phydev, bool has_5g)
 {
        int val;
@@ -994,7 +1023,8 @@ static int mv2111_match_phy_device(struct phy_device *phydev)
 static struct phy_driver mv3310_drivers[] = {
        {
                .phy_id         = MARVELL_PHY_ID_88X3310,
-               .phy_id_mask    = MARVELL_PHY_ID_88X33X0_MASK,
+               .phy_id_mask    = MARVELL_PHY_ID_MASK,
+               .match_phy_device = mv3310_match_phy_device,
                .name           = "mv88x3310",
                .driver_data    = &mv3310_type,
                .get_features   = mv3310_get_features,
@@ -1011,8 +1041,9 @@ static struct phy_driver mv3310_drivers[] = {
                .set_loopback   = genphy_c45_loopback,
        },
        {
-               .phy_id         = MARVELL_PHY_ID_88X3340,
-               .phy_id_mask    = MARVELL_PHY_ID_88X33X0_MASK,
+               .phy_id         = MARVELL_PHY_ID_88X3310,
+               .phy_id_mask    = MARVELL_PHY_ID_MASK,
+               .match_phy_device = mv3340_match_phy_device,
                .name           = "mv88x3340",
                .driver_data    = &mv3340_type,
                .get_features   = mv3310_get_features,
@@ -1069,8 +1100,7 @@ static struct phy_driver mv3310_drivers[] = {
 module_phy_driver(mv3310_drivers);
 
 static struct mdio_device_id __maybe_unused mv3310_tbl[] = {
-       { MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_88X33X0_MASK },
-       { MARVELL_PHY_ID_88X3340, MARVELL_PHY_ID_88X33X0_MASK },
+       { MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_MASK },
        { MARVELL_PHY_ID_88E2110, MARVELL_PHY_ID_MASK },
        { },
 };
index aec97b0..2c11521 100644 (file)
@@ -701,6 +701,7 @@ static int ax88772_init_phy(struct usbnet *dev)
                return ret;
        }
 
+       phy_suspend(priv->phydev);
        priv->phydev->mac_managed_pm = 1;
 
        phy_attached_info(priv->phydev);
index 8a58a2f..56c3f85 100644 (file)
@@ -1771,6 +1771,7 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
 {
        struct scatterlist *sgs[4], hdr, stat;
        unsigned out_num = 0, tmp;
+       int ret;
 
        /* Caller should know better */
        BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
@@ -1790,7 +1791,12 @@ static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
        sgs[out_num] = &stat;
 
        BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
-       virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
+       ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);
+       if (ret < 0) {
+               dev_warn(&vi->vdev->dev,
+                        "Failed to add sgs for command vq: %d\n.", ret);
+               return false;
+       }
 
        if (unlikely(!virtqueue_kick(vi->cvq)))
                return vi->ctrl->status == VIRTIO_NET_OK;
index c0bd9cb..1b483cf 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Linux driver for VMware's vmxnet3 ethernet NIC.
  *
- * Copyright (C) 2008-2020, VMware, Inc. All Rights Reserved.
+ * Copyright (C) 2008-2021, VMware, Inc. All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
 
 
 #include "vmxnet3_int.h"
+#include <net/vxlan.h>
+#include <net/geneve.h>
+
+#define VXLAN_UDP_PORT 8472
 
 struct vmxnet3_stat_desc {
        char desc[ETH_GSTRING_LEN];
@@ -262,6 +266,8 @@ netdev_features_t vmxnet3_features_check(struct sk_buff *skb,
        if (VMXNET3_VERSION_GE_4(adapter) &&
            skb->encapsulation && skb->ip_summed == CHECKSUM_PARTIAL) {
                u8 l4_proto = 0;
+               u16 port;
+               struct udphdr *udph;
 
                switch (vlan_get_protocol(skb)) {
                case htons(ETH_P_IP):
@@ -274,8 +280,20 @@ netdev_features_t vmxnet3_features_check(struct sk_buff *skb,
                        return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
                }
 
-               if (l4_proto != IPPROTO_UDP)
+               switch (l4_proto) {
+               case IPPROTO_UDP:
+                       udph = udp_hdr(skb);
+                       port = be16_to_cpu(udph->dest);
+                       /* Check if offloaded port is supported */
+                       if (port != GENEVE_UDP_PORT &&
+                           port != IANA_VXLAN_UDP_PORT &&
+                           port != VXLAN_UDP_PORT) {
+                               return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+                       }
+                       break;
+               default:
                        return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
+               }
        }
        return features;
 }
index 349ca18..c54fdae 100644 (file)
@@ -364,19 +364,19 @@ static int cisco_ioctl(struct net_device *dev, struct ifreq *ifr)
        return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_cisco_init(void)
 {
        register_hdlc_protocol(&proto);
        return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_cisco_exit(void)
 {
        unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_cisco_init);
+module_exit(hdlc_cisco_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Cisco HDLC protocol support for generic HDLC");
index 72250fe..25e3564 100644 (file)
@@ -1279,19 +1279,19 @@ static int fr_ioctl(struct net_device *dev, struct ifreq *ifr)
        return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_fr_init(void)
 {
        register_hdlc_protocol(&proto);
        return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_fr_exit(void)
 {
        unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_fr_init);
+module_exit(hdlc_fr_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Frame-Relay protocol support for generic HDLC");
index 834be2a..b81ecf4 100644 (file)
@@ -705,20 +705,20 @@ static int ppp_ioctl(struct net_device *dev, struct ifreq *ifr)
        return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_ppp_init(void)
 {
        skb_queue_head_init(&tx_queue);
        register_hdlc_protocol(&proto);
        return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_ppp_exit(void)
 {
        unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_ppp_init);
+module_exit(hdlc_ppp_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("PPP protocol support for generic HDLC");
index 388fcc0..54d2849 100644 (file)
@@ -90,7 +90,7 @@ static int raw_ioctl(struct net_device *dev, struct ifreq *ifr)
 }
 
 
-static int __init mod_init(void)
+static int __init hdlc_raw_init(void)
 {
        register_hdlc_protocol(&proto);
        return 0;
@@ -98,14 +98,14 @@ static int __init mod_init(void)
 
 
 
-static void __exit mod_exit(void)
+static void __exit hdlc_raw_exit(void)
 {
        unregister_hdlc_protocol(&proto);
 }
 
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_raw_init);
+module_exit(hdlc_raw_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Raw HDLC protocol support for generic HDLC");
index c70a518..9275962 100644 (file)
@@ -110,7 +110,7 @@ static int raw_eth_ioctl(struct net_device *dev, struct ifreq *ifr)
 }
 
 
-static int __init mod_init(void)
+static int __init hdlc_eth_init(void)
 {
        register_hdlc_protocol(&proto);
        return 0;
@@ -118,14 +118,14 @@ static int __init mod_init(void)
 
 
 
-static void __exit mod_exit(void)
+static void __exit hdlc_eth_exit(void)
 {
        unregister_hdlc_protocol(&proto);
 }
 
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_eth_init);
+module_exit(hdlc_eth_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("Ethernet encapsulation support for generic HDLC");
index d2bf72b..9b7ebf8 100644 (file)
@@ -365,19 +365,19 @@ static int x25_ioctl(struct net_device *dev, struct ifreq *ifr)
        return -EINVAL;
 }
 
-static int __init mod_init(void)
+static int __init hdlc_x25_init(void)
 {
        register_hdlc_protocol(&proto);
        return 0;
 }
 
-static void __exit mod_exit(void)
+static void __exit hdlc_x25_exit(void)
 {
        unregister_hdlc_protocol(&proto);
 }
 
-module_init(mod_init);
-module_exit(mod_exit);
+module_init(hdlc_x25_init);
+module_exit(hdlc_x25_exit);
 
 MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>");
 MODULE_DESCRIPTION("X.25 protocol support for generic HDLC");
index 7fd2104..63ec140 100644 (file)
@@ -389,6 +389,7 @@ static int mt7921_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
        case WLAN_CIPHER_SUITE_WEP104:
                if (!mvif->wep_sta)
                        return -EOPNOTSUPP;
+               break;
        case WLAN_CIPHER_SUITE_TKIP:
        case WLAN_CIPHER_SUITE_CCMP:
        case WLAN_CIPHER_SUITE_CCMP_256:
index c2c4dc1..cd690c6 100644 (file)
@@ -931,7 +931,7 @@ static int mt7921_load_firmware(struct mt7921_dev *dev)
        ret = mt76_get_field(dev, MT_CONN_ON_MISC, MT_TOP_MISC2_FW_N9_RDY);
        if (ret) {
                dev_dbg(dev->mt76.dev, "Firmware is already download\n");
-               return -EIO;
+               goto fw_loaded;
        }
 
        ret = mt7921_load_patch(dev);
@@ -949,6 +949,7 @@ static int mt7921_load_firmware(struct mt7921_dev *dev)
                return -EIO;
        }
 
+fw_loaded:
        mt76_queue_tx_cleanup(dev, dev->mt76.q_mcu[MT_MCUQ_FWDL], false);
 
 #ifdef CONFIG_PM
index 46f76e8..0a472ce 100644 (file)
@@ -24,15 +24,7 @@ int ipc_imem_sys_wwan_open(struct iosm_imem *ipc_imem, int if_id)
                return -EIO;
        }
 
-       /* check for the interafce id
-        * if if_id 1 to 8 then create IP MUX channel sessions.
-        * To start MUX session from 0 as network interface id would start
-        * from 1 so map it to if_id = if_id - 1
-        */
-       if (if_id >= IP_MUX_SESSION_START && if_id <= IP_MUX_SESSION_END)
-               return ipc_mux_open_session(ipc_imem->mux, if_id - 1);
-
-       return -EINVAL;
+       return ipc_mux_open_session(ipc_imem->mux, if_id);
 }
 
 /* Release a net link to CP. */
@@ -41,7 +33,7 @@ void ipc_imem_sys_wwan_close(struct iosm_imem *ipc_imem, int if_id,
 {
        if (ipc_imem->mux && if_id >= IP_MUX_SESSION_START &&
            if_id <= IP_MUX_SESSION_END)
-               ipc_mux_close_session(ipc_imem->mux, if_id - 1);
+               ipc_mux_close_session(ipc_imem->mux, if_id);
 }
 
 /* Tasklet call to do uplink transfer. */
@@ -83,13 +75,8 @@ int ipc_imem_sys_wwan_transmit(struct iosm_imem *ipc_imem,
                goto out;
        }
 
-       if (if_id >= IP_MUX_SESSION_START && if_id <= IP_MUX_SESSION_END)
-               /* Route the UL packet through IP MUX Layer */
-               ret = ipc_mux_ul_trigger_encode(ipc_imem->mux,
-                                               if_id - 1, skb);
-       else
-               dev_err(ipc_imem->dev,
-                       "invalid if_id %d: ", if_id);
+       /* Route the UL packet through IP MUX Layer */
+       ret = ipc_mux_ul_trigger_encode(ipc_imem->mux, if_id, skb);
 out:
        return ret;
 }
index fd356da..2007fe2 100644 (file)
 #define BOOT_CHECK_DEFAULT_TIMEOUT 400
 
 /* IP MUX channel range */
-#define IP_MUX_SESSION_START 1
-#define IP_MUX_SESSION_END 8
+#define IP_MUX_SESSION_START 0
+#define IP_MUX_SESSION_END 7
 
 /* Default IP MUX channel */
-#define IP_MUX_SESSION_DEFAULT 1
+#define IP_MUX_SESSION_DEFAULT 0
 
 /**
  * ipc_imem_sys_port_open - Open a port link to CP.
index e634ffc..562de27 100644 (file)
@@ -288,7 +288,7 @@ static int ipc_mux_net_receive(struct iosm_mux *ipc_mux, int if_id,
        /* Pass the packet to the netif layer. */
        dest_skb->priority = service_class;
 
-       return ipc_wwan_receive(wwan, dest_skb, false, if_id + 1);
+       return ipc_wwan_receive(wwan, dest_skb, false, if_id);
 }
 
 /* Decode Flow Credit Table in the block */
index 2229d75..d12188f 100644 (file)
@@ -37,7 +37,7 @@ void ipc_uevent_send(struct device *dev, char *uevent)
 
        /* Store the device and event information */
        info->dev = dev;
-       snprintf(info->uevent, MAX_UEVENT_LEN, "%s: %s", dev_name(dev), uevent);
+       snprintf(info->uevent, MAX_UEVENT_LEN, "IOSM_EVENT=%s", uevent);
 
        /* Schedule uevent in process context using work queue */
        schedule_work(&info->work);
index c999c64..b2357ad 100644 (file)
@@ -107,6 +107,7 @@ static int ipc_wwan_link_transmit(struct sk_buff *skb,
 {
        struct iosm_netdev_priv *priv = wwan_netdev_drvpriv(netdev);
        struct iosm_wwan *ipc_wwan = priv->ipc_wwan;
+       unsigned int len = skb->len;
        int if_id = priv->if_id;
        int ret;
 
@@ -123,6 +124,8 @@ static int ipc_wwan_link_transmit(struct sk_buff *skb,
 
        /* Return code of zero is success */
        if (ret == 0) {
+               netdev->stats.tx_packets++;
+               netdev->stats.tx_bytes += len;
                ret = NETDEV_TX_OK;
        } else if (ret == -EBUSY) {
                ret = NETDEV_TX_BUSY;
@@ -140,7 +143,8 @@ exit:
                        ret);
 
        dev_kfree_skb_any(skb);
-       return ret;
+       netdev->stats.tx_dropped++;
+       return NETDEV_TX_OK;
 }
 
 /* Ops structure for wwan net link */
@@ -158,6 +162,7 @@ static void ipc_wwan_setup(struct net_device *iosm_dev)
        iosm_dev->priv_flags |= IFF_NO_QUEUE;
 
        iosm_dev->type = ARPHRD_NONE;
+       iosm_dev->mtu = ETH_DATA_LEN;
        iosm_dev->min_mtu = ETH_MIN_MTU;
        iosm_dev->max_mtu = ETH_MAX_MTU;
 
@@ -252,8 +257,8 @@ int ipc_wwan_receive(struct iosm_wwan *ipc_wwan, struct sk_buff *skb_arg,
 
        skb->pkt_type = PACKET_HOST;
 
-       if (if_id < (IP_MUX_SESSION_START - 1) ||
-           if_id > (IP_MUX_SESSION_END - 1)) {
+       if (if_id < IP_MUX_SESSION_START ||
+           if_id > IP_MUX_SESSION_END) {
                ret = -EINVAL;
                goto free;
        }
index 9bab073..d32fbfc 100644 (file)
@@ -230,8 +230,8 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd,
                        break;
                }
                /* If arch decided it can't, fall through... */
-#endif /* HAVE_PCI_MMAP */
                fallthrough;
+#endif /* HAVE_PCI_MMAP */
        default:
                ret = -EINVAL;
                break;
index 3d45ed0..a6ebdb2 100644 (file)
@@ -1728,6 +1728,7 @@ static void ab8500_fg_algorithm_calibrate(struct ab8500_fg *di)
                break;
        case AB8500_FG_CALIB_WAIT:
                dev_dbg(di->dev, "Calibration WFI\n");
+               break;
        default:
                break;
        }
@@ -2224,6 +2225,7 @@ static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
                                        queue_work(di->fg_wq, &di->fg_work);
                                        break;
                                }
+                               break;
                        default:
                                break;
                        }
index a17849b..b72826c 100644 (file)
@@ -1150,6 +1150,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
                                default:
                                        break;
                                }
+                               break;
                        default:
                                break;
                        }
index 8673d17..28a6fe3 100644 (file)
@@ -3,7 +3,7 @@
 # Makefile for PTP 1588 clock support.
 #
 
-ptp-y                                  := ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp-y                                  := ptp_clock.o ptp_chardev.o ptp_sysfs.o ptp_vclock.o
 ptp_kvm-$(CONFIG_X86)                  := ptp_kvm_x86.o ptp_kvm_common.o
 ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC)       := ptp_kvm_arm.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)           += ptp.o
index a23a37a..4dfc52e 100644 (file)
 #define PTP_PPS_EVENT PPS_CAPTUREASSERT
 #define PTP_PPS_MODE (PTP_PPS_DEFAULTS | PPS_CANWAIT | PPS_TSFMT_TSPEC)
 
+struct class *ptp_class;
+
 /* private globals */
 
 static dev_t ptp_devt;
-static struct class *ptp_class;
 
 static DEFINE_IDA(ptp_clocks_map);
 
@@ -76,6 +77,11 @@ static int ptp_clock_settime(struct posix_clock *pc, const struct timespec64 *tp
 {
        struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
 
+       if (ptp_vclock_in_use(ptp)) {
+               pr_err("ptp: virtual clock in use\n");
+               return -EBUSY;
+       }
+
        return  ptp->info->settime64(ptp->info, tp);
 }
 
@@ -97,6 +103,11 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx)
        struct ptp_clock_info *ops;
        int err = -EOPNOTSUPP;
 
+       if (ptp_vclock_in_use(ptp)) {
+               pr_err("ptp: virtual clock in use\n");
+               return -EBUSY;
+       }
+
        ops = ptp->info;
 
        if (tx->modes & ADJ_SETOFFSET) {
@@ -161,6 +172,7 @@ static void ptp_clock_release(struct device *dev)
        ptp_cleanup_pin_groups(ptp);
        mutex_destroy(&ptp->tsevq_mux);
        mutex_destroy(&ptp->pincfg_mux);
+       mutex_destroy(&ptp->n_vclocks_mux);
        ida_simple_remove(&ptp_clocks_map, ptp->index);
        kfree(ptp);
 }
@@ -185,6 +197,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 {
        struct ptp_clock *ptp;
        int err = 0, index, major = MAJOR(ptp_devt);
+       size_t size;
 
        if (info->n_alarm > PTP_MAX_ALARMS)
                return ERR_PTR(-EINVAL);
@@ -208,6 +221,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
        spin_lock_init(&ptp->tsevq.lock);
        mutex_init(&ptp->tsevq_mux);
        mutex_init(&ptp->pincfg_mux);
+       mutex_init(&ptp->n_vclocks_mux);
        init_waitqueue_head(&ptp->tsev_wq);
 
        if (ptp->info->do_aux_work) {
@@ -218,7 +232,22 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
                        pr_err("failed to create ptp aux_worker %d\n", err);
                        goto kworker_err;
                }
-               ptp->pps_source->lookup_cookie = ptp;
+       }
+
+       /* PTP virtual clock is being registered under physical clock */
+       if (parent && parent->class && parent->class->name &&
+           strcmp(parent->class->name, "ptp") == 0)
+               ptp->is_virtual_clock = true;
+
+       if (!ptp->is_virtual_clock) {
+               ptp->max_vclocks = PTP_DEFAULT_MAX_VCLOCKS;
+
+               size = sizeof(int) * ptp->max_vclocks;
+               ptp->vclock_index = kzalloc(size, GFP_KERNEL);
+               if (!ptp->vclock_index) {
+                       err = -ENOMEM;
+                       goto no_mem_for_vclocks;
+               }
        }
 
        err = ptp_populate_pin_groups(ptp);
@@ -238,6 +267,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
                        pr_err("failed to register pps source\n");
                        goto no_pps;
                }
+               ptp->pps_source->lookup_cookie = ptp;
        }
 
        /* Initialize a new device of our class in our clock structure. */
@@ -265,11 +295,14 @@ no_clock:
 no_pps:
        ptp_cleanup_pin_groups(ptp);
 no_pin_groups:
+       kfree(ptp->vclock_index);
+no_mem_for_vclocks:
        if (ptp->kworker)
                kthread_destroy_worker(ptp->kworker);
 kworker_err:
        mutex_destroy(&ptp->tsevq_mux);
        mutex_destroy(&ptp->pincfg_mux);
+       mutex_destroy(&ptp->n_vclocks_mux);
        ida_simple_remove(&ptp_clocks_map, index);
 no_slot:
        kfree(ptp);
@@ -280,9 +313,16 @@ EXPORT_SYMBOL(ptp_clock_register);
 
 int ptp_clock_unregister(struct ptp_clock *ptp)
 {
+       if (ptp_vclock_in_use(ptp)) {
+               pr_err("ptp: virtual clock in use\n");
+               return -EBUSY;
+       }
+
        ptp->defunct = 1;
        wake_up_interruptible(&ptp->tsev_wq);
 
+       kfree(ptp->vclock_index);
+
        if (ptp->kworker) {
                kthread_cancel_delayed_work_sync(&ptp->aux_work);
                kthread_destroy_worker(ptp->kworker);
index 6b97155..dba6be4 100644 (file)
@@ -18,6 +18,7 @@
 
 #define PTP_MAX_TIMESTAMPS 128
 #define PTP_BUF_TIMESTAMPS 30
+#define PTP_DEFAULT_MAX_VCLOCKS 20
 
 struct timestamp_event_queue {
        struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS];
@@ -46,6 +47,24 @@ struct ptp_clock {
        const struct attribute_group *pin_attr_groups[2];
        struct kthread_worker *kworker;
        struct kthread_delayed_work aux_work;
+       unsigned int max_vclocks;
+       unsigned int n_vclocks;
+       int *vclock_index;
+       struct mutex n_vclocks_mux; /* protect concurrent n_vclocks access */
+       bool is_virtual_clock;
+};
+
+/*
+ * container_of() helpers: recover the enclosing struct ptp_vclock from a
+ * pointer to one of its embedded members.
+ *
+ * NOTE(review): dw_to_vclock() names a 'refresh_work' member, but struct
+ * ptp_vclock as declared below has no such field, so the macro cannot
+ * compile if it is ever expanded - confirm whether it can be dropped.
+ */
+#define info_to_vclock(d) container_of((d), struct ptp_vclock, info)
+#define cc_to_vclock(d) container_of((d), struct ptp_vclock, cc)
+#define dw_to_vclock(d) container_of((d), struct ptp_vclock, refresh_work)
+
+/* Per-instance state of a PTP virtual clock created on a physical clock. */
+struct ptp_vclock {
+       /* physical clock this virtual clock was created on */
+       struct ptp_clock *pclock;
+       /* clock description/callbacks handed to ptp_clock_register() */
+       struct ptp_clock_info info;
+       /* the ptp_clock returned by ptp_clock_register() for 'info' */
+       struct ptp_clock *clock;
+       /* cycle counter feeding 'tc' */
+       struct cyclecounter cc;
+       /* time counter that holds this virtual clock's time */
+       struct timecounter tc;
+       spinlock_t lock;        /* protects tc/cc */
 };
 
 /*
@@ -61,6 +80,24 @@ static inline int queue_cnt(struct timestamp_event_queue *q)
        return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt;
 }
 
+/*
+ * Check if ptp virtual clock is in use
+ *
+ * Returns true when @ptp is a physical clock with a non-zero n_vclocks, and
+ * also when waiting for n_vclocks_mux was interrupted (the caller then has
+ * to treat the clock as busy). Returns false otherwise.
+ *
+ * A virtual clock never has virtual clocks of its own, so it is answered
+ * without touching its n_vclocks_mux. is_virtual_clock is only written while
+ * the clock is being registered, so reading it unlocked is fine. Skipping
+ * the lock matters: a vclock is unregistered while the parent's
+ * n_vclocks_mux is already held, and an interrupted lock here would make
+ * ptp_clock_unregister() bail out with -EBUSY for a vclock that the caller
+ * frees regardless.
+ */
+static inline bool ptp_vclock_in_use(struct ptp_clock *ptp)
+{
+       bool in_use = false;
+
+       if (ptp->is_virtual_clock)
+               return false;
+
+       if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+               return true;
+
+       if (ptp->n_vclocks)
+               in_use = true;
+
+       mutex_unlock(&ptp->n_vclocks_mux);
+
+       return in_use;
+}
+
+extern struct class *ptp_class;
+
 /*
  * see ptp_chardev.c
  */
@@ -89,4 +126,6 @@ extern const struct attribute_group *ptp_groups[];
 int ptp_populate_pin_groups(struct ptp_clock *ptp);
 void ptp_cleanup_pin_groups(struct ptp_clock *ptp);
 
+struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock);
+void ptp_vclock_unregister(struct ptp_vclock *vclock);
 #endif
index be076a9..b3d96b7 100644 (file)
@@ -3,6 +3,7 @@
  * PTP 1588 clock support - sysfs interface.
  *
  * Copyright (C) 2010 OMICRON electronics GmbH
+ * Copyright 2021 NXP
  */
 #include <linux/capability.h>
 #include <linux/slab.h>
@@ -148,6 +149,159 @@ out:
 }
 static DEVICE_ATTR(pps_enable, 0220, NULL, pps_enable_store);
 
+/*
+ * device_for_each_child_reverse() callback: unregister the virtual clock
+ * behind child device @dev and count it off.
+ *
+ * @data points at the u32 number of clocks still to be removed that
+ * n_vclocks_store() passes in. It must be accessed as a u32: going through
+ * a u8 pointer reads the wrong byte on big-endian machines and drops the
+ * upper bits of counts above 255.
+ *
+ * Returns 0 to continue with the next child, or a non-zero value once the
+ * counter reaches zero (used only to stop the iteration, not as an error).
+ */
+static int unregister_vclock(struct device *dev, void *data)
+{
+       struct ptp_clock *ptp = dev_get_drvdata(dev);
+       struct ptp_clock_info *info = ptp->info;
+       struct ptp_vclock *vclock;
+       u32 *num = data;
+
+       vclock = info_to_vclock(info);
+       dev_info(dev->parent, "delete virtual clock ptp%d\n",
+                vclock->clock->index);
+
+       ptp_vclock_unregister(vclock);
+       (*num)--;
+
+       /* For break. Not error. */
+       if (*num == 0)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * sysfs read handler for 'n_vclocks': print the current number of virtual
+ * clocks, sampled while holding n_vclocks_mux.
+ *
+ * Returns the number of characters written to @page, or -ERESTARTSYS if
+ * waiting for the mutex was interrupted.
+ */
+static ssize_t n_vclocks_show(struct device *dev,
+                             struct device_attribute *attr, char *page)
+{
+       struct ptp_clock *clk = dev_get_drvdata(dev);
+       ssize_t len;
+
+       if (mutex_lock_interruptible(&clk->n_vclocks_mux) != 0)
+               return -ERESTARTSYS;
+
+       len = snprintf(page, PAGE_SIZE - 1, "%u\n", clk->n_vclocks);
+       mutex_unlock(&clk->n_vclocks_mux);
+
+       return len;
+}
+
+/*
+ * sysfs write handler for 'n_vclocks': grow or shrink the set of virtual
+ * clocks on this physical clock to the requested number.
+ *
+ * The value is parsed with kstrtou32() and must not exceed max_vclocks.
+ * Missing clocks are registered one by one and their indices recorded in
+ * vclock_index[]; surplus clocks are unregistered starting from the most
+ * recently added child and their vclock_index[] slots are set to -1.
+ *
+ * Returns @count on success, -EINVAL for an unparsable or too large value
+ * or when a virtual clock could not be registered, and -ERESTARTSYS if
+ * waiting for n_vclocks_mux was interrupted.
+ */
+static ssize_t n_vclocks_store(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t count)
+{
+       struct ptp_clock *ptp = dev_get_drvdata(dev);
+       struct ptp_vclock *vclock;
+       int err = -EINVAL;
+       u32 num, i;
+
+       if (kstrtou32(buf, 0, &num))
+               return err;
+
+       if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+               return -ERESTARTSYS;
+
+       if (num > ptp->max_vclocks) {
+               dev_err(dev, "max value is %u\n", ptp->max_vclocks);
+               goto out;
+       }
+
+       /* Need to create more vclocks */
+       if (num > ptp->n_vclocks) {
+               for (i = 0; i < num - ptp->n_vclocks; i++) {
+                       vclock = ptp_vclock_register(ptp);
+                       if (!vclock) {
+                               /*
+                                * The i clocks created so far stay
+                                * registered and are already recorded in
+                                * vclock_index[]; account for them so that
+                                * n_vclocks keeps matching reality and a
+                                * later write can remove them.
+                                */
+                               ptp->n_vclocks += i;
+                               goto out;
+                       }
+
+                       *(ptp->vclock_index + ptp->n_vclocks + i) =
+                               vclock->clock->index;
+
+                       dev_info(dev, "new virtual clock ptp%d\n",
+                                vclock->clock->index);
+               }
+       }
+
+       /* Need to delete vclocks */
+       if (num < ptp->n_vclocks) {
+               i = ptp->n_vclocks - num;
+               device_for_each_child_reverse(dev, &i,
+                                             unregister_vclock);
+
+               for (i = 1; i <= ptp->n_vclocks - num; i++)
+                       *(ptp->vclock_index + ptp->n_vclocks - i) = -1;
+       }
+
+       if (num == 0)
+               dev_info(dev, "only physical clock in use now\n");
+       else
+               dev_info(dev, "guarantee physical clock free running\n");
+
+       ptp->n_vclocks = num;
+       mutex_unlock(&ptp->n_vclocks_mux);
+
+       return count;
+out:
+       mutex_unlock(&ptp->n_vclocks_mux);
+       return err;
+}
+static DEVICE_ATTR_RW(n_vclocks);
+
+/*
+ * sysfs read handler for 'max_vclocks': print the configured upper bound on
+ * the number of virtual clocks. The value is read without taking
+ * n_vclocks_mux.
+ */
+static ssize_t max_vclocks_show(struct device *dev,
+                               struct device_attribute *attr, char *page)
+{
+       const struct ptp_clock *clk = dev_get_drvdata(dev);
+
+       return snprintf(page, PAGE_SIZE - 1, "%u\n", clk->max_vclocks);
+}
+
+/*
+ * sysfs write handler for 'max_vclocks': change the upper bound on the
+ * number of virtual clocks and resize vclock_index[] accordingly.
+ *
+ * The new bound must be non-zero and not smaller than the current
+ * n_vclocks. A fresh zeroed array is allocated, the n_vclocks entries in use
+ * are copied over and the old array is freed.
+ *
+ * Returns @count on success (including when the bound is unchanged),
+ * -EINVAL for an unparsable, zero or too small value, -ENOMEM if the new
+ * array cannot be allocated, and -ERESTARTSYS if waiting for n_vclocks_mux
+ * was interrupted.
+ */
+static ssize_t max_vclocks_store(struct device *dev,
+                                struct device_attribute *attr,
+                                const char *buf, size_t count)
+{
+       struct ptp_clock *ptp = dev_get_drvdata(dev);
+       int *vclock_index;
+       int err = -EINVAL;
+       u32 max;
+
+       if (kstrtou32(buf, 0, &max) || max == 0)
+               return -EINVAL;
+
+       if (max == ptp->max_vclocks)
+               return count;
+
+       if (mutex_lock_interruptible(&ptp->n_vclocks_mux))
+               return -ERESTARTSYS;
+
+       if (max < ptp->n_vclocks)
+               goto out;
+
+       /*
+        * kcalloc() rejects a max * sizeof(int) that does not fit in size_t;
+        * an open-coded product could wrap on 32-bit and yield an array
+        * smaller than the n_vclocks entries copied below.
+        */
+       vclock_index = kcalloc(max, sizeof(int), GFP_KERNEL);
+       if (!vclock_index) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       memcpy(vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks);
+
+       kfree(ptp->vclock_index);
+       ptp->vclock_index = vclock_index;
+       ptp->max_vclocks = max;
+
+       mutex_unlock(&ptp->n_vclocks_mux);
+
+       return count;
+out:
+       mutex_unlock(&ptp->n_vclocks_mux);
+       return err;
+}
+static DEVICE_ATTR_RW(max_vclocks);
+
 static struct attribute *ptp_attrs[] = {
        &dev_attr_clock_name.attr,
 
@@ -162,6 +316,8 @@ static struct attribute *ptp_attrs[] = {
        &dev_attr_fifo.attr,
        &dev_attr_period.attr,
        &dev_attr_pps_enable.attr,
+       &dev_attr_n_vclocks.attr,
+       &dev_attr_max_vclocks.attr,
        NULL
 };
 
@@ -183,6 +339,10 @@ static umode_t ptp_is_attribute_visible(struct kobject *kobj,
        } else if (attr == &dev_attr_pps_enable.attr) {
                if (!info->pps)
                        mode = 0;
+       } else if (attr == &dev_attr_n_vclocks.attr ||
+                  attr == &dev_attr_max_vclocks.attr) {
+               if (ptp->is_virtual_clock)
+                       mode = 0;
        }
 
        return mode;
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
new file mode 100644 (file)
index 0000000..e0f87c5
--- /dev/null
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PTP virtual clock driver
+ *
+ * Copyright 2021 NXP
+ */
+#include <linux/slab.h>
+#include "ptp_private.h"
+
+/*
+ * Cyclecounter scaling of a virtual clock. The multiplier is 1 << shift, so
+ * an unadjusted virtual clock advances 1:1 with the value its cycle counter
+ * reads. The constant is unsigned because shifting 1 into bit 31 of a signed
+ * int is not defined by the C standard.
+ */
+#define PTP_VCLOCK_CC_SHIFT            31
+#define PTP_VCLOCK_CC_MULT             (1U << PTP_VCLOCK_CC_SHIFT)
+/*
+ * adjfine scaling: delta on the multiplier = (scaled_ppm << 9) / 15625.
+ * 2^9 / 15625 equals 2^31 / (10^6 * 2^16).
+ */
+#define PTP_VCLOCK_FADJ_SHIFT          9
+#define PTP_VCLOCK_FADJ_DENOMINATOR    15625ULL
+/* Delay between two runs of the periodic refresh work (two seconds). */
+#define PTP_VCLOCK_REFRESH_INTERVAL    (HZ * 2)
+
+/*
+ * adjfine callback: change the rate of the virtual clock by rewriting the
+ * multiplier of its cycle counter.
+ *
+ * @scaled_ppm is turned into a delta on PTP_VCLOCK_CC_MULT as
+ * (scaled_ppm << PTP_VCLOCK_FADJ_SHIFT) / PTP_VCLOCK_FADJ_DENOMINATOR.
+ * NOTE(review): that factor is 2^31 / (10^6 * 2^16), which fits scaled_ppm
+ * being parts per million with a 16-bit binary fraction - confirm against
+ * the ptp_clock_info documentation.
+ *
+ * Always returns 0.
+ */
+static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+       struct ptp_vclock *vclock = info_to_vclock(ptp);
+       unsigned long flags;
+       s64 adj;
+
+       adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT;
+       adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR);
+
+       spin_lock_irqsave(&vclock->lock, flags);
+       /*
+        * Read the time counter before touching the multiplier; presumably
+        * this folds the cycles elapsed so far in at the old rate - confirm
+        * against the timecounter implementation.
+        */
+       timecounter_read(&vclock->tc);
+       vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj;
+       spin_unlock_irqrestore(&vclock->lock, flags);
+
+       return 0;
+}
+
+/*
+ * adjtime callback: shift the virtual clock's time counter by @delta while
+ * holding the vclock lock. Always returns 0.
+ */
+static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+       struct ptp_vclock *vc = info_to_vclock(ptp);
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&vc->lock, irqflags);
+       timecounter_adjtime(&vc->tc, delta);
+       spin_unlock_irqrestore(&vc->lock, irqflags);
+
+       return 0;
+}
+
+/*
+ * gettime64 callback: read the virtual clock's time counter under the
+ * vclock lock and hand the result back in @ts. Always returns 0.
+ */
+static int ptp_vclock_gettime(struct ptp_clock_info *ptp,
+                             struct timespec64 *ts)
+{
+       struct ptp_vclock *vc = info_to_vclock(ptp);
+       unsigned long irqflags;
+       u64 now_ns;
+
+       spin_lock_irqsave(&vc->lock, irqflags);
+       now_ns = timecounter_read(&vc->tc);
+       spin_unlock_irqrestore(&vc->lock, irqflags);
+
+       /* The conversion works on the local copy, so it needs no lock. */
+       *ts = ns_to_timespec64(now_ns);
+
+       return 0;
+}
+
+/*
+ * settime64 callback: re-initialise the virtual clock's time counter so
+ * that it starts from the time given in @ts. Always returns 0.
+ */
+static int ptp_vclock_settime(struct ptp_clock_info *ptp,
+                             const struct timespec64 *ts)
+{
+       struct ptp_vclock *vc = info_to_vclock(ptp);
+       unsigned long irqflags;
+       u64 start_ns;
+
+       start_ns = timespec64_to_ns(ts);
+
+       spin_lock_irqsave(&vc->lock, irqflags);
+       timecounter_init(&vc->tc, &vc->cc, start_ns);
+       spin_unlock_irqrestore(&vc->lock, irqflags);
+
+       return 0;
+}
+
+/*
+ * do_aux_work callback: read the virtual clock once and discard the result,
+ * then return PTP_VCLOCK_REFRESH_INTERVAL as the delay to the next run.
+ */
+static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+{
+       struct timespec64 unused;
+
+       /* @ptp is the info member embedded in the vclock itself. */
+       ptp_vclock_gettime(ptp, &unused);
+
+       return PTP_VCLOCK_REFRESH_INTERVAL;
+}
+
+/*
+ * Template clock description for virtual clocks. ptp_vclock_register()
+ * copies it into each new vclock and then overwrites the name with
+ * "ptp<N>_virt", N being the index of the physical clock.
+ */
+static const struct ptp_clock_info ptp_vclock_info = {
+       .owner          = THIS_MODULE,
+       .name           = "ptp virtual clock",
+       /* The maximum ppb value that long scaled_ppm can support */
+       .max_adj        = 32767999,
+       .adjfine        = ptp_vclock_adjfine,
+       .adjtime        = ptp_vclock_adjtime,
+       .gettime64      = ptp_vclock_gettime,
+       .settime64      = ptp_vclock_settime,
+       /* periodic read of the clock, see ptp_vclock_refresh() */
+       .do_aux_work    = ptp_vclock_refresh,
+};
+
+/*
+ * Cycle counter read hook: return the current time of the parent physical
+ * clock in nanoseconds. gettimex64 is used when the parent provides it,
+ * gettime64 otherwise; the callbacks' return values are not checked.
+ */
+static u64 ptp_vclock_read(const struct cyclecounter *cc)
+{
+       struct ptp_vclock *vc = cc_to_vclock(cc);
+       struct ptp_clock_info *parent_info = vc->pclock->info;
+       struct timespec64 now = {};
+
+       if (!parent_info->gettimex64)
+               parent_info->gettime64(parent_info, &now);
+       else
+               parent_info->gettimex64(parent_info, &now, NULL);
+
+       return timespec64_to_ns(&now);
+}
+
+/*
+ * Template cycle counter copied into each new vclock by
+ * ptp_vclock_register(). Its "cycles" are the nanosecond time of the parent
+ * physical clock as returned by ptp_vclock_read().
+ *
+ * NOTE(review): the mask keeps only 32 bits of that value (about 4.29 s of
+ * nanoseconds), which is presumably why the clock is re-read every
+ * PTP_VCLOCK_REFRESH_INTERVAL - confirm.
+ */
+static const struct cyclecounter ptp_vclock_cc = {
+       .read   = ptp_vclock_read,
+       .mask   = CYCLECOUNTER_MASK(32),
+       .mult   = PTP_VCLOCK_CC_MULT,
+       .shift  = PTP_VCLOCK_CC_SHIFT,
+};
+
+/*
+ * Create a virtual clock on top of physical clock @pclock and register it
+ * as a PTP clock.
+ *
+ * Returns the new vclock, or NULL if the allocation or the clock
+ * registration fails. A vclock obtained here is released with
+ * ptp_vclock_unregister().
+ */
+struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+{
+       struct ptp_vclock *vclock;
+
+       vclock = kzalloc(sizeof(*vclock), GFP_KERNEL);
+       if (!vclock)
+               return NULL;
+
+       /* start from the shared templates; only the name is per instance */
+       vclock->pclock = pclock;
+       vclock->info = ptp_vclock_info;
+       vclock->cc = ptp_vclock_cc;
+
+       snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt",
+                pclock->index);
+
+       spin_lock_init(&vclock->lock);
+
+       /* the physical clock's device becomes the parent of the new clock */
+       vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev);
+       if (IS_ERR_OR_NULL(vclock->clock)) {
+               kfree(vclock);
+               return NULL;
+       }
+
+       /* virtual time starts at 0; then kick off the periodic refresh */
+       timecounter_init(&vclock->tc, &vclock->cc, 0);
+       ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+       return vclock;
+}
+
+/*
+ * Unregister the PTP clock of @vclock and free the vclock.
+ *
+ * NOTE(review): the return value of ptp_clock_unregister() is ignored and
+ * the vclock is freed unconditionally - confirm that unregistering a
+ * virtual clock cannot fail, otherwise the clock would outlive the memory
+ * its info lives in.
+ */
+void ptp_vclock_unregister(struct ptp_vclock *vclock)
+{
+       ptp_clock_unregister(vclock->clock);
+       kfree(vclock);
+}
+
+/*
+ * Copy out the indices of the virtual clocks of physical clock
+ * "ptp<pclock_index>".
+ *
+ * On success *@vclock_index points to a newly allocated array holding the
+ * clock's n_vclocks indices and the number of entries is returned; the
+ * array is handed to the caller, who is expected to kfree() it.
+ *
+ * Returns 0 if @pclock_index is negative, no such device exists, waiting
+ * for n_vclocks_mux was interrupted or the allocation failed.
+ * *@vclock_index is written only once the allocation has been attempted
+ * (it is NULL if that failed).
+ */
+int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
+{
+       char name[PTP_CLOCK_NAME_LEN] = "";
+       struct ptp_clock *ptp;
+       struct device *dev;
+       int num = 0;
+
+       if (pclock_index < 0)
+               return num;
+
+       snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index);
+       dev = class_find_device_by_name(ptp_class, name);
+       if (!dev)
+               return num;
+
+       ptp = dev_get_drvdata(dev);
+
+       if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) {
+               put_device(dev);
+               return num;
+       }
+
+       /* kcalloc() checks the count * size product for overflow */
+       *vclock_index = kcalloc(ptp->n_vclocks, sizeof(int), GFP_KERNEL);
+       if (!(*vclock_index))
+               goto out;
+
+       memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks);
+       num = ptp->n_vclocks;
+out:
+       mutex_unlock(&ptp->n_vclocks_mux);
+       /* drop the reference taken by class_find_device_by_name() */
+       put_device(dev);
+       return num;
+}
+EXPORT_SYMBOL(ptp_get_vclocks_index);
+
+/*
+ * Convert the hardware timestamp in @hwtstamps into the time domain of the
+ * virtual clock "ptp<vclock_index>".
+ *
+ * @hwtstamps is left untouched when no such device exists or when the
+ * device is not a virtual clock.
+ */
+void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+                          int vclock_index)
+{
+       char name[PTP_CLOCK_NAME_LEN] = "";
+       struct ptp_vclock *vclock;
+       struct ptp_clock *ptp;
+       unsigned long flags;
+       struct device *dev;
+       u64 ns;
+
+       snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", vclock_index);
+       dev = class_find_device_by_name(ptp_class, name);
+       if (!dev)
+               return;
+
+       ptp = dev_get_drvdata(dev);
+       if (!ptp->is_virtual_clock) {
+               put_device(dev);
+               return;
+       }
+
+       /* only valid for virtual clocks, whose info is embedded in a vclock */
+       vclock = info_to_vclock(ptp->info);
+
+       ns = ktime_to_ns(hwtstamps->hwtstamp);
+
+       /* treat the timestamp as a cycle value of the vclock's time counter */
+       spin_lock_irqsave(&vclock->lock, flags);
+       ns = timecounter_cyc2time(&vclock->tc, ns);
+       spin_unlock_irqrestore(&vclock->lock, flags);
+
+       put_device(dev);
+       hwtstamps->hwtstamp = ns_to_ktime(ns);
+}
+EXPORT_SYMBOL(ptp_convert_timestamp);
index 5537b5f..e157273 100644 (file)
@@ -190,12 +190,9 @@ static int berlin_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                return 0;
        }
 
-       if (state->period != pwm->state.period ||
-           state->duty_cycle != pwm->state.duty_cycle) {
-               err = berlin_pwm_config(chip, pwm, state->duty_cycle, state->period);
-               if (err)
-                       return err;
-       }
+       err = berlin_pwm_config(chip, pwm, state->duty_cycle, state->period);
+       if (err)
+               return err;
 
        if (!enabled)
                return berlin_pwm_enable(chip, pwm);
index 8a3d781..fc3cb7d 100644 (file)
@@ -64,6 +64,11 @@ static int ep93xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
        int ret;
        struct ep93xx_pwm *ep93xx_pwm = to_ep93xx_pwm(chip);
        bool enabled = state->enabled;
+       void __iomem *base = ep93xx_pwm->base;
+       unsigned long long c;
+       unsigned long period_cycles;
+       unsigned long duty_cycles;
+       unsigned long term;
 
        if (state->polarity != pwm->state.polarity) {
                if (enabled) {
@@ -97,57 +102,47 @@ static int ep93xx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                return 0;
        }
 
-       if (state->period != pwm->state.period ||
-           state->duty_cycle != pwm->state.duty_cycle) {
-               struct ep93xx_pwm *ep93xx_pwm = to_ep93xx_pwm(chip);
-               void __iomem *base = ep93xx_pwm->base;
-               unsigned long long c;
-               unsigned long period_cycles;
-               unsigned long duty_cycles;
-               unsigned long term;
+       /*
+        * The clock needs to be enabled to access the PWM registers.
+        * Configuration can be changed at any time.
+        */
+       if (!pwm_is_enabled(pwm)) {
+               ret = clk_prepare_enable(ep93xx_pwm->clk);
+               if (ret)
+                       return ret;
+       }
 
-               /*
-                * The clock needs to be enabled to access the PWM registers.
-                * Configuration can be changed at any time.
-                */
-               if (!pwm_is_enabled(pwm)) {
-                       ret = clk_prepare_enable(ep93xx_pwm->clk);
-                       if (ret)
-                               return ret;
-               }
+       c = clk_get_rate(ep93xx_pwm->clk);
+       c *= state->period;
+       do_div(c, 1000000000);
+       period_cycles = c;
+
+       c = period_cycles;
+       c *= state->duty_cycle;
+       do_div(c, state->period);
+       duty_cycles = c;
 
-               c = clk_get_rate(ep93xx_pwm->clk);
-               c *= state->period;
-               do_div(c, 1000000000);
-               period_cycles = c;
-
-               c = period_cycles;
-               c *= state->duty_cycle;
-               do_div(c, state->period);
-               duty_cycles = c;
-
-               if (period_cycles < 0x10000 && duty_cycles < 0x10000) {
-                       term = readw(base + EP93XX_PWMx_TERM_COUNT);
-
-                       /* Order is important if PWM is running */
-                       if (period_cycles > term) {
-                               writew(period_cycles, base + EP93XX_PWMx_TERM_COUNT);
-                               writew(duty_cycles, base + EP93XX_PWMx_DUTY_CYCLE);
-                       } else {
-                               writew(duty_cycles, base + EP93XX_PWMx_DUTY_CYCLE);
-                               writew(period_cycles, base + EP93XX_PWMx_TERM_COUNT);
-                       }
-                       ret = 0;
+       if (period_cycles < 0x10000 && duty_cycles < 0x10000) {
+               term = readw(base + EP93XX_PWMx_TERM_COUNT);
+
+               /* Order is important if PWM is running */
+               if (period_cycles > term) {
+                       writew(period_cycles, base + EP93XX_PWMx_TERM_COUNT);
+                       writew(duty_cycles, base + EP93XX_PWMx_DUTY_CYCLE);
                } else {
-                       ret = -EINVAL;
+                       writew(duty_cycles, base + EP93XX_PWMx_DUTY_CYCLE);
+                       writew(period_cycles, base + EP93XX_PWMx_TERM_COUNT);
                }
+               ret = 0;
+       } else {
+               ret = -EINVAL;
+       }
 
-               if (!pwm_is_enabled(pwm))
-                       clk_disable_unprepare(ep93xx_pwm->clk);
+       if (!pwm_is_enabled(pwm))
+               clk_disable_unprepare(ep93xx_pwm->clk);
 
-               if (ret)
-                       return ret;
-       }
+       if (ret)
+               return ret;
 
        if (!enabled) {
                ret = clk_prepare_enable(ep93xx_pwm->clk);
index 48c31da..54c7990 100644 (file)
@@ -177,12 +177,9 @@ static int spear_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                return 0;
        }
 
-       if (state->period != pwm->state.period ||
-           state->duty_cycle != pwm->state.duty_cycle) {
-               err = spear_pwm_config(chip, pwm, state->duty_cycle, state->period);
-               if (err)
-                       return err;
-       }
+       err = spear_pwm_config(chip, pwm, state->duty_cycle, state->period);
+       if (err)
+               return err;
 
        if (!pwm->state.enabled)
                return spear_pwm_enable(chip, pwm);
index f2a85e8..7004f55 100644 (file)
@@ -183,13 +183,10 @@ static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                        }
                }
 
-               if (state->period != cstate->period ||
-                   state->duty_cycle != cstate->duty_cycle) {
-                       ret = sprd_pwm_config(spc, pwm, state->duty_cycle,
-                                             state->period);
-                       if (ret)
-                               return ret;
-               }
+               ret = sprd_pwm_config(spc, pwm, state->duty_cycle,
+                                     state->period);
+               if (ret)
+                       return ret;
 
                sprd_pwm_write(spc, pwm->hwpwm, SPRD_PWM_ENABLE, 1);
        } else if (cstate->enabled) {
index dec3f1f..35eb19a 100644 (file)
@@ -189,16 +189,13 @@ static int ecap_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
                return 0;
        }
 
-       if (state->period != pwm->state.period ||
-           state->duty_cycle != pwm->state.duty_cycle) {
-               if (state->period > NSEC_PER_SEC)
-                       return -ERANGE;
+       if (state->period > NSEC_PER_SEC)
+               return -ERANGE;
 
-               err = ecap_pwm_config(chip, pwm, state->duty_cycle,
-                                     state->period, enabled);
-               if (err)
-                       return err;
-       }
+       err = ecap_pwm_config(chip, pwm, state->duty_cycle,
+                             state->period, enabled);
+       if (err)
+               return err;
 
        if (!enabled)
                return ecap_pwm_enable(chip, pwm);
index 8abb429..cc8237a 100644 (file)
@@ -371,8 +371,6 @@ __tapechar_ioctl(struct tape_device *device,
                        case MTSEEK:
                                if (device->required_tapemarks)
                                        tape_std_terminate_write(device);
-                       default:
-                               ;
                }
                rc = tape_mtop(device, op.mt_op, op.mt_count);
 
index b341075..377e368 100644 (file)
@@ -1454,6 +1454,7 @@ again:
                                get_ccwdev_lock(ch->cdev), saveflags);
                if (rc != 0)
                        ctcm_ccw_check_rc(ch, rc, "normal RX");
+               break;
        default:
                break;
        }
index d308ff7..f0d6f20 100644 (file)
@@ -434,6 +434,7 @@ static int qeth_l3_correct_routing_type(struct qeth_card *card,
                        if (qeth_is_ipafunc_supported(card, prot,
                                                      IPA_OSA_MC_ROUTER))
                                return 0;
+                       goto out_inval;
                default:
                        goto out_inval;
                }
index 9f5068f..dd20541 100644 (file)
@@ -461,7 +461,7 @@ static void sas_discover_domain(struct work_struct *work)
                break;
 #else
                pr_notice("ATA device seen but CONFIG_SCSI_SAS_ATA=N so cannot attach\n");
-               /* Fall through */
+               fallthrough;
 #endif
                /* Fall through - only for the #else condition above. */
        default:
index 6d2d636..b8d55af 100644 (file)
@@ -98,11 +98,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
 MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC);
 
-#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
 #define SD_MINORS      16
-#else
-#define SD_MINORS      0
-#endif
 
 static void sd_config_discard(struct scsi_disk *, unsigned int);
 static void sd_config_write_same(struct scsi_disk *);
index 8e85889..15db7a3 100644 (file)
@@ -586,6 +586,7 @@ static int qe_ep_init(struct qe_udc *udc,
                        case USB_SPEED_FULL:
                                if (max <= 1023)
                                        break;
+                               fallthrough;
                        default:
                                goto en_done;
                        }
index ffbf900..438e2c7 100644 (file)
@@ -241,6 +241,8 @@ xilinx_fb_blank(int blank_mode, struct fb_info *fbi)
        case FB_BLANK_POWERDOWN:
                /* turn off panel */
                xilinx_fb_out32(drvdata, REG_CTRL, 0);
+               break;
+
        default:
                break;
        }
index 38b127b..9e7d9d0 100644 (file)
@@ -1498,9 +1498,18 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
                return;
 
-       mutex_lock(&fs_info->reclaim_bgs_lock);
+       /*
+        * Long running balances can keep us blocked here for eternity, so
+        * simply skip reclaim if we're unable to get the mutex.
+        */
+       if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+               btrfs_exclop_finish(fs_info);
+               return;
+       }
+
        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->reclaim_bgs)) {
+               u64 zone_unusable;
                int ret = 0;
 
                bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1534,13 +1543,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
                        goto next;
                }
 
+               /*
+                * Cache the zone_unusable value before turning the block group
+                * to read only. As soon as the block group is read only its
+                * zone_unusable value gets moved to the block group's read-only
+                * bytes and isn't available for calculations anymore.
+                */
+               zone_unusable = bg->zone_unusable;
                ret = inc_block_group_ro(bg, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0)
                        goto next;
 
-               btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
-                               bg->start, div_u64(bg->used * 100, bg->length));
+               btrfs_info(fs_info,
+                       "reclaiming chunk %llu with %llu%% used %llu%% unusable",
+                               bg->start, div_u64(bg->used * 100, bg->length),
+                               div64_u64(zone_unusable * 100, bg->length));
                trace_btrfs_reclaim_block_group(bg);
                ret = btrfs_relocate_chunk(fs_info, bg->start);
                if (ret)
@@ -2197,6 +2215,13 @@ error:
        return ret;
 }
 
+/*
+ * This function, insert_block_group_item(), belongs to the phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
 static int insert_block_group_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_block_group *block_group)
 {
@@ -2219,15 +2244,19 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
        return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
 }
 
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *block_group;
        int ret = 0;
 
-       if (!trans->can_flush_pending_bgs)
-               return;
-
        while (!list_empty(&trans->new_bgs)) {
                int index;
 
@@ -2242,6 +2271,13 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
                ret = insert_block_group_item(trans, block_group);
                if (ret)
                        btrfs_abort_transaction(trans, ret);
+               if (!block_group->chunk_item_inserted) {
+                       mutex_lock(&fs_info->chunk_mutex);
+                       ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+                       mutex_unlock(&fs_info->chunk_mutex);
+                       if (ret)
+                               btrfs_abort_transaction(trans, ret);
+               }
                ret = btrfs_finish_chunk_alloc(trans, block_group->start,
                                        block_group->length);
                if (ret)
@@ -2265,8 +2301,9 @@ next:
        btrfs_trans_release_chunk_metadata(trans);
 }
 
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-                          u64 type, u64 chunk_offset, u64 size)
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+                                                u64 bytes_used, u64 type,
+                                                u64 chunk_offset, u64 size)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group *cache;
@@ -2276,7 +2313,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
 
        cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
        if (!cache)
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
 
        cache->length = size;
        set_free_space_tree_thresholds(cache);
@@ -2290,7 +2327,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
        ret = btrfs_load_block_group_zone_info(cache, true);
        if (ret) {
                btrfs_put_block_group(cache);
-               return ret;
+               return ERR_PTR(ret);
        }
 
        ret = exclude_super_stripes(cache);
@@ -2298,7 +2335,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
                /* We may have excluded something, so call this just in case */
                btrfs_free_excluded_extents(cache);
                btrfs_put_block_group(cache);
-               return ret;
+               return ERR_PTR(ret);
        }
 
        add_new_free_space(cache, chunk_offset, chunk_offset + size);
@@ -2325,7 +2362,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
        if (ret) {
                btrfs_remove_free_space_cache(cache);
                btrfs_put_block_group(cache);
-               return ret;
+               return ERR_PTR(ret);
        }
 
        /*
@@ -2344,7 +2381,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
        btrfs_update_delayed_refs_rsv(trans);
 
        set_avail_alloc_bits(fs_info, type);
-       return 0;
+       return cache;
 }
 
 /*
@@ -3222,11 +3259,203 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
        return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 }
 
+static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+{
+       struct btrfs_block_group *bg;
+       int ret;
+
+       /*
+        * Check if we have enough space in the system space info because we
+        * will need to update device items in the chunk btree and insert a new
+        * chunk item in the chunk btree as well. This will allocate a new
+        * system block group if needed.
+        */
+       check_system_chunk(trans, flags);
+
+       bg = btrfs_alloc_chunk(trans, flags);
+       if (IS_ERR(bg)) {
+               ret = PTR_ERR(bg);
+               goto out;
+       }
+
+       /*
+        * If this is a system chunk allocation then stop right here and do not
+        * add the chunk item to the chunk btree. This is to prevent a deadlock
+        * because this system chunk allocation can be triggered while COWing
+        * some extent buffer of the chunk btree and while holding a lock on a
+        * parent extent buffer, in which case attempting to insert the chunk
+        * item (or update the device item) would result in a deadlock on that
+        * parent extent buffer. In this case defer the chunk btree updates to
+        * the second phase of chunk allocation and keep our reservation until
+        * the second phase completes.
+        *
+        * This is a rare case and can only be triggered by the very few cases
+        * we have where we need to touch the chunk btree outside chunk allocation
+        * and chunk removal. These cases are basically adding a device, removing
+        * a device or resizing a device.
+        */
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               return 0;
+
+       ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+       /*
+        * Normally we are not expected to fail with -ENOSPC here, since we have
+        * previously reserved space in the system space_info and allocated one
+        * new system chunk if necessary. However there are two exceptions:
+        *
+        * 1) We may have enough free space in the system space_info but all the
+        *    existing system block groups have a profile which can not be used
+        *    for extent allocation.
+        *
+        *    This happens when mounting in degraded mode. For example we have a
+        *    RAID1 filesystem with 2 devices, lose one device and mount the fs
+        *    using the other device in degraded mode. If we then allocate a chunk,
+        *    we may have enough free space in the existing system space_info, but
+        *    none of the block groups can be used for extent allocation since they
+        *    have a RAID1 profile, and because we are in degraded mode with a
+        *    single device, we are forced to allocate a new system chunk with a
+        *    SINGLE profile. Making check_system_chunk() iterate over all system
+        *    block groups and check if they have a usable profile and enough space
+        *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
+        *    try again after forcing allocation of a new system chunk. Like this
+        *    we avoid paying the cost of that search in normal circumstances, when
+        *    we were not mounted in degraded mode;
+        *
+        * 2) We had enough free space in the system space_info, and one suitable
+        *    block group to allocate from when we called check_system_chunk()
+        *    above. However right after we called it, the only system block group
+        *    with enough free space got turned into RO mode by a running scrub,
+        *    and in this case we have to allocate a new one and retry. We only
+        *    need to do this allocate and retry once, since we have a transaction
+        *    handle and scrub uses the commit root to search for block groups.
+        */
+       if (ret == -ENOSPC) {
+               const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+               struct btrfs_block_group *sys_bg;
+
+               sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+               if (IS_ERR(sys_bg)) {
+                       ret = PTR_ERR(sys_bg);
+                       btrfs_abort_transaction(trans, ret);
+                       goto out;
+               }
+
+               ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+               if (ret) {
+                       btrfs_abort_transaction(trans, ret);
+                       goto out;
+               }
+
+               ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+               if (ret) {
+                       btrfs_abort_transaction(trans, ret);
+                       goto out;
+               }
+       } else if (ret) {
+               btrfs_abort_transaction(trans, ret);
+               goto out;
+       }
+out:
+       btrfs_trans_release_chunk_metadata(trans);
+
+       return ret;
+}
+
 /*
- * If force is CHUNK_ALLOC_FORCE:
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ *    the chunk, the chunk mapping, create its block group and add the items
+ *    that belong in the chunk btree to it - more specifically, we need to
+ *    update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ *    group item to the extent btree and the device extent items to the devices
+ *    btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempt to insert the new block group item
+ * in the extent btree right away, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space), however while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ *    metadata space_info and therefore don't trigger allocation of a new chunk.
+ *    However later when the task actually tries to COW an extent buffer from
+ *    the extent btree or from the device btree for example, it is forced to
+ *    allocate a new block group (chunk) because the only one that had enough
+ *    free space was just turned to RO mode by a running scrub for example (or
+ *    device replace, block group reclaim thread, etc), so we can not use it
+ *    for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ *    we end up not allocating a new metadata chunk in that case. However if
+ *    the filesystem was mounted in degraded mode, none of the existing block
+ *    groups might be suitable for extent allocation due to their incompatible
+ *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
+ *    use a RAID1 profile, in degraded mode using a single device). In this case
+ *    when the task attempts to COW some extent buffer of the extent btree for
+ *    example, it will trigger allocation of a new metadata block group with a
+ *    suitable profile (SINGLE profile in the example of the degraded mount of
+ *    the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ *    it attempts to COW an extent buffer from the extent or device btree for
+ *    example, it does not find any free extent in any metadata block group,
+ *    and is therefore forced to try to allocate a new metadata block group.
+ *    This is because some other task allocated all available extents in the
+ *    meanwhile - this typically happens with tasks that don't reserve space
+ *    properly, either intentionally or as a bug. One example where this is
+ *    done intentionally is fsync, as it does not reserve any transaction units
+ *    and ends up allocating a variable number of metadata extents for log
+ *    tree extent buffers.
+ *
+ * We also need this 2 phases setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * For allocation of system chunks, we defer the updates and insertions into the
+ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
+ * if the chunk allocation is triggered while COWing an extent buffer of the
+ * chunk btree, we are holding a lock on the parent of that extent buffer and
+ * doing the chunk btree updates and insertions can require locking that parent.
+ * This is for the very few and rare cases where we update the chunk btree that
+ * are not chunk allocation or chunk removal: adding a device, removing a device
+ * or resizing a device.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunks btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock twice an
+ * extent buffer of the chunk btree, first time before triggering the chunk
+ * allocation and the second time during chunk allocation while attempting to
+ * update the chunks btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
  *    - return 1 if it successfully allocates a chunk,
  *    - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
+ * If @force is NOT CHUNK_ALLOC_FORCE:
  *    - return 0 if it doesn't need to allocate a new chunk,
  *    - return 1 if it successfully allocates a chunk,
  *    - return errors including -ENOSPC otherwise.
@@ -3243,6 +3472,13 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
        /* Don't re-enter if we're already allocating a chunk */
        if (trans->allocating_chunk)
                return -ENOSPC;
+       /*
+        * If we are removing a chunk, don't re-enter or we would deadlock.
+        * System space reservation and system chunk allocation are done by the
+        * chunk remove operation (btrfs_remove_chunk()).
+        */
+       if (trans->removing_chunk)
+               return -ENOSPC;
 
        space_info = btrfs_find_space_info(fs_info, flags);
        ASSERT(space_info);
@@ -3306,13 +3542,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
                        force_metadata_allocation(fs_info);
        }
 
-       /*
-        * Check if we have enough space in SYSTEM chunk because we may need
-        * to update devices.
-        */
-       check_system_chunk(trans, flags);
-
-       ret = btrfs_alloc_chunk(trans, flags);
+       ret = do_chunk_alloc(trans, flags);
        trans->allocating_chunk = false;
 
        spin_lock(&space_info->lock);
@@ -3331,22 +3561,6 @@ out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);
-       /*
-        * When we allocate a new chunk we reserve space in the chunk block
-        * reserve to make sure we can COW nodes/leafs in the chunk tree or
-        * add new nodes/leafs to it if we end up needing to do it when
-        * inserting the chunk item and updating device items as part of the
-        * second phase of chunk allocation, performed by
-        * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
-        * large number of new block groups to create in our transaction
-        * handle's new_bgs list to avoid exhausting the chunk block reserve
-        * in extreme cases - like having a single transaction create many new
-        * block groups when starting to write out the free space caches of all
-        * the block groups that were made dirty during the lifetime of the
-        * transaction.
-        */
-       if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
-               btrfs_create_pending_block_groups(trans);
 
        return ret;
 }
@@ -3367,7 +3581,6 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
  */
 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
 {
-       struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_space_info *info;
        u64 left;
@@ -3382,7 +3595,6 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
        lockdep_assert_held(&fs_info->chunk_mutex);
 
        info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-again:
        spin_lock(&info->lock);
        left = info->total_bytes - btrfs_space_info_used(info, true);
        spin_unlock(&info->lock);
@@ -3401,76 +3613,39 @@ again:
 
        if (left < thresh) {
                u64 flags = btrfs_system_alloc_profile(fs_info);
-               u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
-
-               /*
-                * If there's not available space for the chunk tree (system
-                * space) and there are other tasks that reserved space for
-                * creating a new system block group, wait for them to complete
-                * the creation of their system block group and release excess
-                * reserved space. We do this because:
-                *
-                * *) We can end up allocating more system chunks than necessary
-                *    when there are multiple tasks that are concurrently
-                *    allocating block groups, which can lead to exhaustion of
-                *    the system array in the superblock;
-                *
-                * *) If we allocate extra and unnecessary system block groups,
-                *    despite being empty for a long time, and possibly forever,
-                *    they end not being added to the list of unused block groups
-                *    because that typically happens only when deallocating the
-                *    last extent from a block group - which never happens since
-                *    we never allocate from them in the first place. The few
-                *    exceptions are when mounting a filesystem or running scrub,
-                *    which add unused block groups to the list of unused block
-                *    groups, to be deleted by the cleaner kthread.
-                *    And even when they are added to the list of unused block
-                *    groups, it can take a long time until they get deleted,
-                *    since the cleaner kthread might be sleeping or busy with
-                *    other work (deleting subvolumes, running delayed iputs,
-                *    defrag scheduling, etc);
-                *
-                * This is rare in practice, but can happen when too many tasks
-                * are allocating blocks groups in parallel (via fallocate())
-                * and before the one that reserved space for a new system block
-                * group finishes the block group creation and releases the space
-                * reserved in excess (at btrfs_create_pending_block_groups()),
-                * other tasks end up here and see free system space temporarily
-                * not enough for updating the chunk tree.
-                *
-                * We unlock the chunk mutex before waiting for such tasks and
-                * lock it again after the wait, otherwise we would deadlock.
-                * It is safe to do so because allocating a system chunk is the
-                * first thing done while allocating a new block group.
-                */
-               if (reserved > trans->chunk_bytes_reserved) {
-                       const u64 min_needed = reserved - thresh;
-
-                       mutex_unlock(&fs_info->chunk_mutex);
-                       wait_event(cur_trans->chunk_reserve_wait,
-                          atomic64_read(&cur_trans->chunk_bytes_reserved) <=
-                          min_needed);
-                       mutex_lock(&fs_info->chunk_mutex);
-                       goto again;
-               }
+               struct btrfs_block_group *bg;
 
                /*
                 * Ignore failure to create system chunk. We might end up not
                 * needing it, as we might not need to COW all nodes/leafs from
                 * the paths we visit in the chunk tree (they were already COWed
                 * or created in the current transaction for example).
+                *
+                * Also, if our caller is allocating a system chunk, do not
+                * attempt to insert the chunk item in the chunk btree, as we
+                * could deadlock on an extent buffer since our caller may be
+                * COWing an extent buffer from the chunk btree.
                 */
-               ret = btrfs_alloc_chunk(trans, flags);
+               bg = btrfs_alloc_chunk(trans, flags);
+               if (IS_ERR(bg)) {
+                       ret = PTR_ERR(bg);
+               } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+                       /*
+                        * If we fail to add the chunk item here, we end up
+                        * trying again at phase 2 of chunk allocation, at
+                        * btrfs_create_pending_block_groups(). So ignore
+                        * any error here.
+                        */
+                       btrfs_chunk_alloc_add_chunk_item(trans, bg);
+               }
        }
 
        if (!ret) {
                ret = btrfs_block_rsv_add(fs_info->chunk_root,
                                          &fs_info->chunk_block_rsv,
                                          thresh, BTRFS_RESERVE_NO_FLUSH);
-               if (!ret) {
-                       atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
+               if (!ret)
                        trans->chunk_bytes_reserved += thresh;
-               }
        }
 }
 
index 7b92742..c72a71e 100644 (file)
@@ -97,6 +97,7 @@ struct btrfs_block_group {
        unsigned int removed:1;
        unsigned int to_copy:1;
        unsigned int relocating_repair:1;
+       unsigned int chunk_item_inserted:1;
 
        int disk_cache_state;
 
@@ -268,8 +269,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work);
 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-                          u64 type, u64 chunk_offset, u64 size);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+                                                u64 bytes_used, u64 type,
+                                                u64 chunk_offset, u64 size);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
                             bool do_chunk_alloc);
index 4bc3ca2..c5c08c8 100644 (file)
@@ -364,49 +364,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
-                                         struct btrfs_trans_handle *trans,
-                                         struct btrfs_root *root,
-                                         u64 parent_start,
-                                         const struct btrfs_disk_key *disk_key,
-                                         int level,
-                                         u64 hint,
-                                         u64 empty_size,
-                                         enum btrfs_lock_nesting nest)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct extent_buffer *ret;
-
-       /*
-        * If we are COWing a node/leaf from the extent, chunk, device or free
-        * space trees, make sure that we do not finish block group creation of
-        * pending block groups. We do this to avoid a deadlock.
-        * COWing can result in allocation of a new chunk, and flushing pending
-        * block groups (btrfs_create_pending_block_groups()) can be triggered
-        * when finishing allocation of a new chunk. Creation of a pending block
-        * group modifies the extent, chunk, device and free space trees,
-        * therefore we could deadlock with ourselves since we are holding a
-        * lock on an extent buffer that btrfs_create_pending_block_groups() may
-        * try to COW later.
-        * For similar reasons, we also need to delay flushing pending block
-        * groups when splitting a leaf or node, from one of those trees, since
-        * we are holding a write lock on it and its parent or when inserting a
-        * new root node for one of those trees.
-        */
-       if (root == fs_info->extent_root ||
-           root == fs_info->chunk_root ||
-           root == fs_info->dev_root ||
-           root == fs_info->free_space_root)
-               trans->can_flush_pending_bgs = false;
-
-       ret = btrfs_alloc_tree_block(trans, root, parent_start,
-                                    root->root_key.objectid, disk_key, level,
-                                    hint, empty_size, nest);
-       trans->can_flush_pending_bgs = true;
-
-       return ret;
-}
-
 /*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -455,8 +412,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
                parent_start = parent->start;
 
-       cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
-                                          level, search_start, empty_size, nest);
+       cow = btrfs_alloc_tree_block(trans, root, parent_start,
+                                    root->root_key.objectid, &disk_key, level,
+                                    search_start, empty_size, nest);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -2458,9 +2416,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        else
                btrfs_node_key(lower, &lower_key, 0);
 
-       c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
-                                        root->node->start, 0,
-                                        BTRFS_NESTING_NEW_ROOT);
+       c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+                                  &lower_key, level, root->node->start, 0,
+                                  BTRFS_NESTING_NEW_ROOT);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -2589,8 +2547,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        mid = (c_nritems + 1) / 2;
        btrfs_node_key(c, &disk_key, mid);
 
-       split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
-                                            c->start, 0, BTRFS_NESTING_SPLIT);
+       split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+                                      &disk_key, level, c->start, 0,
+                                      BTRFS_NESTING_SPLIT);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -3381,10 +3340,10 @@ again:
         * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
         * use BTRFS_NESTING_NEW_ROOT.
         */
-       right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
-                                            l->start, 0, num_doubles ?
-                                            BTRFS_NESTING_NEW_ROOT :
-                                            BTRFS_NESTING_SPLIT);
+       right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+                                      &disk_key, 0, l->start, 0,
+                                      num_doubles ? BTRFS_NESTING_NEW_ROOT :
+                                      BTRFS_NESTING_SPLIT);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
index e6eb209..8f60314 100644 (file)
@@ -2271,13 +2271,127 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
        return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
 }
 
+/*
+ * Split an extent_map at [start, start + len]
+ *
+ * This function is intended to be used only for extract_ordered_extent().
+ */
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+                         u64 pre, u64 post)
+{
+       struct extent_map_tree *em_tree = &inode->extent_tree;
+       struct extent_map *em;
+       struct extent_map *split_pre = NULL;
+       struct extent_map *split_mid = NULL;
+       struct extent_map *split_post = NULL;
+       int ret = 0;
+       int modified;
+       unsigned long flags;
+
+       /* Sanity check */
+       if (pre == 0 && post == 0)
+               return 0;
+
+       split_pre = alloc_extent_map();
+       if (pre)
+               split_mid = alloc_extent_map();
+       if (post)
+               split_post = alloc_extent_map();
+       if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ASSERT(pre + post < len);
+
+       lock_extent(&inode->io_tree, start, start + len - 1);
+       write_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, start, len);
+       if (!em) {
+               ret = -EIO;
+               goto out_unlock;
+       }
+
+       ASSERT(em->len == len);
+       ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+       ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+
+       flags = em->flags;
+       clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+       clear_bit(EXTENT_FLAG_LOGGING, &flags);
+       modified = !list_empty(&em->list);
+
+       /* First, replace the em with a new extent_map starting from em->start */
+       split_pre->start = em->start;
+       split_pre->len = (pre ? pre : em->len - post);
+       split_pre->orig_start = split_pre->start;
+       split_pre->block_start = em->block_start;
+       split_pre->block_len = split_pre->len;
+       split_pre->orig_block_len = split_pre->block_len;
+       split_pre->ram_bytes = split_pre->len;
+       split_pre->flags = flags;
+       split_pre->compress_type = em->compress_type;
+       split_pre->generation = em->generation;
+
+       replace_extent_mapping(em_tree, em, split_pre, modified);
+
+       /*
+        * Now we only have an extent_map at:
+        *     [em->start, em->start + pre] if pre != 0
+        *     [em->start, em->start + em->len - post] if pre == 0
+        */
+
+       if (pre) {
+               /* Insert the middle extent_map */
+               split_mid->start = em->start + pre;
+               split_mid->len = em->len - pre - post;
+               split_mid->orig_start = split_mid->start;
+               split_mid->block_start = em->block_start + pre;
+               split_mid->block_len = split_mid->len;
+               split_mid->orig_block_len = split_mid->block_len;
+               split_mid->ram_bytes = split_mid->len;
+               split_mid->flags = flags;
+               split_mid->compress_type = em->compress_type;
+               split_mid->generation = em->generation;
+               add_extent_mapping(em_tree, split_mid, modified);
+       }
+
+       if (post) {
+               split_post->start = em->start + em->len - post;
+               split_post->len = post;
+               split_post->orig_start = split_post->start;
+               split_post->block_start = em->block_start + em->len - post;
+               split_post->block_len = split_post->len;
+               split_post->orig_block_len = split_post->block_len;
+               split_post->ram_bytes = split_post->len;
+               split_post->flags = flags;
+               split_post->compress_type = em->compress_type;
+               split_post->generation = em->generation;
+               add_extent_mapping(em_tree, split_post, modified);
+       }
+
+       /* Once for us */
+       free_extent_map(em);
+       /* Once for the tree */
+       free_extent_map(em);
+
+out_unlock:
+       write_unlock(&em_tree->lock);
+       unlock_extent(&inode->io_tree, start, start + len - 1);
+out:
+       free_extent_map(split_pre);
+       free_extent_map(split_mid);
+       free_extent_map(split_post);
+
+       return ret;
+}
+
 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
                                           struct bio *bio, loff_t file_offset)
 {
        struct btrfs_ordered_extent *ordered;
-       struct extent_map *em = NULL, *em_new = NULL;
-       struct extent_map_tree *em_tree = &inode->extent_tree;
        u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+       u64 file_len;
        u64 len = bio->bi_iter.bi_size;
        u64 end = start + len;
        u64 ordered_end;
@@ -2317,41 +2431,16 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
                goto out;
        }
 
+       file_len = ordered->num_bytes;
        pre = start - ordered->disk_bytenr;
        post = ordered_end - end;
 
        ret = btrfs_split_ordered_extent(ordered, pre, post);
        if (ret)
                goto out;
-
-       read_lock(&em_tree->lock);
-       em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
-       if (!em) {
-               read_unlock(&em_tree->lock);
-               ret = -EIO;
-               goto out;
-       }
-       read_unlock(&em_tree->lock);
-
-       ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
-       /*
-        * We cannot reuse em_new here but have to create a new one, as
-        * unpin_extent_cache() expects the start of the extent map to be the
-        * logical offset of the file, which does not hold true anymore after
-        * splitting.
-        */
-       em_new = create_io_em(inode, em->start + pre, len,
-                             em->start + pre, em->block_start + pre, len,
-                             len, len, BTRFS_COMPRESS_NONE,
-                             BTRFS_ORDERED_REGULAR);
-       if (IS_ERR(em_new)) {
-               ret = PTR_ERR(em_new);
-               goto out;
-       }
-       free_extent_map(em_new);
+       ret = split_zoned_em(inode, file_offset, file_len, pre, post);
 
 out:
-       free_extent_map(em);
        btrfs_put_ordered_extent(ordered);
 
        return errno_to_blk_status(ret);
index 5031823..14b9fdc 100644 (file)
@@ -254,23 +254,21 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
 }
 
 /*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
  */
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
-       struct btrfs_transaction *cur_trans = trans->transaction;
 
        if (!trans->chunk_bytes_reserved)
                return;
 
-       WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
        btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
                                trans->chunk_bytes_reserved, NULL);
-       atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
-       cond_wake_up(&cur_trans->chunk_reserve_wait);
        trans->chunk_bytes_reserved = 0;
 }
 
@@ -386,8 +384,6 @@ loop:
        spin_lock_init(&cur_trans->dropped_roots_lock);
        INIT_LIST_HEAD(&cur_trans->releasing_ebs);
        spin_lock_init(&cur_trans->releasing_ebs_lock);
-       atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
-       init_waitqueue_head(&cur_trans->chunk_reserve_wait);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
                        IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -701,7 +697,6 @@ again:
        h->fs_info = root->fs_info;
 
        h->type = type;
-       h->can_flush_pending_bgs = true;
        INIT_LIST_HEAD(&h->new_bgs);
 
        smp_mb();
index 07d7602..ba45065 100644 (file)
@@ -96,13 +96,6 @@ struct btrfs_transaction {
 
        spinlock_t releasing_ebs_lock;
        struct list_head releasing_ebs;
-
-       /*
-        * The number of bytes currently reserved, by all transaction handles
-        * attached to this transaction, for metadata extents of the chunk tree.
-        */
-       atomic64_t chunk_bytes_reserved;
-       wait_queue_head_t chunk_reserve_wait;
 };
 
 #define __TRANS_FREEZABLE      (1U << 0)
@@ -139,7 +132,7 @@ struct btrfs_trans_handle {
        short aborted;
        bool adding_csums;
        bool allocating_chunk;
-       bool can_flush_pending_bgs;
+       bool removing_chunk;
        bool reloc_reserved;
        bool in_fsync;
        struct btrfs_root *root;
index cab451d..dc6eb08 100644 (file)
@@ -3173,7 +3173,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                if (!log_root_tree->node) {
                        ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
                        if (ret) {
-                               mutex_unlock(&fs_info->tree_log_mutex);
+                               mutex_unlock(&fs_info->tree_root->log_mutex);
                                goto out;
                        }
                }
index 807502c..1e4d43f 100644 (file)
@@ -1745,19 +1745,14 @@ again:
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
-               btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
                goto out;
        }
 
        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
 
        ret = btrfs_del_item(trans, root, path);
-       if (ret) {
-               btrfs_handle_fs_error(fs_info, ret,
-                                     "Failed to remove dev extent item");
-       } else {
+       if (ret == 0)
                set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
-       }
 out:
        btrfs_free_path(path);
        return ret;
@@ -2942,7 +2937,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
        u32 cur;
        struct btrfs_key key;
 
-       mutex_lock(&fs_info->chunk_mutex);
+       lockdep_assert_held(&fs_info->chunk_mutex);
        array_size = btrfs_super_sys_array_size(super_copy);
 
        ptr = super_copy->sys_chunk_array;
@@ -2972,7 +2967,6 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
                        cur += len;
                }
        }
-       mutex_unlock(&fs_info->chunk_mutex);
        return ret;
 }
 
@@ -3012,6 +3006,29 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
        return em;
 }
 
+static int remove_chunk_item(struct btrfs_trans_handle *trans,
+                            struct map_lookup *map, u64 chunk_offset)
+{
+       int i;
+
+       /*
+        * Removing chunk items and updating the device items in the chunks btree
+        * requires holding the chunk_mutex.
+        * See the comment at btrfs_chunk_alloc() for the details.
+        */
+       lockdep_assert_held(&trans->fs_info->chunk_mutex);
+
+       for (i = 0; i < map->num_stripes; i++) {
+               int ret;
+
+               ret = btrfs_update_device(trans, map->stripes[i].dev);
+               if (ret)
+                       return ret;
+       }
+
+       return btrfs_free_chunk(trans, chunk_offset);
+}
+
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -3032,14 +3049,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
                return PTR_ERR(em);
        }
        map = em->map_lookup;
-       mutex_lock(&fs_info->chunk_mutex);
-       check_system_chunk(trans, map->type);
-       mutex_unlock(&fs_info->chunk_mutex);
 
        /*
-        * Take the device list mutex to prevent races with the final phase of
-        * a device replace operation that replaces the device object associated
-        * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+        * First delete the device extent items from the devices btree.
+        * We take the device_list_mutex to avoid racing with the finishing phase
+        * of a device replace operation. See the comment below before acquiring
+        * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
+        * because that can result in a deadlock when deleting the device extent
+        * items from the devices btree - COWing an extent buffer from the btree
+        * may result in allocating a new metadata chunk, which would attempt to
+        * lock again fs_info->chunk_mutex.
         */
        mutex_lock(&fs_devices->device_list_mutex);
        for (i = 0; i < map->num_stripes; i++) {
@@ -3061,18 +3080,73 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
                        btrfs_clear_space_info_full(fs_info);
                        mutex_unlock(&fs_info->chunk_mutex);
                }
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
 
-               ret = btrfs_update_device(trans, device);
+       /*
+        * We acquire fs_info->chunk_mutex for 2 reasons:
+        *
+        * 1) Just like with the first phase of the chunk allocation, we must
+        *    reserve system space, do all chunk btree updates and deletions, and
+        *    update the system chunk array in the superblock while holding this
+        *    mutex. This is for similar reasons as explained on the comment at
+        *    the top of btrfs_chunk_alloc();
+        *
+        * 2) Prevent races with the final phase of a device replace operation
+        *    that replaces the device object associated with the map's stripes,
+        *    because the device object's id can change at any time during that
+        *    final phase of the device replace operation
+        *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+        *    replaced device and then see it with an ID of
+        *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
+        *    the device item, which does not exist in the chunk btree.
+        *    The finishing phase of device replace acquires both the
+        *    device_list_mutex and the chunk_mutex, in that order, so we are
+        *    safe by just acquiring the chunk_mutex.
+        */
+       trans->removing_chunk = true;
+       mutex_lock(&fs_info->chunk_mutex);
+
+       check_system_chunk(trans, map->type);
+
+       ret = remove_chunk_item(trans, map, chunk_offset);
+       /*
+        * Normally we should not get -ENOSPC since we reserved space before
+        * through the call to check_system_chunk().
+        *
+        * Despite our system space_info having enough free space, we may not
+        * be able to allocate extents from its block groups, because all have
+        * an incompatible profile, which will force us to allocate a new system
+        * block group with the right profile, or right after we called
+        * check_system_chunk() above, a scrub turned the only system block group
+        * with enough free space into RO mode.
+        * This is explained with more detail at do_chunk_alloc().
+        *
+        * So if we get -ENOSPC, allocate a new system chunk and retry once.
+        */
+       if (ret == -ENOSPC) {
+               const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
+               struct btrfs_block_group *sys_bg;
+
+               sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+               if (IS_ERR(sys_bg)) {
+                       ret = PTR_ERR(sys_bg);
+                       btrfs_abort_transaction(trans, ret);
+                       goto out;
+               }
+
+               ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
                if (ret) {
-                       mutex_unlock(&fs_devices->device_list_mutex);
                        btrfs_abort_transaction(trans, ret);
                        goto out;
                }
-       }
-       mutex_unlock(&fs_devices->device_list_mutex);
 
-       ret = btrfs_free_chunk(trans, chunk_offset);
-       if (ret) {
+               ret = remove_chunk_item(trans, map, chunk_offset);
+               if (ret) {
+                       btrfs_abort_transaction(trans, ret);
+                       goto out;
+               }
+       } else if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
@@ -3087,6 +3161,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
                }
        }
 
+       mutex_unlock(&fs_info->chunk_mutex);
+       trans->removing_chunk = false;
+
+       /*
+        * We are done with chunk btree updates and deletions, so release the
+        * system space we previously reserved (with check_system_chunk()).
+        */
+       btrfs_trans_release_chunk_metadata(trans);
+
        ret = btrfs_remove_block_group(trans, chunk_offset, em);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
@@ -3094,6 +3177,10 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
        }
 
 out:
+       if (trans->removing_chunk) {
+               mutex_unlock(&fs_info->chunk_mutex);
+               trans->removing_chunk = false;
+       }
        /* once for us */
        free_extent_map(em);
        return ret;
@@ -4860,13 +4947,12 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
        u32 array_size;
        u8 *ptr;
 
-       mutex_lock(&fs_info->chunk_mutex);
+       lockdep_assert_held(&fs_info->chunk_mutex);
+
        array_size = btrfs_super_sys_array_size(super_copy);
        if (array_size + item_size + sizeof(disk_key)
-                       > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
-               mutex_unlock(&fs_info->chunk_mutex);
+                       > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
                return -EFBIG;
-       }
 
        ptr = super_copy->sys_chunk_array + array_size;
        btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4875,7 +4961,6 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
        memcpy(ptr, chunk, item_size);
        item_size += sizeof(disk_key);
        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
-       mutex_unlock(&fs_info->chunk_mutex);
 
        return 0;
 }
@@ -5225,13 +5310,14 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
        }
 }
 
-static int create_chunk(struct btrfs_trans_handle *trans,
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
                        struct alloc_chunk_ctl *ctl,
                        struct btrfs_device_info *devices_info)
 {
        struct btrfs_fs_info *info = trans->fs_info;
        struct map_lookup *map = NULL;
        struct extent_map_tree *em_tree;
+       struct btrfs_block_group *block_group;
        struct extent_map *em;
        u64 start = ctl->start;
        u64 type = ctl->type;
@@ -5241,7 +5327,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
 
        map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
        if (!map)
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
        map->num_stripes = ctl->num_stripes;
 
        for (i = 0; i < ctl->ndevs; ++i) {
@@ -5263,7 +5349,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
        em = alloc_extent_map();
        if (!em) {
                kfree(map);
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
        }
        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
        em->map_lookup = map;
@@ -5279,12 +5365,12 @@ static int create_chunk(struct btrfs_trans_handle *trans,
        if (ret) {
                write_unlock(&em_tree->lock);
                free_extent_map(em);
-               return ret;
+               return ERR_PTR(ret);
        }
        write_unlock(&em_tree->lock);
 
-       ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
-       if (ret)
+       block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+       if (IS_ERR(block_group))
                goto error_del_extent;
 
        for (i = 0; i < map->num_stripes; i++) {
@@ -5304,7 +5390,7 @@ static int create_chunk(struct btrfs_trans_handle *trans,
        check_raid56_incompat_flag(info, type);
        check_raid1c34_incompat_flag(info, type);
 
-       return 0;
+       return block_group;
 
 error_del_extent:
        write_lock(&em_tree->lock);
@@ -5316,34 +5402,36 @@ error_del_extent:
        /* One for the tree reference */
        free_extent_map(em);
 
-       return ret;
+       return block_group;
 }
 
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                                           u64 type)
 {
        struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_fs_devices *fs_devices = info->fs_devices;
        struct btrfs_device_info *devices_info = NULL;
        struct alloc_chunk_ctl ctl;
+       struct btrfs_block_group *block_group;
        int ret;
 
        lockdep_assert_held(&info->chunk_mutex);
 
        if (!alloc_profile_is_valid(type, 0)) {
                ASSERT(0);
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
        }
 
        if (list_empty(&fs_devices->alloc_list)) {
                if (btrfs_test_opt(info, ENOSPC_DEBUG))
                        btrfs_debug(info, "%s: no writable device", __func__);
-               return -ENOSPC;
+               return ERR_PTR(-ENOSPC);
        }
 
        if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
                btrfs_err(info, "invalid chunk type 0x%llx requested", type);
                ASSERT(0);
-               return -EINVAL;
+               return ERR_PTR(-EINVAL);
        }
 
        ctl.start = find_next_chunk(info);
@@ -5353,46 +5441,43 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
                               GFP_NOFS);
        if (!devices_info)
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
 
        ret = gather_device_info(fs_devices, &ctl, devices_info);
-       if (ret < 0)
+       if (ret < 0) {
+               block_group = ERR_PTR(ret);
                goto out;
+       }
 
        ret = decide_stripe_size(fs_devices, &ctl, devices_info);
-       if (ret < 0)
+       if (ret < 0) {
+               block_group = ERR_PTR(ret);
                goto out;
+       }
 
-       ret = create_chunk(trans, &ctl, devices_info);
+       block_group = create_chunk(trans, &ctl, devices_info);
 
 out:
        kfree(devices_info);
-       return ret;
+       return block_group;
 }
 
 /*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
+ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
  */
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                             u64 chunk_offset, u64 chunk_size)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
-       struct btrfs_root *extent_root = fs_info->extent_root;
-       struct btrfs_root *chunk_root = fs_info->chunk_root;
-       struct btrfs_key key;
        struct btrfs_device *device;
-       struct btrfs_chunk *chunk;
-       struct btrfs_stripe *stripe;
        struct extent_map *em;
        struct map_lookup *map;
-       size_t item_size;
        u64 dev_offset;
        u64 stripe_size;
-       int i = 0;
+       int i;
        int ret = 0;
 
        em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
@@ -5400,53 +5485,117 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                return PTR_ERR(em);
 
        map = em->map_lookup;
-       item_size = btrfs_chunk_item_size(map->num_stripes);
        stripe_size = em->orig_block_len;
 
-       chunk = kzalloc(item_size, GFP_NOFS);
-       if (!chunk) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
        /*
         * Take the device list mutex to prevent races with the final phase of
         * a device replace operation that replaces the device object associated
         * with the map's stripes, because the device object's id can change
         * at any time during that final phase of the device replace operation
-        * (dev-replace.c:btrfs_dev_replace_finishing()).
+        * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+        * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+        * resulting in persisting a device extent item with such ID.
         */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        for (i = 0; i < map->num_stripes; i++) {
                device = map->stripes[i].dev;
                dev_offset = map->stripes[i].physical;
 
-               ret = btrfs_update_device(trans, device);
-               if (ret)
-                       break;
                ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
                                             dev_offset, stripe_size);
                if (ret)
                        break;
        }
-       if (ret) {
-               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+       free_extent_map(em);
+       return ret;
+}
+
+/*
+ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
+ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
+ * chunks.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+                                    struct btrfs_block_group *bg)
+{
+       struct btrfs_fs_info *fs_info = trans->fs_info;
+       struct btrfs_root *extent_root = fs_info->extent_root;
+       struct btrfs_root *chunk_root = fs_info->chunk_root;
+       struct btrfs_key key;
+       struct btrfs_chunk *chunk;
+       struct btrfs_stripe *stripe;
+       struct extent_map *em;
+       struct map_lookup *map;
+       size_t item_size;
+       int i;
+       int ret;
+
+       /*
+        * We take the chunk_mutex for 2 reasons:
+        *
+        * 1) Updates and insertions in the chunk btree must be done while holding
+        *    the chunk_mutex, as well as updating the system chunk array in the
+        *    superblock. See the comment on top of btrfs_chunk_alloc() for the
+        *    details;
+        *
+        * 2) To prevent races with the final phase of a device replace operation
+        *    that replaces the device object associated with the map's stripes,
+        *    because the device object's id can change at any time during that
+        *    final phase of the device replace operation
+        *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+        *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+        *    which would cause a failure when updating the device item, which does
+        *    not exist, or persisting a stripe of the chunk item with such ID.
+        *    Here we can't use the device_list_mutex because our caller already
+        *    has locked the chunk_mutex, and the final phase of device replace
+        *    acquires both mutexes - first the device_list_mutex and then the
+        *    chunk_mutex. Using any of those two mutexes protects us from a
+        *    concurrent device replace.
+        */
+       lockdep_assert_held(&fs_info->chunk_mutex);
+
+       em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               btrfs_abort_transaction(trans, ret);
+               return ret;
+       }
+
+       map = em->map_lookup;
+       item_size = btrfs_chunk_item_size(map->num_stripes);
+
+       chunk = kzalloc(item_size, GFP_NOFS);
+       if (!chunk) {
+               ret = -ENOMEM;
+               btrfs_abort_transaction(trans, ret);
                goto out;
        }
 
+       for (i = 0; i < map->num_stripes; i++) {
+               struct btrfs_device *device = map->stripes[i].dev;
+
+               ret = btrfs_update_device(trans, device);
+               if (ret)
+                       goto out;
+       }
+
        stripe = &chunk->stripe;
        for (i = 0; i < map->num_stripes; i++) {
-               device = map->stripes[i].dev;
-               dev_offset = map->stripes[i].physical;
+               struct btrfs_device *device = map->stripes[i].dev;
+               const u64 dev_offset = map->stripes[i].physical;
 
                btrfs_set_stack_stripe_devid(stripe, device->devid);
                btrfs_set_stack_stripe_offset(stripe, dev_offset);
                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
                stripe++;
        }
-       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-       btrfs_set_stack_chunk_length(chunk, chunk_size);
+       btrfs_set_stack_chunk_length(chunk, bg->length);
        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
        btrfs_set_stack_chunk_type(chunk, map->type);
@@ -5458,15 +5607,18 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.type = BTRFS_CHUNK_ITEM_KEY;
-       key.offset = chunk_offset;
+       key.offset = bg->start;
 
        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-       if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-               /*
-                * TODO: Cleanup of inserted chunk root in case of
-                * failure.
-                */
+       if (ret)
+               goto out;
+
+       bg->chunk_item_inserted = 1;
+
+       if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+               if (ret)
+                       goto out;
        }
 
 out:
@@ -5479,16 +5631,41 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
 {
        struct btrfs_fs_info *fs_info = trans->fs_info;
        u64 alloc_profile;
-       int ret;
+       struct btrfs_block_group *meta_bg;
+       struct btrfs_block_group *sys_bg;
+
+       /*
+        * When adding a new device for sprouting, the seed device is read-only
+        * so we must first allocate a metadata and a system chunk. But before
+        * adding the block group items to the extent, device and chunk btrees,
+        * we must first:
+        *
+        * 1) Create both chunks without doing any changes to the btrees, as
+        *    otherwise we would get -ENOSPC since the block groups from the
+        *    seed device are read-only;
+        *
+        * 2) Add the device item for the new sprout device - finishing the setup
+        *    of a new block group requires updating the device item in the chunk
+        *    btree, so it must exist when we attempt to do it. The previous step
+        *    ensures this does not fail with -ENOSPC.
+        *
+        * After that we can add the block group items to their btrees:
+        * update existing device item in the chunk btree, add a new block group
+        * item to the extent btree, add a new chunk item to the chunk btree and
+        * finally add the new device extent items to the devices btree.
+        */
 
        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
-       ret = btrfs_alloc_chunk(trans, alloc_profile);
-       if (ret)
-               return ret;
+       meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       if (IS_ERR(meta_bg))
+               return PTR_ERR(meta_bg);
 
        alloc_profile = btrfs_system_alloc_profile(fs_info);
-       ret = btrfs_alloc_chunk(trans, alloc_profile);
-       return ret;
+       sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+       if (IS_ERR(sys_bg))
+               return PTR_ERR(sys_bg);
+
+       return 0;
 }
 
 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
@@ -7415,10 +7592,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
                        total_dev++;
                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
                        struct btrfs_chunk *chunk;
+
+                       /*
+                        * We are only called at mount time, so no need to take
+                        * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
+                        * we always lock first fs_info->chunk_mutex before
+                        * acquiring any locks on the chunk tree. This is a
+                        * requirement for chunk allocation, see the comment on
+                        * top of btrfs_chunk_alloc() for details.
+                        */
+                       ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
-                       mutex_lock(&fs_info->chunk_mutex);
                        ret = read_one_chunk(&found_key, leaf, chunk);
-                       mutex_unlock(&fs_info->chunk_mutex);
                        if (ret)
                                goto error;
                }
index c7fc7ca..55a8ba2 100644 (file)
@@ -450,7 +450,8 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map,
                          struct btrfs_io_geometry *io_geom);
 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+                                           u64 type);
 void btrfs_mapping_tree_free(struct extent_map_tree *tree);
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
                           int mirror_num);
@@ -509,6 +510,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
                                    u64 logical);
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
                             u64 chunk_offset, u64 chunk_size);
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+                                    struct btrfs_block_group *bg);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
                                       u64 logical, u64 length);
index 2f63bf3..5a0be99 100644 (file)
@@ -91,7 +91,10 @@ static ssize_t configfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
        }
        pr_debug("%s: count = %zd, pos = %lld, buf = %s\n",
                 __func__, iov_iter_count(to), iocb->ki_pos, buffer->page);
-       retval = copy_to_iter(buffer->page, buffer->count, to);
+       if (iocb->ki_pos >= buffer->count)
+               goto out;
+       retval = copy_to_iter(buffer->page + iocb->ki_pos,
+                             buffer->count - iocb->ki_pos, to);
        iocb->ki_pos += retval;
        if (retval == 0)
                retval = -EFAULT;
@@ -162,7 +165,10 @@ static ssize_t configfs_bin_read_iter(struct kiocb *iocb, struct iov_iter *to)
                buffer->needs_read_fill = 0;
        }
 
-       retval = copy_to_iter(buffer->bin_buffer, buffer->bin_buffer_size, to);
+       if (iocb->ki_pos >= buffer->bin_buffer_size)
+               goto out;
+       retval = copy_to_iter(buffer->bin_buffer + iocb->ki_pos,
+                             buffer->bin_buffer_size - iocb->ki_pos, to);
        iocb->ki_pos += retval;
        if (retval == 0)
                retval = -EFAULT;
@@ -171,21 +177,28 @@ out:
        return retval;
 }
 
-static int fill_write_buffer(struct configfs_buffer *buffer,
+/* Fill buffer->page, starting at offset @pos, with data coming from @from. */
+static int fill_write_buffer(struct configfs_buffer *buffer, loff_t pos,
                             struct iov_iter *from)
 {
+       loff_t to_copy;
        int copied;
+       u8 *to;
 
        if (!buffer->page)
                buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0);
        if (!buffer->page)
                return -ENOMEM;
 
-       copied = copy_from_iter(buffer->page, SIMPLE_ATTR_SIZE - 1, from);
+       to_copy = SIMPLE_ATTR_SIZE - 1 - pos;
+       if (to_copy <= 0)
+               return 0;
+       to = buffer->page + pos;
+       copied = copy_from_iter(to, to_copy, from);
        buffer->needs_read_fill = 1;
        /* if buf is assumed to contain a string, terminate it by \0,
         * so e.g. sscanf() can scan the string easily */
-       buffer->page[copied] = 0;
+       to[copied] = 0;
        return copied ? : -EFAULT;
 }
 
@@ -217,7 +230,7 @@ static ssize_t configfs_write_iter(struct kiocb *iocb, struct iov_iter *from)
        ssize_t len;
 
        mutex_lock(&buffer->mutex);
-       len = fill_write_buffer(buffer, from);
+       len = fill_write_buffer(buffer, iocb->ki_pos, from);
        if (len > 0)
                len = flush_write_buffer(file, buffer, len);
        if (len > 0)
@@ -272,7 +285,9 @@ static ssize_t configfs_bin_write_iter(struct kiocb *iocb,
                buffer->bin_buffer_size = end_offset;
        }
 
-       len = copy_from_iter(buffer->bin_buffer, buffer->bin_buffer_size, from);
+       len = copy_from_iter(buffer->bin_buffer + iocb->ki_pos,
+                            buffer->bin_buffer_size - iocb->ki_pos, from);
+       iocb->ki_pos += len;
 out:
        mutex_unlock(&buffer->mutex);
        return len ? : -EFAULT;
index dfc72f1..f946bec 100644 (file)
@@ -369,8 +369,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
        /* 32-bit arches must use fcntl64() */
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
-#endif
                fallthrough;
+#endif
        case F_SETLK:
        case F_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock)))
index 2834d1a..de1985e 100644 (file)
@@ -80,6 +80,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
 }
 
 /**
+ * vfs_parse_fs_param_source - Handle setting "source" via parameter
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * This is a simple helper for filesystems to verify that the "source" they
+ * accept is sane.
+ *
+ * Returns 0 on success, -ENOPARAM if this is not the "source" parameter, and
+ * -EINVAL otherwise. In the event of failure, supplementary error information
+ * is logged.
+ */
+int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param)
+{
+       if (strcmp(param->key, "source") != 0)
+               return -ENOPARAM;
+
+       if (param->type != fs_value_is_string)
+               return invalf(fc, "Non-string source");
+
+       if (fc->source)
+               return invalf(fc, "Multiple sources");
+
+       fc->source = param->string;
+       param->string = NULL;
+       return 0;
+}
+EXPORT_SYMBOL(vfs_parse_fs_param_source);
+
+/**
  * vfs_parse_fs_param - Add a single parameter to a superblock config
  * @fc: The filesystem context to modify
  * @param: The parameter
@@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
        /* If the filesystem doesn't take any arguments, give it the
         * default handling of source.
         */
-       if (strcmp(param->key, "source") == 0) {
-               if (param->type != fs_value_is_string)
-                       return invalf(fc, "VFS: Non-string source");
-               if (fc->source)
-                       return invalf(fc, "VFS: Multiple sources");
-               fc->source = param->string;
-               param->string = NULL;
-               return 0;
-       }
+       ret = vfs_parse_fs_param_source(fc, param);
+       if (ret != -ENOPARAM)
+               return ret;
 
        return invalf(fc, "%s: Unknown parameter '%s'",
                      fc->fs_type->name, param->key);
@@ -504,16 +527,11 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
        struct legacy_fs_context *ctx = fc->fs_private;
        unsigned int size = ctx->data_size;
        size_t len = 0;
+       int ret;
 
-       if (strcmp(param->key, "source") == 0) {
-               if (param->type != fs_value_is_string)
-                       return invalf(fc, "VFS: Legacy: Non-string source");
-               if (fc->source)
-                       return invalf(fc, "VFS: Legacy: Multiple sources");
-               fc->source = param->string;
-               param->string = NULL;
-               return 0;
-       }
+       ret = vfs_parse_fs_param_source(fc, param);
+       if (ret != -ENOPARAM)
+               return ret;
 
        if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
                return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
index 4af318f..ef9498a 100644 (file)
@@ -25,7 +25,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
        fd->key = ptr + tree->max_key_len + 2;
        hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
                tree->cnid, __builtin_return_address(0));
-       mutex_lock(&tree->tree_lock);
+       switch (tree->cnid) {
+       case HFS_CAT_CNID:
+               mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
+               break;
+       case HFS_EXT_CNID:
+               mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
+               break;
+       case HFS_ATTR_CNID:
+               mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
+               break;
+       default:
+               return -EINVAL;
+       }
        return 0;
 }
 
index b63a4df..c0a73a6 100644 (file)
 
 #include "btree.h"
 
-void hfs_bnode_read(struct hfs_bnode *node, void *buf,
-               int off, int len)
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 {
        struct page *page;
+       int pagenum;
+       int bytes_read;
+       int bytes_to_read;
+       void *vaddr;
 
        off += node->page_offset;
-       page = node->page[0];
+       pagenum = off >> PAGE_SHIFT;
+       off &= ~PAGE_MASK; /* compute page offset for the first page */
 
-       memcpy(buf, kmap(page) + off, len);
-       kunmap(page);
+       for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) {
+               if (pagenum >= node->tree->pages_per_bnode)
+                       break;
+               page = node->page[pagenum];
+               bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
+
+               vaddr = kmap_atomic(page);
+               memcpy(buf + bytes_read, vaddr + off, bytes_to_read);
+               kunmap_atomic(vaddr);
+
+               pagenum++;
+               off = 0; /* page offset only applies to the first page */
+       }
 }
 
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
index 4ba45ca..0e6baee 100644 (file)
@@ -13,6 +13,13 @@ typedef int (*btree_keycmp)(const btree_key *, const btree_key *);
 
 #define NODE_HASH_SIZE  256
 
+/* B-tree mutex nested subclasses */
+enum hfs_btree_mutex_classes {
+       CATALOG_BTREE_MUTEX,
+       EXTENTS_BTREE_MUTEX,
+       ATTR_BTREE_MUTEX,
+};
+
 /* A HFS BTree held in memory */
 struct hfs_btree {
        struct super_block *sb;
index 44d07c9..12d9bae 100644 (file)
@@ -420,14 +420,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
        if (!res) {
                if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
                        res =  -EIO;
-                       goto bail;
+                       goto bail_hfs_find;
                }
                hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
        }
-       if (res) {
-               hfs_find_exit(&fd);
-               goto bail_no_root;
-       }
+       if (res)
+               goto bail_hfs_find;
        res = -EINVAL;
        root_inode = hfs_iget(sb, &fd.search_key->cat, &rec);
        hfs_find_exit(&fd);
@@ -443,6 +441,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
        /* everything's okay */
        return 0;
 
+bail_hfs_find:
+       hfs_find_exit(&fd);
 bail_no_root:
        pr_err("get root inode failed\n");
 bail:
index eac6788..c4769a9 100644 (file)
@@ -253,7 +253,7 @@ static int vboxsf_dir_instantiate(struct inode *parent, struct dentry *dentry,
 }
 
 static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
-                            umode_t mode, int is_dir)
+                            umode_t mode, bool is_dir, bool excl, u64 *handle_ret)
 {
        struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent);
        struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
@@ -261,10 +261,12 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
        int err;
 
        params.handle = SHFL_HANDLE_NIL;
-       params.create_flags = SHFL_CF_ACT_CREATE_IF_NEW |
-                             SHFL_CF_ACT_FAIL_IF_EXISTS |
-                             SHFL_CF_ACCESS_READWRITE |
-                             (is_dir ? SHFL_CF_DIRECTORY : 0);
+       params.create_flags = SHFL_CF_ACT_CREATE_IF_NEW | SHFL_CF_ACCESS_READWRITE;
+       if (is_dir)
+               params.create_flags |= SHFL_CF_DIRECTORY;
+       if (excl)
+               params.create_flags |= SHFL_CF_ACT_FAIL_IF_EXISTS;
+
        params.info.attr.mode = (mode & 0777) |
                                (is_dir ? SHFL_TYPE_DIRECTORY : SHFL_TYPE_FILE);
        params.info.attr.additional = SHFLFSOBJATTRADD_NOTHING;
@@ -276,30 +278,81 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry,
        if (params.result != SHFL_FILE_CREATED)
                return -EPERM;
 
-       vboxsf_close(sbi->root, params.handle);
-
        err = vboxsf_dir_instantiate(parent, dentry, &params.info);
        if (err)
-               return err;
+               goto out;
 
        /* parent directory access/change time changed */
        sf_parent_i->force_restat = 1;
 
-       return 0;
+out:
+       if (err == 0 && handle_ret)
+               *handle_ret = params.handle;
+       else
+               vboxsf_close(sbi->root, params.handle);
+
+       return err;
 }
 
 static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns,
                             struct inode *parent, struct dentry *dentry,
                             umode_t mode, bool excl)
 {
-       return vboxsf_dir_create(parent, dentry, mode, 0);
+       return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL);
 }
 
 static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns,
                            struct inode *parent, struct dentry *dentry,
                            umode_t mode)
 {
-       return vboxsf_dir_create(parent, dentry, mode, 1);
+       return vboxsf_dir_create(parent, dentry, mode, true, true, NULL);
+}
+
+static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
+                                 struct file *file, unsigned int flags, umode_t mode)
+{
+       struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb);
+       struct vboxsf_handle *sf_handle;
+       struct dentry *res = NULL;
+       u64 handle;
+       int err;
+
+       if (d_in_lookup(dentry)) {
+               res = vboxsf_dir_lookup(parent, dentry, 0);
+               if (IS_ERR(res))
+                       return PTR_ERR(res);
+
+               if (res)
+                       dentry = res;
+       }
+
+       /* Only creates */
+       if (!(flags & O_CREAT) || d_really_is_positive(dentry))
+               return finish_no_open(file, res);
+
+       err = vboxsf_dir_create(parent, dentry, mode, false, flags & O_EXCL, &handle);
+       if (err)
+               goto out;
+
+       sf_handle = vboxsf_create_sf_handle(d_inode(dentry), handle, SHFL_CF_ACCESS_READWRITE);
+       if (IS_ERR(sf_handle)) {
+               vboxsf_close(sbi->root, handle);
+               err = PTR_ERR(sf_handle);
+               goto out;
+       }
+
+       err = finish_open(file, dentry, generic_file_open);
+       if (err) {
+               /* This also closes the handle passed to vboxsf_create_sf_handle() */
+               vboxsf_release_sf_handle(d_inode(dentry), sf_handle);
+               goto out;
+       }
+
+       file->private_data = sf_handle;
+       file->f_mode |= FMODE_CREATED;
+out:
+       dput(res);
+       return err;
 }
 
 static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry)
@@ -422,6 +475,7 @@ const struct inode_operations vboxsf_dir_iops = {
        .lookup  = vboxsf_dir_lookup,
        .create  = vboxsf_dir_mkfile,
        .mkdir   = vboxsf_dir_mkdir,
+       .atomic_open = vboxsf_dir_atomic_open,
        .rmdir   = vboxsf_dir_unlink,
        .unlink  = vboxsf_dir_unlink,
        .rename  = vboxsf_dir_rename,
index c4ab599..864c2fa 100644 (file)
@@ -20,17 +20,39 @@ struct vboxsf_handle {
        struct list_head head;
 };
 
-static int vboxsf_file_open(struct inode *inode, struct file *file)
+struct vboxsf_handle *vboxsf_create_sf_handle(struct inode *inode,
+                                             u64 handle, u32 access_flags)
 {
        struct vboxsf_inode *sf_i = VBOXSF_I(inode);
-       struct shfl_createparms params = {};
        struct vboxsf_handle *sf_handle;
-       u32 access_flags = 0;
-       int err;
 
        sf_handle = kmalloc(sizeof(*sf_handle), GFP_KERNEL);
        if (!sf_handle)
-               return -ENOMEM;
+               return ERR_PTR(-ENOMEM);
+
+       /* the host may have given us different attr than requested */
+       sf_i->force_restat = 1;
+
+       /* init our handle struct and add it to the inode's handles list */
+       sf_handle->handle = handle;
+       sf_handle->root = VBOXSF_SBI(inode->i_sb)->root;
+       sf_handle->access_flags = access_flags;
+       kref_init(&sf_handle->refcount);
+
+       mutex_lock(&sf_i->handle_list_mutex);
+       list_add(&sf_handle->head, &sf_i->handle_list);
+       mutex_unlock(&sf_i->handle_list_mutex);
+
+       return sf_handle;
+}
+
+static int vboxsf_file_open(struct inode *inode, struct file *file)
+{
+       struct vboxsf_sbi *sbi = VBOXSF_SBI(inode->i_sb);
+       struct shfl_createparms params = {};
+       struct vboxsf_handle *sf_handle;
+       u32 access_flags = 0;
+       int err;
 
        /*
         * We check the value of params.handle afterwards to find out if
@@ -83,23 +105,14 @@ static int vboxsf_file_open(struct inode *inode, struct file *file)
        err = vboxsf_create_at_dentry(file_dentry(file), &params);
        if (err == 0 && params.handle == SHFL_HANDLE_NIL)
                err = (params.result == SHFL_FILE_EXISTS) ? -EEXIST : -ENOENT;
-       if (err) {
-               kfree(sf_handle);
+       if (err)
                return err;
-       }
-
-       /* the host may have given us different attr then requested */
-       sf_i->force_restat = 1;
 
-       /* init our handle struct and add it to the inode's handles list */
-       sf_handle->handle = params.handle;
-       sf_handle->root = VBOXSF_SBI(inode->i_sb)->root;
-       sf_handle->access_flags = access_flags;
-       kref_init(&sf_handle->refcount);
-
-       mutex_lock(&sf_i->handle_list_mutex);
-       list_add(&sf_handle->head, &sf_i->handle_list);
-       mutex_unlock(&sf_i->handle_list_mutex);
+       sf_handle = vboxsf_create_sf_handle(inode, params.handle, access_flags);
+       if (IS_ERR(sf_handle)) {
+               vboxsf_close(sbi->root, params.handle);
+               return PTR_ERR(sf_handle);
+       }
 
        file->private_data = sf_handle;
        return 0;
@@ -114,22 +127,26 @@ static void vboxsf_handle_release(struct kref *refcount)
        kfree(sf_handle);
 }
 
-static int vboxsf_file_release(struct inode *inode, struct file *file)
+void vboxsf_release_sf_handle(struct inode *inode, struct vboxsf_handle *sf_handle)
 {
        struct vboxsf_inode *sf_i = VBOXSF_I(inode);
-       struct vboxsf_handle *sf_handle = file->private_data;
 
+       mutex_lock(&sf_i->handle_list_mutex);
+       list_del(&sf_handle->head);
+       mutex_unlock(&sf_i->handle_list_mutex);
+
+       kref_put(&sf_handle->refcount, vboxsf_handle_release);
+}
+
+static int vboxsf_file_release(struct inode *inode, struct file *file)
+{
        /*
         * When a file is closed on our (the guest) side, we want any subsequent
         * accesses done on the host side to see all changes done from our side.
         */
        filemap_write_and_wait(inode->i_mapping);
 
-       mutex_lock(&sf_i->handle_list_mutex);
-       list_del(&sf_handle->head);
-       mutex_unlock(&sf_i->handle_list_mutex);
-
-       kref_put(&sf_handle->refcount, vboxsf_handle_release);
+       vboxsf_release_sf_handle(inode, file->private_data);
        return 0;
 }
 
index 6a7a9ce..9047bef 100644 (file)
@@ -18,6 +18,8 @@
 #define VBOXSF_SBI(sb) ((struct vboxsf_sbi *)(sb)->s_fs_info)
 #define VBOXSF_I(i)    container_of(i, struct vboxsf_inode, vfs_inode)
 
+struct vboxsf_handle;
+
 struct vboxsf_options {
        unsigned long ttl;
        kuid_t uid;
@@ -80,6 +82,11 @@ extern const struct file_operations vboxsf_reg_fops;
 extern const struct address_space_operations vboxsf_reg_aops;
 extern const struct dentry_operations vboxsf_dentry_ops;
 
+/* from file.c */
+struct vboxsf_handle *vboxsf_create_sf_handle(struct inode *inode,
+                                             u64 handle, u32 access_flags);
+void vboxsf_release_sf_handle(struct inode *inode, struct vboxsf_handle *sf_handle);
+
 /* from utils.c */
 struct inode *vboxsf_new_inode(struct super_block *sb);
 int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
index d9d7d51..191d517 100644 (file)
@@ -483,7 +483,7 @@ xfs_attr_set_iter(
                if (error)
                        return error;
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_RM_LBLK:
                /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
                dac->dela_state = XFS_DAS_RM_LBLK;
@@ -496,7 +496,7 @@ xfs_attr_set_iter(
                        return -EAGAIN;
                }
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_RD_LEAF:
                /*
                 * This is the last step for leaf format. Read the block with
@@ -528,7 +528,7 @@ xfs_attr_set_iter(
                                return error;
                }
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_ALLOC_NODE:
                /*
                 * If there was an out-of-line value, allocate the blocks we
@@ -590,7 +590,7 @@ xfs_attr_set_iter(
                if (error)
                        return error;
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_RM_NBLK:
                /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
                dac->dela_state = XFS_DAS_RM_NBLK;
@@ -603,7 +603,7 @@ xfs_attr_set_iter(
                        return -EAGAIN;
                }
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_CLR_FLAG:
                /*
                 * The last state for node format. Look up the old attr and
@@ -1406,7 +1406,7 @@ xfs_attr_remove_iter(
                        state = dac->da_state;
                }
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_RMTBLK:
                dac->dela_state = XFS_DAS_RMTBLK;
 
@@ -1441,7 +1441,7 @@ xfs_attr_remove_iter(
                        return -EAGAIN;
                }
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_RM_NAME:
                /*
                 * If we came here fresh from a transaction roll, reattach all
@@ -1469,7 +1469,7 @@ xfs_attr_remove_iter(
                        return -EAGAIN;
                }
 
-               /* fallthrough */
+               fallthrough;
        case XFS_DAS_RM_SHRINK:
                /*
                 * If the result is small enough, push it all into the inode.
index f309fc1..e8e2b03 100644 (file)
@@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor {
        void *tailcall_target;
        void *tailcall_bypass;
        void *bypass_addr;
+       void *aux;
        union {
                struct {
                        struct bpf_map *map;
index 29dbb60..232daae 100644 (file)
@@ -758,6 +758,16 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
                              enum ethtool_link_mode_bit_indices link_mode);
 
 /**
+ * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller
+ *                           is responsible to free memory of vclock_index
+ * @dev: pointer to net_device structure
+ * @vclock_index: pointer to pointer of vclock index
+ *
+ * Return number of phc vclocks
+ */
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index);
+
+/**
  * ethtool_sprintf - Write formatted string to ethtool string data
  * @data: Pointer to start of string to update
  * @fmt: Format of string to write
index 37e1e8f..e2bc163 100644 (file)
@@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 extern int generic_parse_monolithic(struct fs_context *fc, void *data);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
+extern int vfs_parse_fs_param_source(struct fs_context *fc,
+                                    struct fs_parameter *param);
 
 /*
  * sget() wrappers to be called from the ->get_tree() op.
index 5310e21..dd874a1 100644 (file)
@@ -3,6 +3,7 @@
 #define _LINUX_KASAN_H
 
 #include <linux/bug.h>
+#include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
 
index acee44b..0f06c22 100644 (file)
 #define MARVELL_PHY_ID_88E1545         0x01410ea0
 #define MARVELL_PHY_ID_88E1548P                0x01410ec0
 #define MARVELL_PHY_ID_88E3016         0x01410e60
+#define MARVELL_PHY_ID_88X3310         0x002b09a0
 #define MARVELL_PHY_ID_88E2110         0x002b09b0
 #define MARVELL_PHY_ID_88X2222         0x01410f10
 
-/* PHY IDs and mask for Alaska 10G PHYs */
-#define MARVELL_PHY_ID_88X33X0_MASK    0xfffffff8
-#define MARVELL_PHY_ID_88X3310         0x002b09a0
-#define MARVELL_PHY_ID_88X3340         0x002b09a8
-
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0
 
index 9b7b7cd..23dadf7 100644 (file)
@@ -51,7 +51,6 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count);
-extern void copy_huge_page(struct page *dst, struct page *src);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,10 +76,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 {
        return -ENOSYS;
 }
-
-static inline void copy_huge_page(struct page *dst, struct page *src)
-{
-}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
index 57453db..7ca22e6 100644 (file)
@@ -906,6 +906,7 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
+void copy_huge_page(struct page *dst, struct page *src);
 
 /*
  * Compound pages have a destructor function.  Provide a
index aba237c..71fac92 100644 (file)
 #include <linux/device.h>
 #include <linux/pps_kernel.h>
 #include <linux/ptp_clock.h>
+#include <linux/timecounter.h>
+#include <linux/skbuff.h>
 
+#define PTP_CLOCK_NAME_LEN     32
 /**
  * struct ptp_clock_request - request PTP clock event
  *
@@ -134,7 +137,7 @@ struct ptp_system_timestamp {
 
 struct ptp_clock_info {
        struct module *owner;
-       char name[16];
+       char name[PTP_CLOCK_NAME_LEN];
        s32 max_adj;
        int n_alarm;
        int n_ext_ts;
@@ -304,6 +307,27 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay);
  */
 void ptp_cancel_worker_sync(struct ptp_clock *ptp);
 
+/**
+ * ptp_get_vclocks_index() - get all vclocks index on pclock, and
+ *                           caller is responsible to free memory
+ *                           of vclock_index
+ *
+ * @pclock_index: phc index of ptp pclock.
+ * @vclock_index: pointer to pointer of vclock index.
+ *
+ * return number of vclocks.
+ */
+int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
+
+/**
+ * ptp_convert_timestamp() - convert timestamp to a ptp vclock time
+ *
+ * @hwtstamps:    skb_shared_hwtstamps structure pointer
+ * @vclock_index: phc index of ptp vclock.
+ */
+void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+                          int vclock_index);
+
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
                                                   struct device *parent)
@@ -323,6 +347,11 @@ static inline int ptp_schedule_worker(struct ptp_clock *ptp,
 { return -EOPNOTSUPP; }
 static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
 { }
+static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
+{ return 0; }
+static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+                                        int vclock_index)
+{ }
 
 #endif
 
index 83fb861..c976cc6 100644 (file)
@@ -291,7 +291,9 @@ static inline int page_referenced(struct page *page, int is_locked,
        return 0;
 }
 
-#define try_to_unmap(page, refs) false
+static inline void try_to_unmap(struct page *page, enum ttu_flags flags)
+{
+}
 
 static inline int page_mkclean(struct page *page)
 {
index d5ae621..a6f03b3 100644 (file)
@@ -115,7 +115,9 @@ struct stmmac_axi {
 
 #define EST_GCL                1024
 struct stmmac_est {
+       struct mutex lock;
        int enable;
+       u32 btr_reserve[2];
        u32 btr_offset[2];
        u32 btr[2];
        u32 ctr[2];
index 143568d..4b57bbb 100644 (file)
@@ -338,7 +338,7 @@ do {                                                                             \
        FP_SET_EXCEPTION(FP_EX_INVALID | FP_EX_INVALID_ISI);                 \
        break;                                                               \
       }                                                                             \
-    /* FALLTHRU */                                                          \
+    fallthrough;                                                            \
                                                                             \
   case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NORMAL):                           \
   case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_ZERO):                                     \
index 1533573..625d9c7 100644 (file)
@@ -201,6 +201,11 @@ struct bond_up_slave {
  */
 #define BOND_LINK_NOCHANGE -1
 
+struct bond_ipsec {
+       struct list_head list;
+       struct xfrm_state *xs;
+};
+
 /*
  * Here are the locking policies for the two bonding locks:
  * Get rcu_read_lock when reading or RTNL when writing slave list.
@@ -249,7 +254,9 @@ struct bonding {
 #endif /* CONFIG_DEBUG_FS */
        struct rtnl_link_stats64 bond_stats;
 #ifdef CONFIG_XFRM_OFFLOAD
-       struct xfrm_state *xs;
+       struct list_head ipsec_list;
+       /* protecting ipsec_list */
+       spinlock_t ipsec_lock;
 #endif /* CONFIG_XFRM_OFFLOAD */
 };
 
index 73af4a6..40296ed 100644 (file)
@@ -38,7 +38,7 @@ static inline bool net_busy_loop_on(void)
 
 static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-       return sk->sk_ll_usec && !signal_pending(current);
+       return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current);
 }
 
 bool sk_busy_loop_end(void *p, unsigned long start_time);
diff --git a/include/net/caif/caif_hsi.h b/include/net/caif/caif_hsi.h
deleted file mode 100644 (file)
index 552cf68..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author:  Daniel Martensson / daniel.martensson@stericsson.com
- *         Dmitry.Tarnyagin  / dmitry.tarnyagin@stericsson.com
- */
-
-#ifndef CAIF_HSI_H_
-#define CAIF_HSI_H_
-
-#include <net/caif/caif_layer.h>
-#include <net/caif/caif_device.h>
-#include <linux/atomic.h>
-
-/*
- * Maximum number of CAIF frames that can reside in the same HSI frame.
- */
-#define CFHSI_MAX_PKTS 15
-
-/*
- * Maximum number of bytes used for the frame that can be embedded in the
- * HSI descriptor.
- */
-#define CFHSI_MAX_EMB_FRM_SZ 96
-
-/*
- * Decides if HSI buffers should be prefilled with 0xFF pattern for easier
- * debugging. Both TX and RX buffers will be filled before the transfer.
- */
-#define CFHSI_DBG_PREFILL              0
-
-/* Structure describing a HSI packet descriptor. */
-#pragma pack(1) /* Byte alignment. */
-struct cfhsi_desc {
-       u8 header;
-       u8 offset;
-       u16 cffrm_len[CFHSI_MAX_PKTS];
-       u8 emb_frm[CFHSI_MAX_EMB_FRM_SZ];
-};
-#pragma pack() /* Default alignment. */
-
-/* Size of the complete HSI packet descriptor. */
-#define CFHSI_DESC_SZ (sizeof(struct cfhsi_desc))
-
-/*
- * Size of the complete HSI packet descriptor excluding the optional embedded
- * CAIF frame.
- */
-#define CFHSI_DESC_SHORT_SZ (CFHSI_DESC_SZ - CFHSI_MAX_EMB_FRM_SZ)
-
-/*
- * Maximum bytes transferred in one transfer.
- */
-#define CFHSI_MAX_CAIF_FRAME_SZ 4096
-
-#define CFHSI_MAX_PAYLOAD_SZ (CFHSI_MAX_PKTS * CFHSI_MAX_CAIF_FRAME_SZ)
-
-/* Size of the complete HSI TX buffer. */
-#define CFHSI_BUF_SZ_TX (CFHSI_DESC_SZ + CFHSI_MAX_PAYLOAD_SZ)
-
-/* Size of the complete HSI RX buffer. */
-#define CFHSI_BUF_SZ_RX ((2 * CFHSI_DESC_SZ) + CFHSI_MAX_PAYLOAD_SZ)
-
-/* Bitmasks for the HSI descriptor. */
-#define CFHSI_PIGGY_DESC               (0x01 << 7)
-
-#define CFHSI_TX_STATE_IDLE                    0
-#define CFHSI_TX_STATE_XFER                    1
-
-#define CFHSI_RX_STATE_DESC                    0
-#define CFHSI_RX_STATE_PAYLOAD                 1
-
-/* Bitmasks for power management. */
-#define CFHSI_WAKE_UP                          0
-#define CFHSI_WAKE_UP_ACK                      1
-#define CFHSI_WAKE_DOWN_ACK                    2
-#define CFHSI_AWAKE                            3
-#define CFHSI_WAKELOCK_HELD                    4
-#define CFHSI_SHUTDOWN                         5
-#define CFHSI_FLUSH_FIFO                       6
-
-#ifndef CFHSI_INACTIVITY_TOUT
-#define CFHSI_INACTIVITY_TOUT                  (1 * HZ)
-#endif /* CFHSI_INACTIVITY_TOUT */
-
-#ifndef CFHSI_WAKE_TOUT
-#define CFHSI_WAKE_TOUT                        (3 * HZ)
-#endif /* CFHSI_WAKE_TOUT */
-
-#ifndef CFHSI_MAX_RX_RETRIES
-#define CFHSI_MAX_RX_RETRIES           (10 * HZ)
-#endif
-
-/* Structure implemented by the CAIF HSI driver. */
-struct cfhsi_cb_ops {
-       void (*tx_done_cb) (struct cfhsi_cb_ops *drv);
-       void (*rx_done_cb) (struct cfhsi_cb_ops *drv);
-       void (*wake_up_cb) (struct cfhsi_cb_ops *drv);
-       void (*wake_down_cb) (struct cfhsi_cb_ops *drv);
-};
-
-/* Structure implemented by HSI device. */
-struct cfhsi_ops {
-       int (*cfhsi_up) (struct cfhsi_ops *dev);
-       int (*cfhsi_down) (struct cfhsi_ops *dev);
-       int (*cfhsi_tx) (u8 *ptr, int len, struct cfhsi_ops *dev);
-       int (*cfhsi_rx) (u8 *ptr, int len, struct cfhsi_ops *dev);
-       int (*cfhsi_wake_up) (struct cfhsi_ops *dev);
-       int (*cfhsi_wake_down) (struct cfhsi_ops *dev);
-       int (*cfhsi_get_peer_wake) (struct cfhsi_ops *dev, bool *status);
-       int (*cfhsi_fifo_occupancy) (struct cfhsi_ops *dev, size_t *occupancy);
-       int (*cfhsi_rx_cancel)(struct cfhsi_ops *dev);
-       struct cfhsi_cb_ops *cb_ops;
-};
-
-/* Structure holds status of received CAIF frames processing */
-struct cfhsi_rx_state {
-       int state;
-       int nfrms;
-       int pld_len;
-       int retries;
-       bool piggy_desc;
-};
-
-/* Priority mapping */
-enum {
-       CFHSI_PRIO_CTL = 0,
-       CFHSI_PRIO_VI,
-       CFHSI_PRIO_VO,
-       CFHSI_PRIO_BEBK,
-       CFHSI_PRIO_LAST,
-};
-
-struct cfhsi_config {
-       u32 inactivity_timeout;
-       u32 aggregation_timeout;
-       u32 head_align;
-       u32 tail_align;
-       u32 q_high_mark;
-       u32 q_low_mark;
-};
-
-/* Structure implemented by CAIF HSI drivers. */
-struct cfhsi {
-       struct caif_dev_common cfdev;
-       struct net_device *ndev;
-       struct platform_device *pdev;
-       struct sk_buff_head qhead[CFHSI_PRIO_LAST];
-       struct cfhsi_cb_ops cb_ops;
-       struct cfhsi_ops *ops;
-       int tx_state;
-       struct cfhsi_rx_state rx_state;
-       struct cfhsi_config cfg;
-       int rx_len;
-       u8 *rx_ptr;
-       u8 *tx_buf;
-       u8 *rx_buf;
-       u8 *rx_flip_buf;
-       spinlock_t lock;
-       int flow_off_sent;
-       struct list_head list;
-       struct work_struct wake_up_work;
-       struct work_struct wake_down_work;
-       struct work_struct out_of_sync_work;
-       struct workqueue_struct *wq;
-       wait_queue_head_t wake_up_wait;
-       wait_queue_head_t wake_down_wait;
-       wait_queue_head_t flush_fifo_wait;
-       struct timer_list inactivity_timer;
-       struct timer_list rx_slowpath_timer;
-
-       /* TX aggregation */
-       int aggregation_len;
-       struct timer_list aggregation_timer;
-
-       unsigned long bits;
-};
-extern struct platform_driver cfhsi_driver;
-
-/**
- * enum ifla_caif_hsi - CAIF HSI NetlinkRT parameters.
- * @IFLA_CAIF_HSI_INACTIVITY_TOUT: Inactivity timeout before
- *                     taking the HSI wakeline down, in milliseconds.
- * When using RT Netlink to create, destroy or configure a CAIF HSI interface,
- * enum ifla_caif_hsi is used to specify the configuration attributes.
- */
-enum ifla_caif_hsi {
-       __IFLA_CAIF_HSI_UNSPEC,
-       __IFLA_CAIF_HSI_INACTIVITY_TOUT,
-       __IFLA_CAIF_HSI_AGGREGATION_TOUT,
-       __IFLA_CAIF_HSI_HEAD_ALIGN,
-       __IFLA_CAIF_HSI_TAIL_ALIGN,
-       __IFLA_CAIF_HSI_QHIGH_WATERMARK,
-       __IFLA_CAIF_HSI_QLOW_WATERMARK,
-       __IFLA_CAIF_HSI_MAX
-};
-
-struct cfhsi_ops *cfhsi_get_ops(void);
-
-#endif         /* CAIF_HSI_H_ */
index 56cb3c3..14efa0d 100644 (file)
@@ -45,7 +45,9 @@ skb_tunnel_info(const struct sk_buff *skb)
                return &md_dst->u.tun_info;
 
        dst = skb_dst(skb);
-       if (dst && dst->lwtstate)
+       if (dst && dst->lwtstate &&
+           (dst->lwtstate->type == LWTUNNEL_ENCAP_IP ||
+            dst->lwtstate->type == LWTUNNEL_ENCAP_IP6))
                return lwt_tun_info(dst->lwtstate);
 
        return NULL;
index f14149d..625a38c 100644 (file)
@@ -263,7 +263,7 @@ static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *));
 
-static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+static inline unsigned int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
        int mtu;
 
index cb580b0..8b5af68 100644 (file)
@@ -105,7 +105,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
                               unsigned int *size, unsigned int remaining,
                               struct mptcp_out_options *opts);
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
 
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
                         struct mptcp_out_options *opts);
@@ -227,9 +227,10 @@ static inline bool mptcp_established_options(struct sock *sk,
        return false;
 }
 
-static inline void mptcp_incoming_options(struct sock *sk,
+static inline bool mptcp_incoming_options(struct sock *sk,
                                          struct sk_buff *skb)
 {
+       return true;
 }
 
 static inline void mptcp_skb_ext_move(struct sk_buff *to,
index 09f2efe..13807ea 100644 (file)
@@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net);
 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
 
 void nf_conntrack_proto_pernet_init(struct net *net);
-void nf_conntrack_proto_pernet_fini(struct net *net);
 
 int nf_conntrack_proto_init(void);
 void nf_conntrack_proto_fini(void);
index c3094b8..37e5300 100644 (file)
@@ -27,6 +27,7 @@ struct nf_tcp_net {
        u8 tcp_loose;
        u8 tcp_be_liberal;
        u8 tcp_max_retrans;
+       u8 tcp_ignore_invalid_rst;
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
        unsigned int offload_timeout;
        unsigned int offload_pickup;
index 265fffa..5859e0a 100644 (file)
@@ -360,8 +360,7 @@ enum {
 #define SCTP_SCOPE_POLICY_MAX  SCTP_SCOPE_POLICY_LINK
 
 /* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>,
- * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24,
- * 192.88.99.0/24.
+ * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24.
  * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP
  * addresses.
  */
@@ -369,7 +368,6 @@ enum {
        ((htonl(INADDR_BROADCAST) == a) ||  \
         ipv4_is_multicast(a) ||            \
         ipv4_is_zeronet(a) ||              \
-        ipv4_is_test_198(a) ||             \
         ipv4_is_anycast_6to4(a))
 
 /* Flags used for the bind address copy functions.  */
index 8bdd800..f23cb25 100644 (file)
@@ -316,7 +316,9 @@ struct bpf_local_storage;
   *    @sk_timer: sock cleanup timer
   *    @sk_stamp: time stamp of last packet received
   *    @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
-  *    @sk_tsflags: SO_TIMESTAMPING socket options
+  *    @sk_tsflags: SO_TIMESTAMPING flags
+  *    @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
+  *                  for timestamping
   *    @sk_tskey: counter to disambiguate concurrent tstamp requests
   *    @sk_zckey: counter to order MSG_ZEROCOPY notifications
   *    @sk_socket: Identd and reporting IO signals
@@ -493,6 +495,7 @@ struct sock {
        seqlock_t               sk_stamp_seq;
 #endif
        u16                     sk_tsflags;
+       int                     sk_bind_phc;
        u8                      sk_shutdown;
        u32                     sk_tskey;
        atomic_t                sk_zckey;
@@ -2755,7 +2758,8 @@ void sock_def_readable(struct sock *sk);
 
 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
 void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
-int sock_set_timestamping(struct sock *sk, int optname, int val);
+int sock_set_timestamping(struct sock *sk, int optname,
+                         struct so_timestamping timestamping);
 
 void sock_enable_timestamps(struct sock *sk);
 void sock_no_linger(struct sock *sk);
index e668f1b..17df9b0 100644 (file)
@@ -686,6 +686,10 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
+       /* mptcp hooks are only on the slow path */
+       if (sk_is_mptcp((struct sock *)tp))
+               return;
+
        tp->pred_flags = htonl((tp->tcp_header_len << 26) |
                               ntohl(TCP_FLAG_ACK) |
                               snd_wnd);
index c7135c9..b3b9371 100644 (file)
@@ -46,6 +46,7 @@ enum {
        ETHTOOL_MSG_FEC_SET,
        ETHTOOL_MSG_MODULE_EEPROM_GET,
        ETHTOOL_MSG_STATS_GET,
+       ETHTOOL_MSG_PHC_VCLOCKS_GET,
 
        /* add new constants above here */
        __ETHTOOL_MSG_USER_CNT,
@@ -88,6 +89,7 @@ enum {
        ETHTOOL_MSG_FEC_NTF,
        ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
        ETHTOOL_MSG_STATS_GET_REPLY,
+       ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
 
        /* add new constants above here */
        __ETHTOOL_MSG_KERNEL_CNT,
@@ -440,6 +442,19 @@ enum {
        ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1)
 };
 
+/* PHC VCLOCKS */
+
+enum {
+       ETHTOOL_A_PHC_VCLOCKS_UNSPEC,
+       ETHTOOL_A_PHC_VCLOCKS_HEADER,                   /* nest - _A_HEADER_* */
+       ETHTOOL_A_PHC_VCLOCKS_NUM,                      /* u32 */
+       ETHTOOL_A_PHC_VCLOCKS_INDEX,                    /* array, s32 */
+
+       /* add new constants above here */
+       __ETHTOOL_A_PHC_VCLOCKS_CNT,
+       ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1)
+};
+
 /* CABLE TEST */
 
 enum {
index 7ed0b3d..fcc61c7 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/types.h>
 #include <linux/socket.h>   /* for SO_TIMESTAMPING */
 
-/* SO_TIMESTAMPING gets an integer bit field comprised of these values */
+/* SO_TIMESTAMPING flags */
 enum {
        SOF_TIMESTAMPING_TX_HARDWARE = (1<<0),
        SOF_TIMESTAMPING_TX_SOFTWARE = (1<<1),
@@ -30,8 +30,9 @@ enum {
        SOF_TIMESTAMPING_OPT_STATS = (1<<12),
        SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
        SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14),
+       SOF_TIMESTAMPING_BIND_PHC = (1 << 15),
 
-       SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW,
+       SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC,
        SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
                                 SOF_TIMESTAMPING_LAST
 };
@@ -47,6 +48,18 @@ enum {
                                         SOF_TIMESTAMPING_TX_ACK)
 
 /**
+ * struct so_timestamping - SO_TIMESTAMPING parameter
+ *
+ * @flags:     SO_TIMESTAMPING flags
+ * @bind_phc:  Index of PTP virtual clock bound to sock. This is available
+ *             if flag SOF_TIMESTAMPING_BIND_PHC is set.
+ */
+struct so_timestamping {
+       int flags;
+       int bind_phc;
+};
+
+/**
  * struct hwtstamp_config - %SIOCGHWTSTAMP and %SIOCSHWTSTAMP parameter
  *
  * @flags:     no flags defined right now, must be zero for %SIOCSHWTSTAMP
index 45c8d3b..0af9c11 100644 (file)
@@ -61,7 +61,7 @@ enum nfulnl_attr_type {
        NFULA_HWTYPE,                   /* hardware type */
        NFULA_HWHEADER,                 /* hardware header */
        NFULA_HWLEN,                    /* hardware header length */
-       NFULA_CT,                       /* nf_conntrack_netlink.h */
+       NFULA_CT,                       /* nfnetlink_conntrack.h */
        NFULA_CT_INFO,                  /* enum ip_conntrack_info */
        NFULA_VLAN,                     /* nested attribute: packet vlan info */
        NFULA_L2HDR,                    /* full L2 header */
index bcb2cb5..aed90c4 100644 (file)
@@ -51,11 +51,11 @@ enum nfqnl_attr_type {
        NFQA_IFINDEX_PHYSOUTDEV,        /* __u32 ifindex */
        NFQA_HWADDR,                    /* nfqnl_msg_packet_hw */
        NFQA_PAYLOAD,                   /* opaque data payload */
-       NFQA_CT,                        /* nf_conntrack_netlink.h */
+       NFQA_CT,                        /* nfnetlink_conntrack.h */
        NFQA_CT_INFO,                   /* enum ip_conntrack_info */
        NFQA_CAP_LEN,                   /* __u32 length of captured packet */
        NFQA_SKB_INFO,                  /* __u32 skb meta information */
-       NFQA_EXP,                       /* nf_conntrack_netlink.h */
+       NFQA_EXP,                       /* nfnetlink_conntrack.h */
        NFQA_UID,                       /* __u32 sk uid */
        NFQA_GID,                       /* __u32 sk gid */
        NFQA_SECCTX,                    /* security context string */
index 034ad93..9b15774 100644 (file)
@@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
        if (aux->dst_trampoline)
                bpf_trampoline_put(aux->dst_trampoline);
-       for (i = 0; i < aux->func_cnt; i++)
+       for (i = 0; i < aux->func_cnt; i++) {
+               /* We can just unlink the subprog poke descriptor table as
+                * it was originally linked to the main program and is also
+                * released along with it.
+                */
+               aux->func[i]->aux->poke_tab = NULL;
                bpf_jit_free(aux->func[i]);
+       }
        if (aux->func_cnt) {
                kfree(aux->func);
                bpf_prog_unlock_free(aux->prog);
index 2546daf..fdc2089 100644 (file)
@@ -558,7 +558,8 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
 
        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                for (i = 0; i < map->max_entries; i++) {
-                       dst = READ_ONCE(dtab->netdev_map[i]);
+                       dst = rcu_dereference_check(dtab->netdev_map[i],
+                                                   rcu_read_lock_bh_held());
                        if (!is_valid_dst(dst, xdp, exclude_ifindex))
                                continue;
 
@@ -654,7 +655,8 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
 
        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                for (i = 0; i < map->max_entries; i++) {
-                       dst = READ_ONCE(dtab->netdev_map[i]);
+                       dst = rcu_dereference_check(dtab->netdev_map[i],
+                                                   rcu_read_lock_bh_held());
                        if (!dst || dst->dev->ifindex == exclude_ifindex)
                                continue;
 
index be38bb9..42a4063 100644 (file)
@@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
                        goto out_free;
                func[i]->is_func = 1;
                func[i]->aux->func_idx = i;
-               /* the btf and func_info will be freed only at prog->aux */
+               /* Below members will be freed only at prog->aux */
                func[i]->aux->btf = prog->aux->btf;
                func[i]->aux->func_info = prog->aux->func_info;
+               func[i]->aux->poke_tab = prog->aux->poke_tab;
+               func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
 
                for (j = 0; j < prog->aux->size_poke_tab; j++) {
-                       u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
-                       int ret;
+                       struct bpf_jit_poke_descriptor *poke;
 
-                       if (!(insn_idx >= subprog_start &&
-                             insn_idx <= subprog_end))
-                               continue;
-
-                       ret = bpf_jit_add_poke_descriptor(func[i],
-                                                         &prog->aux->poke_tab[j]);
-                       if (ret < 0) {
-                               verbose(env, "adding tail call poke descriptor failed\n");
-                               goto out_free;
-                       }
-
-                       func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
-
-                       map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
-                       ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
-                       if (ret < 0) {
-                               verbose(env, "tracking tail call prog failed\n");
-                               goto out_free;
-                       }
+                       poke = &prog->aux->poke_tab[j];
+                       if (poke->insn_idx < subprog_end &&
+                           poke->insn_idx >= subprog_start)
+                               poke->aux = func[i]->aux;
                }
 
                /* Use bpf_prog_F_tag to indicate functions in stack traces.
@@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
                cond_resched();
        }
 
-       /* Untrack main program's aux structs so that during map_poke_run()
-        * we will not stumble upon the unfilled poke descriptors; each
-        * of the main program's poke descs got distributed across subprogs
-        * and got tracked onto map, so we are sure that none of them will
-        * be missed after the operation below
-        */
-       for (i = 0; i < prog->aux->size_poke_tab; i++) {
-               map_ptr = prog->aux->poke_tab[i].tail_call.map;
-
-               map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
-       }
-
        /* at this point all bpf functions were successfully JITed
         * now populate all bpf_calls with correct addresses and
         * run last pass of JIT
@@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env)
        bpf_prog_jit_attempt_done(prog);
        return 0;
 out_free:
+       /* We failed JIT'ing, so at this point we need to unregister poke
+        * descriptors from subprogs, so that the kernel no longer attempts
+        * to patch it as we're freeing the subprog JIT memory.
+        */
+       for (i = 0; i < prog->aux->size_poke_tab; i++) {
+               map_ptr = prog->aux->poke_tab[i].tail_call.map;
+               map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+       }
+       /* At this point we're guaranteed that poke descriptors are not
+        * live anymore. We can just unlink its descriptor table as it's
+        * released with the main prog.
+        */
        for (i = 0; i < env->subprog_cnt; i++) {
                if (!func[i])
                        continue;
-
-               for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
-                       map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
-                       map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
-               }
+               func[i]->aux->poke_tab = NULL;
                bpf_jit_free(func[i]);
        }
        kfree(func);
index ee93b6e..8d6bf56 100644 (file)
@@ -911,13 +911,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
 
        opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
        if (opt == -ENOPARAM) {
-               if (strcmp(param->key, "source") == 0) {
-                       if (fc->source)
-                               return invalf(fc, "Multiple sources not supported");
-                       fc->source = param->string;
-                       param->string = NULL;
-                       return 0;
-               }
+               int ret;
+
+               ret = vfs_parse_fs_param_source(fc, param);
+               if (ret != -ENOPARAM)
+                       return ret;
                for_each_subsys(ss, i) {
                        if (strcmp(param->key, ss->legacy_name))
                                continue;
index 8372897..b6f28fa 100644 (file)
@@ -1045,8 +1045,8 @@ int gdb_serial_stub(struct kgdb_state *ks)
                                gdb_cmd_detachkill(ks);
                                return DBG_PASS_EVENT;
                        }
-#endif
                        fallthrough;
+#endif
                case 'C': /* Exception passing */
                        tmp = gdb_cmd_exception_pass(ks);
                        if (tmp > 0)
index 313d454..d998a76 100644 (file)
@@ -487,13 +487,13 @@ ref_scale_reader(void *arg)
        s64 duration;
 
        VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
-       set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+       WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)));
        set_user_nice(current, MAX_NICE);
        atomic_inc(&n_init);
        if (holdoff)
                schedule_timeout_interruptible(holdoff * HZ);
 repeat:
-       VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+       VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id());
 
        // Wait for signal that this reader can start.
        wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -503,7 +503,7 @@ repeat:
                goto end;
 
        // Make sure that the CPU is affinitized appropriately during testing.
-       WARN_ON_ONCE(smp_processor_id() != me);
+       WARN_ON_ONCE(raw_smp_processor_id() != me);
 
        WRITE_ONCE(rt->start_reader, 0);
        if (!atomic_dec_return(&n_started))
index 03a118d..8536c55 100644 (file)
@@ -953,10 +953,9 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
                in_qs = likely(!t->trc_reader_nesting);
        }
 
-       // Mark as checked.  Because this is called from the grace-period
-       // kthread, also remove the task from the holdout list.
+       // Mark as checked so that the grace-period kthread will
+       // remove it from the holdout list.
        t->trc_reader_checked = true;
-       trc_del_holdout(t);
 
        if (in_qs)
                return true;  // Already in quiescent state, done!!!
@@ -983,7 +982,6 @@ static void trc_wait_for_one_reader(struct task_struct *t,
        // The current task had better be in a quiescent state.
        if (t == current) {
                t->trc_reader_checked = true;
-               trc_del_holdout(t);
                WARN_ON_ONCE(t->trc_reader_nesting);
                return;
        }
index 3f937b2..6c76988 100644 (file)
@@ -795,9 +795,9 @@ void show_rcu_gp_kthreads(void)
        jr = j - data_race(rcu_state.gp_req_activity);
        js = j - data_race(rcu_state.gp_start);
        jw = j - data_race(rcu_state.gp_wake_time);
-       pr_info("%s: wait state: %s(%d) ->state: %#lx ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
+       pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
                rcu_state.name, gp_state_getname(rcu_state.gp_state),
-               rcu_state.gp_state, t ? t->__state : 0x1ffffL, t ? t->rt_priority : 0xffU,
+               rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU,
                js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
                (long)data_race(rcu_state.gp_seq),
                (long)data_race(rcu_get_root()->gp_seq_needed),
index 2377cbb..29e8fc5 100644 (file)
@@ -405,15 +405,15 @@ static int scftorture_invoker(void *arg)
 
        VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu);
        cpu = scfp->cpu % nr_cpu_ids;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
+       WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(cpu)));
        set_user_nice(current, MAX_NICE);
        if (holdoff)
                schedule_timeout_interruptible(holdoff * HZ);
 
-       VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id());
+       VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, raw_smp_processor_id());
 
        // Make sure that the CPU is affinitized appropriately during testing.
-       curcpu = smp_processor_id();
+       curcpu = raw_smp_processor_id();
        WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
                  "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
                  __func__, scfp->cpu, curcpu, nr_cpu_ids);
index 8c55c47..c259842 100644 (file)
@@ -628,10 +628,8 @@ static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
 
        for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
                void *entry;
-               struct page *page;
 
                entry = xa_load(&dmirror->pt, pfn);
-               page = xa_untag_pointer(entry);
                if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
                        return -EPERM;
        }
index 924553a..dfc940d 100644 (file)
@@ -5440,8 +5440,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        continue;
                }
 
-               refs = min3(pages_per_huge_page(h) - pfn_offset,
-                           (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+               /* vaddr may not be aligned to PAGE_SIZE */
+               refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+                   (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
 
                if (pages || vmas)
                        record_subpages_vmas(mem_map_offset(page, pfn_offset),
index 98e3059..d739cdd 100644 (file)
@@ -9,6 +9,7 @@
 #ifdef CONFIG_KASAN_HW_TAGS
 
 #include <linux/static_key.h>
+#include "../slab.h"
 
 DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
 extern bool kasan_flag_async __ro_after_init;
@@ -387,6 +388,17 @@ static inline void kasan_unpoison(const void *addr, size_t size, bool init)
 
        if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
                return;
+       /*
+        * Explicitly initialize the memory with the precise object size to
+        * avoid overwriting the SLAB redzone. This disables initialization in
+        * the arch code and may thus lead to performance penalty. The penalty
+        * is accepted since SLAB redzones aren't enabled in production builds.
+        */
+       if (__slub_debug_enabled() &&
+           init && ((unsigned long)size & KASAN_GRANULE_MASK)) {
+               init = false;
+               memzero_explicit((void *)addr, size);
+       }
        size = round_up(size, KASAN_GRANULE_SIZE);
 
        hw_set_mem_tag_range((void *)addr, size, tag, init);
index 23cbd9d..34a9ad3 100644 (file)
@@ -537,54 +537,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 }
 
 /*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page.  We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
-                               int nr_pages)
-{
-       int i;
-       struct page *dst_base = dst;
-       struct page *src_base = src;
-
-       for (i = 0; i < nr_pages; ) {
-               cond_resched();
-               copy_highpage(dst, src);
-
-               i++;
-               dst = mem_map_next(dst, dst_base, i);
-               src = mem_map_next(src, src_base, i);
-       }
-}
-
-void copy_huge_page(struct page *dst, struct page *src)
-{
-       int i;
-       int nr_pages;
-
-       if (PageHuge(src)) {
-               /* hugetlbfs page */
-               struct hstate *h = page_hstate(src);
-               nr_pages = pages_per_huge_page(h);
-
-               if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
-                       __copy_gigantic_page(dst, src, nr_pages);
-                       return;
-               }
-       } else {
-               /* thp page */
-               BUG_ON(!PageTransHuge(src));
-               nr_pages = thp_nr_pages(src);
-       }
-
-       for (i = 0; i < nr_pages; i++) {
-               cond_resched();
-               copy_highpage(dst + i, src + i);
-       }
-}
-
-/*
  * Copy the page to its new location
  */
 void migrate_page_states(struct page *newpage, struct page *page)
index 3b97e17..3e97e68 100644 (file)
@@ -3820,7 +3820,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
-static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
        return __should_fail_alloc_page(gfp_mask, order);
 }
@@ -5221,9 +5221,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
        unsigned int alloc_flags = ALLOC_WMARK_LOW;
        int nr_populated = 0, nr_account = 0;
 
-       if (unlikely(nr_pages <= 0))
-               return 0;
-
        /*
         * Skip populated array elements to determine if any pages need
         * to be allocated before disabling IRQs.
@@ -5231,19 +5228,35 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
        while (page_array && nr_populated < nr_pages && page_array[nr_populated])
                nr_populated++;
 
+       /* No pages requested? */
+       if (unlikely(nr_pages <= 0))
+               goto out;
+
        /* Already populated array? */
        if (unlikely(page_array && nr_pages - nr_populated == 0))
-               return nr_populated;
+               goto out;
 
        /* Use the single page allocator for one page. */
        if (nr_pages - nr_populated == 1)
                goto failed;
 
+#ifdef CONFIG_PAGE_OWNER
+       /*
+        * PAGE_OWNER may recurse into the allocator to allocate space to
+        * save the stack with pagesets.lock held. Releasing/reacquiring
+        * removes much of the performance benefit of bulk allocation so
+        * force the caller to allocate one page at a time, as that performs
+        * similarly to adding such complexity to the bulk allocator.
+        */
+       if (static_branch_unlikely(&page_owner_inited))
+               goto failed;
+#endif
+
        /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */
        gfp &= gfp_allowed_mask;
        alloc_gfp = gfp;
        if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags))
-               return 0;
+               goto out;
        gfp = alloc_gfp;
 
        /* Find an allowed local zone that meets the low watermark. */
@@ -5311,6 +5324,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
        __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
        zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
 
+out:
        return nr_populated;
 
 failed_irq:
@@ -5326,7 +5340,7 @@ failed:
                nr_populated++;
        }
 
-       return nr_populated;
+       goto out;
 }
 EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
 
index 795f9d5..b9eb5c1 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1440,21 +1440,20 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                /*
                 * If the page is mlock()d, we cannot swap it out.
                 */
-               if (!(flags & TTU_IGNORE_MLOCK)) {
-                       if (vma->vm_flags & VM_LOCKED) {
-                               /* PTE-mapped THP are never marked as mlocked */
-                               if (!PageTransCompound(page) ||
-                                   (PageHead(page) && !PageDoubleMap(page))) {
-                                       /*
-                                        * Holding pte lock, we do *not* need
-                                        * mmap_lock here
-                                        */
-                                       mlock_vma_page(page);
-                               }
-                               ret = false;
-                               page_vma_mapped_walk_done(&pvmw);
-                               break;
-                       }
+               if (!(flags & TTU_IGNORE_MLOCK) &&
+                   (vma->vm_flags & VM_LOCKED)) {
+                       /*
+                        * PTE-mapped THP are never marked as mlocked: so do
+                        * not set it on a DoubleMap THP, nor on an Anon THP
+                        * (which may still be PTE-mapped after DoubleMap was
+                        * cleared).  But stop unmapping even in those cases.
+                        */
+                       if (!PageTransCompound(page) || (PageHead(page) &&
+                            !PageDoubleMap(page) && !PageAnon(page)))
+                               mlock_vma_page(page);
+                       page_vma_mapped_walk_done(&pvmw);
+                       ret = false;
+                       break;
                }
 
                /* Unexpected PMD-mapped THP? */
@@ -1986,8 +1985,10 @@ static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
                 */
                if (vma->vm_flags & VM_LOCKED) {
                        /*
-                        * PTE-mapped THP are never marked as mlocked, but
-                        * this function is never called when PageDoubleMap().
+                        * PTE-mapped THP are never marked as mlocked; but
+                        * this function is never called on a DoubleMap THP,
+                        * nor on an Anon THP (which may still be PTE-mapped
+                        * after DoubleMap was cleared).
                         */
                        mlock_vma_page(page);
                        /*
@@ -2022,6 +2023,10 @@ void page_mlock(struct page *page)
        VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
        VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
 
+       /* Anon THP are only marked as mlocked when singly mapped */
+       if (PageTransCompound(page) && PageAnon(page))
+               return;
+
        rmap_walk(page, &rwc);
 }
 
index 67e0663..f997fd5 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -216,10 +216,18 @@ DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
 #endif
 extern void print_tracking(struct kmem_cache *s, void *object);
 long validate_slab_cache(struct kmem_cache *s);
+static inline bool __slub_debug_enabled(void)
+{
+       return static_branch_unlikely(&slub_debug_enabled);
+}
 #else
 static inline void print_tracking(struct kmem_cache *s, void *object)
 {
 }
+static inline bool __slub_debug_enabled(void)
+{
+       return false;
+}
 #endif
 
 /*
@@ -229,11 +237,10 @@ static inline void print_tracking(struct kmem_cache *s, void *object)
  */
 static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
 {
-#ifdef CONFIG_SLUB_DEBUG
-       VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
-       if (static_branch_unlikely(&slub_debug_enabled))
+       if (IS_ENABLED(CONFIG_SLUB_DEBUG))
+               VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+       if (__slub_debug_enabled())
                return s->flags & flags;
-#endif
        return false;
 }
 
index dc863c1..e1644ac 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
  */
 
 #ifdef CONFIG_SLUB_DEBUG
-
 #ifdef CONFIG_SLUB_DEBUG_ON
 DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
 #else
 DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
 #endif
-
-static inline bool __slub_debug_enabled(void)
-{
-       return static_branch_unlikely(&slub_debug_enabled);
-}
-
-#else          /* CONFIG_SLUB_DEBUG */
-
-static inline bool __slub_debug_enabled(void)
-{
-       return false;
-}
-
 #endif         /* CONFIG_SLUB_DEBUG */
 
 static inline bool kmem_cache_debug(struct kmem_cache *s)
index 99c6cc7..9043d03 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -731,6 +731,16 @@ int __page_mapcount(struct page *page)
 }
 EXPORT_SYMBOL_GPL(__page_mapcount);
 
+void copy_huge_page(struct page *dst, struct page *src) /* copy every page of compound page src to the same index in dst */
+{
+       unsigned i, nr = compound_nr(src); /* page count comes from src only; dst's size is not checked here */
+
+       for (i = 0; i < nr; i++) {
+               cond_resched(); /* called once per page, before each copy */
+               copy_highpage(nth_page(dst, i), nth_page(src, i));
+       }
+}
+
 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
 int sysctl_overcommit_ratio __read_mostly = 50;
 unsigned long sysctl_overcommit_kbytes __read_mostly;
index 400bd85..f6012f8 100644 (file)
@@ -203,6 +203,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr
        kfree(attr);
 }
 
+static void garp_attr_destroy_all(struct garp_applicant *app) /* destroy every attribute found in the app->gid rb-tree */
+{
+       struct rb_node *node, *next;
+       struct garp_attr *attr;
+
+       for (node = rb_first(&app->gid);
+            next = node ? rb_next(node) : NULL, node != NULL; /* fetch the successor first: the body destroys the current node */
+            node = next) {
+               attr = rb_entry(node, struct garp_attr, node);
+               garp_attr_destroy(app, attr); /* garp_attr_destroy() ends with kfree(attr), so attr is gone after this call */
+       }
+}
+
 static int garp_pdu_init(struct garp_applicant *app)
 {
        struct sk_buff *skb;
@@ -609,6 +622,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl
 
        spin_lock_bh(&app->lock);
        garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+       garp_attr_destroy_all(app);
        garp_pdu_queue(app);
        spin_unlock_bh(&app->lock);
 
index bea6e43..35e04cc 100644 (file)
@@ -292,6 +292,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr)
        kfree(attr);
 }
 
+static void mrp_attr_destroy_all(struct mrp_applicant *app) /* destroy every attribute found in the app->mad rb-tree */
+{
+       struct rb_node *node, *next;
+       struct mrp_attr *attr;
+
+       for (node = rb_first(&app->mad);
+            next = node ? rb_next(node) : NULL, node != NULL; /* fetch the successor first: the body destroys the current node */
+            node = next) {
+               attr = rb_entry(node, struct mrp_attr, node);
+               mrp_attr_destroy(app, attr); /* mrp_attr_destroy() ends with kfree(attr), so attr is gone after this call */
+       }
+}
+
 static int mrp_pdu_init(struct mrp_applicant *app)
 {
        struct sk_buff *skb;
@@ -895,6 +908,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
 
        spin_lock_bh(&app->lock);
        mrp_mad_event(app, MRP_EVENT_TX);
+       mrp_attr_destroy_all(app);
        mrp_pdu_queue(app);
        spin_unlock_bh(&app->lock);
 
index f7d2f47..6e4a323 100644 (file)
@@ -562,7 +562,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
        struct net_bridge_port *p;
        int err = 0;
        unsigned br_hr, dev_hr;
-       bool changed_addr;
+       bool changed_addr, fdb_synced = false;
 
        /* Don't allow bridging non-ethernet like devices. */
        if ((dev->flags & IFF_LOOPBACK) ||
@@ -652,6 +652,19 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
        list_add_rcu(&p->list, &br->port_list);
 
        nbp_update_port_count(br);
+       if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
+               /* When updating the port count we also update all ports'
+                * promiscuous mode.
+                * A port leaving promiscuous mode normally gets the bridge's
+                * fdb synced to the unicast filter (if supported), however,
+                * `br_port_clear_promisc` does not distinguish between
+                * non-promiscuous ports and *new* ports, so we need to
+                * sync explicitly here.
+                */
+               fdb_synced = br_fdb_sync_static(br, p) == 0;
+               if (!fdb_synced)
+                       netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
+       }
 
        netdev_update_features(br->dev);
 
@@ -701,6 +714,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
        return 0;
 
 err7:
+       if (fdb_synced)
+               br_fdb_unsync_static(br, p);
        list_del_rcu(&p->list);
        br_fdb_delete_by_port(br, p, 0, 1);
        nbp_update_port_count(br);
index 53c3a9d..d0434dc 100644 (file)
@@ -3264,7 +3264,9 @@ static void br_multicast_pim(struct net_bridge *br,
            pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
                return;
 
+       spin_lock(&br->multicast_lock);
        br_ip4_multicast_mark_router(br, port);
+       spin_unlock(&br->multicast_lock);
 }
 
 static int br_ip4_multicast_mrd_rcv(struct net_bridge *br,
@@ -3275,7 +3277,9 @@ static int br_ip4_multicast_mrd_rcv(struct net_bridge *br,
            igmp_hdr(skb)->type != IGMP_MRDISC_ADV)
                return -ENOMSG;
 
+       spin_lock(&br->multicast_lock);
        br_ip4_multicast_mark_router(br, port);
+       spin_unlock(&br->multicast_lock);
 
        return 0;
 }
@@ -3343,7 +3347,9 @@ static void br_ip6_multicast_mrd_rcv(struct net_bridge *br,
        if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV)
                return;
 
+       spin_lock(&br->multicast_lock);
        br_ip6_multicast_mark_router(br, port);
+       spin_unlock(&br->multicast_lock);
 }
 
 static int br_multicast_ipv6_rcv(struct net_bridge *br,
index c253c2a..64b21f0 100644 (file)
@@ -6008,6 +6008,19 @@ static void gro_list_prepare(const struct list_head *head,
                        diffs = memcmp(skb_mac_header(p),
                                       skb_mac_header(skb),
                                       maclen);
+
+               diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+               if (!diffs) {
+                       struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+                       struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+                       diffs |= (!!p_ext) ^ (!!skb_ext);
+                       if (!diffs && unlikely(skb_ext))
+                               diffs |= p_ext->chain ^ skb_ext->chain;
+               }
+#endif
+
                NAPI_GRO_CB(p)->same_flow = !diffs;
        }
 }
@@ -6221,6 +6234,8 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
        case GRO_MERGED_FREE:
                if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
                        napi_skb_free_stolen_head(skb);
+               else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+                       __kfree_skb(skb);
                else
                        __kfree_skb_defer(skb);
                break;
@@ -6270,6 +6285,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
        skb_shinfo(skb)->gso_type = 0;
        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
        skb_ext_reset(skb);
+       nf_reset_ct(skb);
 
        napi->skb = skb;
 }
index 12aabcd..f63de96 100644 (file)
@@ -943,6 +943,7 @@ void __kfree_skb_defer(struct sk_buff *skb)
 
 void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
+       nf_reset_ct(skb);
        skb_dst_drop(skb);
        skb_ext_put(skb);
        napi_skb_cache_put(skb);
index ba1c0f7..a3eea6e 100644 (file)
 #include <net/tcp.h>
 #include <net/busy_poll.h>
 
+#include <linux/ethtool.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -810,8 +812,47 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
        }
 }
 
-int sock_set_timestamping(struct sock *sk, int optname, int val)
+/* Bind sk to PHC phc_index, which must be a vclock of sk's bound device. */
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
+{
+       struct net *net = sock_net(sk);
+       struct net_device *dev = NULL;
+       bool match = false;
+       int *vclock_index;
+       int i, num;
+
+       if (sk->sk_bound_dev_if)
+               dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+       if (!dev) {
+               pr_err("%s: sock not bind to device\n", __func__);
+               return -EOPNOTSUPP; /* socket has no (resolvable) bound device */
+       }
+
+       num = ethtool_get_phc_vclocks(dev, &vclock_index);
+       dev_put(dev); /* drop the reference taken by dev_get_by_index(); dev is not used below */
+
+       for (i = 0; i < num; i++) {
+               if (vclock_index[i] == phc_index) {
+                       match = true;
+                       break;
+               }
+       }
+       if (num > 0)
+               kfree(vclock_index); /* vclock_index can be left unset when num is 0 */
+
+       if (!match)
+               return -EINVAL; /* phc_index is not one of the device's vclocks */
+
+       sk->sk_bind_phc = phc_index;
+       return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+                         struct so_timestamping timestamping)
 {
+       int val = timestamping.flags;
+       int ret;
+
        if (val & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;
 
@@ -832,6 +873,12 @@ int sock_set_timestamping(struct sock *sk, int optname, int val)
            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
                return -EINVAL;
 
+       if (val & SOF_TIMESTAMPING_BIND_PHC) {
+               ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+               if (ret)
+                       return ret;
+       }
+
        sk->sk_tsflags = val;
        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 
@@ -907,6 +954,7 @@ EXPORT_SYMBOL(sock_set_mark);
 int sock_setsockopt(struct socket *sock, int level, int optname,
                    sockptr_t optval, unsigned int optlen)
 {
+       struct so_timestamping timestamping;
        struct sock_txtime sk_txtime;
        struct sock *sk = sock->sk;
        int val;
@@ -1068,12 +1116,22 @@ set_sndbuf:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
-               sock_set_timestamp(sk, valbool, optname);
+               sock_set_timestamp(sk, optname, valbool);
                break;
 
        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
-               ret = sock_set_timestamping(sk, optname, val);
+               if (optlen == sizeof(timestamping)) {
+                       if (copy_from_sockptr(&timestamping, optval,
+                                             sizeof(timestamping))) {
+                               ret = -EFAULT;
+                               break;
+                       }
+               } else {
+                       memset(&timestamping, 0, sizeof(timestamping));
+                       timestamping.flags = val;
+               }
+               ret = sock_set_timestamping(sk, optname, timestamping);
                break;
 
        case SO_RCVLOWAT:
@@ -1201,7 +1259,7 @@ set_sndbuf:
                        if (val < 0)
                                ret = -EINVAL;
                        else
-                               sk->sk_ll_usec = val;
+                               WRITE_ONCE(sk->sk_ll_usec, val);
                }
                break;
        case SO_PREFER_BUSY_POLL:
@@ -1348,6 +1406,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                struct __kernel_old_timeval tm;
                struct  __kernel_sock_timeval stm;
                struct sock_txtime txtime;
+               struct so_timestamping timestamping;
        } v;
 
        int lv = sizeof(int);
@@ -1451,7 +1510,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                break;
 
        case SO_TIMESTAMPING_OLD:
-               v.val = sk->sk_tsflags;
+               lv = sizeof(v.timestamping);
+               v.timestamping.flags = sk->sk_tsflags;
+               v.timestamping.bind_phc = sk->sk_bind_phc;
                break;
 
        case SO_RCVTIMEO_OLD:
index af71b86..5ece05d 100644 (file)
@@ -113,11 +113,11 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds,
        int err, port;
 
        if (dst->index == info->tree_index && ds->index == info->sw_index &&
-           ds->ops->port_bridge_join)
+           ds->ops->port_bridge_leave)
                ds->ops->port_bridge_leave(ds, info->port, info->br);
 
        if ((dst->index != info->tree_index || ds->index != info->sw_index) &&
-           ds->ops->crosschip_bridge_join)
+           ds->ops->crosschip_bridge_leave)
                ds->ops->crosschip_bridge_leave(ds, info->tree_index,
                                                info->sw_index, info->port,
                                                info->br);
@@ -427,7 +427,7 @@ static int dsa_switch_lag_join(struct dsa_switch *ds,
                                                   info->port, info->lag,
                                                   info->info);
 
-       return 0;
+       return -EOPNOTSUPP;
 }
 
 static int dsa_switch_lag_leave(struct dsa_switch *ds,
@@ -440,7 +440,7 @@ static int dsa_switch_lag_leave(struct dsa_switch *ds,
                return ds->ops->crosschip_lag_leave(ds, info->sw_index,
                                                    info->port, info->lag);
 
-       return 0;
+       return -EOPNOTSUPP;
 }
 
 static int dsa_switch_mdb_add(struct dsa_switch *ds,
index 723c9a8..0a19470 100644 (file)
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)   += ethtool_nl.o
 ethtool_nl-y   := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
                   linkstate.o debug.o wol.o features.o privflags.o rings.o \
                   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-                  tunnels.o fec.o eeprom.o stats.o
+                  tunnels.o fec.o eeprom.o stats.o phc_vclocks.o
index f9dcbad..c63e073 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/rtnetlink.h>
+#include <linux/ptp_clock_kernel.h>
 
 #include "common.h"
 
@@ -397,6 +398,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
        [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)]    = "option-stats",
        [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)]  = "option-pktinfo",
        [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)]  = "option-tx-swhw",
+       [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)]     = "bind-phc",
 };
 static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
 
@@ -554,6 +556,18 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
        return 0;
 }
 
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) /* look up the vclock indexes behind dev's PHC */
+{
+       struct ethtool_ts_info info = { };
+       int num = 0; /* returned as-is when the ts info query fails */
+
+       if (!__ethtool_get_ts_info(dev, &info)) /* a zero return is treated as success */
+               num = ptp_get_vclocks_index(info.phc_index, vclock_index); /* *vclock_index is only written on this path */
+
+       return num; /* callers in this diff use it as the element count of *vclock_index and kfree() the array */
+}
+EXPORT_SYMBOL(ethtool_get_phc_vclocks);
+
 const struct ethtool_phy_ops *ethtool_phy_ops;
 
 void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
index a734634..73e0f5b 100644 (file)
@@ -248,6 +248,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
        [ETHTOOL_MSG_TSINFO_GET]        = &ethnl_tsinfo_request_ops,
        [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops,
        [ETHTOOL_MSG_STATS_GET]         = &ethnl_stats_request_ops,
+       [ETHTOOL_MSG_PHC_VCLOCKS_GET]   = &ethnl_phc_vclocks_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -958,6 +959,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
                .policy = ethnl_stats_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
        },
+       {
+               .cmd    = ETHTOOL_MSG_PHC_VCLOCKS_GET,
+               .doit   = ethnl_default_doit,
+               .start  = ethnl_default_start,
+               .dumpit = ethnl_default_dumpit,
+               .done   = ethnl_default_done,
+               .policy = ethnl_phc_vclocks_get_policy,
+               .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1,
+       },
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
index 3e25a47..3fc395c 100644 (file)
@@ -347,6 +347,7 @@ extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
 extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
 extern const struct ethnl_request_ops ethnl_stats_request_ops;
+extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -382,6 +383,7 @@ extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
 extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1];
 extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1];
+extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c
new file mode 100644 (file)
index 0000000..637b2f5
--- /dev/null
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 NXP
+ */
+#include "netlink.h"
+#include "common.h"
+
+struct phc_vclocks_req_info { /* request state: nothing beyond the common ethnl request base */
+       struct ethnl_req_info           base;
+};
+
+struct phc_vclocks_reply_data { /* reply state filled by phc_vclocks_prepare_data() */
+       struct ethnl_reply_data         base;
+       int                             num; /* return value of ethtool_get_phc_vclocks() */
+       int                             *index; /* array handed back by ethtool_get_phc_vclocks(); kfree()d in cleanup */
+};
+
+#define PHC_VCLOCKS_REPDATA(__reply_base) \
+       container_of(__reply_base, struct phc_vclocks_reply_data, base) /* common reply base -> enclosing reply data */
+
+const struct nla_policy ethnl_phc_vclocks_get_policy[] = {
+       [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), /* the only attribute listed: a nested header */
+};
+
+static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, /* req_base and info are not used in this body */
+                                   struct ethnl_reply_data *reply_base,
+                                   struct genl_info *info)
+{
+       struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base);
+       struct net_device *dev = reply_base->dev;
+       int ret;
+
+       ret = ethnl_ops_begin(dev);
+       if (ret < 0)
+               return ret; /* data->num and data->index are left untouched on this path */
+       data->num = ethtool_get_phc_vclocks(dev, &data->index); /* query runs between ops_begin and ops_complete */
+       ethnl_ops_complete(dev);
+
+       return ret; /* the non-negative ethnl_ops_begin() result; data->num is not folded into it */
+}
+
+static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, /* payload size needed by phc_vclocks_fill_reply() */
+                                 const struct ethnl_reply_data *reply_base)
+{
+       const struct phc_vclocks_reply_data *data =
+               PHC_VCLOCKS_REPDATA(reply_base);
+       int len = 0;
+
+       if (data->num > 0) { /* nothing is reserved when no vclocks were found */
+               len += nla_total_size(sizeof(u32)); /* room for ETHTOOL_A_PHC_VCLOCKS_NUM */
+               len += nla_total_size(sizeof(s32) * data->num); /* room for the ETHTOOL_A_PHC_VCLOCKS_INDEX array */
+       }
+
+       return len;
+}
+
+static int phc_vclocks_fill_reply(struct sk_buff *skb, /* emit the vclock count and index array into skb */
+                                 const struct ethnl_req_info *req_base,
+                                 const struct ethnl_reply_data *reply_base)
+{
+       const struct phc_vclocks_reply_data *data =
+               PHC_VCLOCKS_REPDATA(reply_base);
+
+       if (data->num <= 0)
+               return 0; /* no vclocks: succeed without adding any attribute */
+
+       if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) ||
+           nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX,
+                   sizeof(s32) * data->num, data->index)) /* whole index array goes out as one attribute */
+               return -EMSGSIZE; /* either put failing is reported as lack of space */
+
+       return 0;
+}
+
+static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) /* release the index array held in the reply data */
+{
+       const struct phc_vclocks_reply_data *data =
+               PHC_VCLOCKS_REPDATA(reply_base);
+
+       kfree(data->index); /* NOTE(review): relies on index being NULL when never filled — confirm reply data is zero-initialised */
+}
+
+const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { /* ops table for ETHTOOL_MSG_PHC_VCLOCKS_GET */
+       .request_cmd            = ETHTOOL_MSG_PHC_VCLOCKS_GET,
+       .reply_cmd              = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
+       .hdr_attr               = ETHTOOL_A_PHC_VCLOCKS_HEADER, /* same attribute the get policy lists */
+       .req_info_size          = sizeof(struct phc_vclocks_req_info),
+       .reply_data_size        = sizeof(struct phc_vclocks_reply_data),
+
+       .prepare_data           = phc_vclocks_prepare_data,
+       .reply_size             = phc_vclocks_reply_size,
+       .fill_reply             = phc_vclocks_fill_reply,
+       .cleanup_data           = phc_vclocks_cleanup_data, /* frees the array obtained in prepare_data */
+};
index a933bd6..9fe13e4 100644 (file)
@@ -1376,7 +1376,7 @@ static void nl_fib_input(struct sk_buff *skb)
        portid = NETLINK_CB(skb).portid;      /* netlink portid */
        NETLINK_CB(skb).portid = 0;        /* from kernel */
        NETLINK_CB(skb).dst_group = 0;  /* unicast */
-       netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
+       nlmsg_unicast(net->ipv4.fibnl, skb, portid);
 }
 
 static int __net_init nl_fib_lookup_init(struct net *net)
index e65f4ef..ef78972 100644 (file)
@@ -580,10 +580,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
                nlmsg_free(rep);
                goto out;
        }
-       err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-                             MSG_DONTWAIT);
-       if (err > 0)
-               err = 0;
+       err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
 
 out:
        if (sk)
index f6cc26d..0dca007 100644 (file)
@@ -317,7 +317,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
        }
 
        dev->needed_headroom = t_hlen + hlen;
-       mtu -= t_hlen;
+       mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
 
        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;
@@ -348,6 +348,9 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - t_hlen;
+       if (dev->type == ARPHRD_ETHER)
+               dev->max_mtu -= dev->hard_header_len;
+
        ip_tunnel_add(itn, nt);
        return nt;
 
@@ -489,11 +492,14 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 
        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen;
+       pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
 
-       if (df)
+       if (df) {
                mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
-       else
+               mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+       } else {
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+       }
 
        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);
@@ -972,6 +978,9 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - t_hlen;
 
+       if (dev->type == ARPHRD_ETHER)
+               max_mtu -= dev->hard_header_len;
+
        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;
 
@@ -1149,6 +1158,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
 
+               if (dev->type == ARPHRD_ETHER)
+                       max -= dev->hard_header_len;
+
                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
        }
 
index 7b12a40..2dda856 100644 (file)
@@ -2119,7 +2119,7 @@ int ip_mr_input(struct sk_buff *skb)
                                raw_rcv(mroute_sk, skb);
                                return 0;
                        }
-                   }
+               }
        }
 
        /* already under rcu_read_lock() */
index 1b5b8af..ccacbde 100644 (file)
@@ -119,11 +119,8 @@ static int raw_diag_dump_one(struct netlink_callback *cb,
                return err;
        }
 
-       err = netlink_unicast(net->diag_nlsk, rep,
-                             NETLINK_CB(in_skb).portid,
-                             MSG_DONTWAIT);
-       if (err > 0)
-               err = 0;
+       err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
        return err;
 }
 
index d5ab5f2..8cb4404 100644 (file)
@@ -1375,6 +1375,9 @@ new_segment:
                        }
                        pfrag->offset += copy;
                } else {
+                       if (!sk_wmem_schedule(sk, copy))
+                               goto wait_for_space;
+
                        err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
                        if (err == -EMSGSIZE || err == -EEXIST) {
                                tcp_mark_push(tp, skb);
index e6ca5a1..149ceb5 100644 (file)
@@ -4247,6 +4247,9 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
        trace_tcp_receive_reset(sk);
 
+       /* mptcp can't tell us to ignore reset pkts,
+        * so just ignore the return value of mptcp_incoming_options().
+        */
        if (sk_is_mptcp(sk))
                mptcp_incoming_options(sk, skb);
 
@@ -4941,8 +4944,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
        bool fragstolen;
        int eaten;
 
-       if (sk_is_mptcp(sk))
-               mptcp_incoming_options(sk, skb);
+       /* If a subflow has been reset, the packet should not continue
+        * to be processed, drop the packet.
+        */
+       if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+               __kfree_skb(skb);
+               return;
+       }
 
        if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
                __kfree_skb(skb);
@@ -5922,8 +5930,8 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
                tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
        tp->snd_cwnd_stamp = tcp_jiffies32;
 
-       icsk->icsk_ca_initialized = 0;
        bpf_skops_established(sk, bpf_op, skb);
+       /* Initialize congestion control unless BPF initialized it already: */
        if (!icsk->icsk_ca_initialized)
                tcp_init_congestion_control(sk);
        tcp_init_buffer_space(sk);
@@ -6523,8 +6531,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
        case TCP_CLOSING:
        case TCP_LAST_ACK:
                if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
-                       if (sk_is_mptcp(sk))
-                               mptcp_incoming_options(sk, skb);
+                       /* If a subflow has been reset, the packet should not
+                        * continue to be processed, drop the packet.
+                        */
+                       if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+                               goto discard;
                        break;
                }
                fallthrough;
index e66ad6b..b9dc2d6 100644 (file)
@@ -342,7 +342,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
 
        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
-       mtu = tcp_sk(sk)->mtu_info;
+       mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
        dst = inet_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;
@@ -546,7 +546,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
                        if (sk->sk_state == TCP_LISTEN)
                                goto out;
 
-                       tp->mtu_info = info;
+                       WRITE_ONCE(tp->mtu_info, info);
                        if (!sock_owned_by_user(sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
index bde781f..29553fc 100644 (file)
@@ -1732,6 +1732,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
        return __tcp_mtu_to_mss(sk, pmtu) -
               (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
 }
+EXPORT_SYMBOL(tcp_mtu_to_mss);
 
 /* Inverse of above */
 int tcp_mss_to_mtu(struct sock *sk, int mss)
index 6268280..62cd4cd 100644 (file)
@@ -1102,7 +1102,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        }
 
        ipcm_init_sk(&ipc, inet);
-       ipc.gso_size = up->gso_size;
+       ipc.gso_size = READ_ONCE(up->gso_size);
 
        if (msg->msg_controllen) {
                err = udp_cmsg_send(sk, msg, &ipc.gso_size);
@@ -2695,7 +2695,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
        case UDP_SEGMENT:
                if (val < 0 || val > USHRT_MAX)
                        return -EINVAL;
-               up->gso_size = val;
+               WRITE_ONCE(up->gso_size, val);
                break;
 
        case UDP_GRO:
@@ -2790,7 +2790,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
                break;
 
        case UDP_SEGMENT:
-               val = up->gso_size;
+               val = READ_ONCE(up->gso_size);
                break;
 
        case UDP_GRO:
index b2cee9a..1ed8c4d 100644 (file)
@@ -77,10 +77,8 @@ static int udp_dump_one(struct udp_table *tbl,
                kfree_skb(rep);
                goto out;
        }
-       err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-                             MSG_DONTWAIT);
-       if (err > 0)
-               err = 0;
+       err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
        if (sk)
                sock_put(sk);
index 54e06b8..9dde1e5 100644 (file)
@@ -525,8 +525,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 
                if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
                    (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
-                       pp = call_gro_receive(udp_gro_receive_segment, head, skb);
-               return pp;
+                       return call_gro_receive(udp_gro_receive_segment, head, skb);
+
+               /* no GRO, be sure flush the current packet */
+               goto out;
        }
 
        if (NAPI_GRO_CB(skb)->encap_mark ||
index 984050f..01bea76 100644 (file)
@@ -60,10 +60,38 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 {
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
+       unsigned int hh_len = LL_RESERVED_SPACE(dev);
+       int delta = hh_len - skb_headroom(skb);
        const struct in6_addr *nexthop;
        struct neighbour *neigh;
        int ret;
 
+       /* Be paranoid: grow the headroom when it is short of LL_RESERVED_SPACE(dev). */
+       if (unlikely(delta > 0) && dev->header_ops) {
+               /* pskb_expand_head() might crash, if skb is shared */
+               if (skb_shared(skb)) {
+                       struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+                       if (likely(nskb)) {
+                               if (skb->sk)
+                                       skb_set_owner_w(nskb, skb->sk); /* own the clone; skb is released on the next line */
+                               consume_skb(skb);
+                       } else {
+                               kfree_skb(skb);
+                       }
+                       skb = nskb; /* NULL here means the clone failed and skb was freed */
+               }
+               if (skb &&
+                   pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+                       kfree_skb(skb);
+                       skb = NULL;
+               }
+               if (!skb) { /* clone or expansion failed; the buffer has already been freed */
+                       IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+                       return -ENOMEM;
+               }
+       }
+
        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 
@@ -479,7 +507,9 @@ int ip6_forward(struct sk_buff *skb)
        if (skb_warn_if_lro(skb))
                goto drop;
 
-       if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+       if (!net->ipv6.devconf_all->disable_policy &&
+           !idev->cnf.disable_policy &&
+           !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
index 578ab63..0ce52d4 100644 (file)
@@ -348,11 +348,20 @@ failure:
 static void tcp_v6_mtu_reduced(struct sock *sk)
 {
        struct dst_entry *dst;
+       u32 mtu;
 
        if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
                return;
 
-       dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+       mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+
+       /* Drop requests trying to increase our current mss.
+        * Check done in __ip6_rt_update_pmtu() is too late.
+        */
+       if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache)
+               return;
+
+       dst = inet6_csk_update_pmtu(sk, mtu);
        if (!dst)
                return;
 
@@ -433,6 +442,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        }
 
        if (type == ICMPV6_PKT_TOOBIG) {
+               u32 mtu = ntohl(info);
+
                /* We are not interested in TCP_LISTEN and open_requests
                 * (SYN-ACKs send out by Linux are always <576bytes so
                 * they should go through unfragmented).
@@ -443,7 +454,11 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                if (!ip6_sk_accept_pmtu(sk))
                        goto out;
 
-               tp->mtu_info = ntohl(info);
+               if (mtu < IPV6_MIN_MTU)
+                       goto out;
+
+               WRITE_ONCE(tp->mtu_info, mtu);
+
                if (!sock_owned_by_user(sk))
                        tcp_v6_mtu_reduced(sk);
                else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
@@ -540,7 +555,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
                opt = ireq->ipv6_opt;
                if (!opt)
                        opt = rcu_dereference(np->opt);
-               err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+               err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt,
                               tclass, sk->sk_priority);
                rcu_read_unlock();
                err = net_xmit_eval(err);
index 368972d..0cc7ba5 100644 (file)
@@ -1296,7 +1296,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
        int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 
        ipcm6_init(&ipc6);
-       ipc6.gso_size = up->gso_size;
+       ipc6.gso_size = READ_ONCE(up->gso_size);
        ipc6.sockc.tsflags = sk->sk_tsflags;
        ipc6.sockc.mark = sk->sk_mark;
 
index 57fa27c..d0d2800 100644 (file)
@@ -49,7 +49,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_state *x = dst->xfrm;
-       int mtu;
+       unsigned int mtu;
        bool toobig;
 
 #ifdef CONFIG_NETFILTER
index 349c6ac..e6795d5 100644 (file)
@@ -1635,14 +1635,16 @@ struct iucv_message_pending {
        u8  iptype;
        u32 ipmsgid;
        u32 iptrgcls;
-       union {
-               u32 iprmmsg1_u32;
-               u8  iprmmsg1[4];
-       } ln1msg1;
-       union {
-               u32 ipbfln1f;
-               u8  iprmmsg2[4];
-       } ln1msg2;
+       struct {
+               union {
+                       u32 iprmmsg1_u32;
+                       u8  iprmmsg1[4];
+               } ln1msg1;
+               union {
+                       u32 ipbfln1f;
+                       u8  iprmmsg2[4];
+               } ln1msg2;
+       } rmmsg;
        u32 res1[3];
        u32 ipbfln2f;
        u8  ippollfg;
@@ -1660,10 +1662,10 @@ static void iucv_message_pending(struct iucv_irq_data *data)
                msg.id = imp->ipmsgid;
                msg.class = imp->iptrgcls;
                if (imp->ipflags1 & IUCV_IPRMDATA) {
-                       memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1, 8);
+                       memcpy(msg.rmmsg, &imp->rmmsg, 8);
                        msg.length = 8;
                } else
-                       msg.length = imp->ln1msg2.ipbfln1f;
+                       msg.length = imp->rmmsg.ln1msg2.ipbfln1f;
                msg.reply_size = imp->ipbfln2f;
                path->handler->message_pending(path, &msg);
        }
index 52ea251..ff2cc0e 100644 (file)
@@ -44,6 +44,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
        SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
        SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
        SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
+       SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED),
        SNMP_MIB_SENTINEL
 };
 
index 193466c..0663cb1 100644 (file)
@@ -37,6 +37,7 @@ enum linux_mptcp_mib_field {
        MPTCP_MIB_RMSUBFLOW,            /* Remove a subflow */
        MPTCP_MIB_MPPRIOTX,             /* Transmit a MP_PRIO */
        MPTCP_MIB_MPPRIORX,             /* Received a MP_PRIO */
+       MPTCP_MIB_RCVPRUNED,            /* Incoming packet dropped due to memory limit */
        __MPTCP_MIB_MAX
 };
 
index 8f88dde..f48eb63 100644 (file)
@@ -57,10 +57,8 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb,
                kfree_skb(rep);
                goto out;
        }
-       err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-                             MSG_DONTWAIT);
-       if (err > 0)
-               err = 0;
+       err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
        sock_put(sk);
 
index b5850af..4452455 100644 (file)
@@ -1035,7 +1035,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
        return hmac == mp_opt->ahmac;
 }
 
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
+/* Return false if a subflow has been reset, else return true */
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
@@ -1053,12 +1054,16 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
                        __mptcp_check_push(subflow->conn, sk);
                __mptcp_data_acked(subflow->conn);
                mptcp_data_unlock(subflow->conn);
-               return;
+               return true;
        }
 
        mptcp_get_options(sk, skb, &mp_opt);
+
+       /* The subflow can be in close state only if check_fully_established()
+        * just sent a reset. If so, tell the caller to ignore the current packet.
+        */
        if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
-               return;
+               return sk->sk_state != TCP_CLOSE;
 
        if (mp_opt.fastclose &&
            msk->local_key == mp_opt.rcvr_key) {
@@ -1100,7 +1105,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
        }
 
        if (!mp_opt.dss)
-               return;
+               return true;
 
        /* we can't wait for recvmsg() to update the ack_seq, otherwise
         * monodirectional flows will stuck
@@ -1119,12 +1124,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
                    schedule_work(&msk->work))
                        sock_hold(subflow->conn);
 
-               return;
+               return true;
        }
 
        mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
        if (!mpext)
-               return;
+               return true;
 
        memset(mpext, 0, sizeof(*mpext));
 
@@ -1153,6 +1158,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
                if (mpext->csum_reqd)
                        mpext->csum = mp_opt.csum;
        }
+
+       return true;
 }
 
 static void mptcp_set_rwin(const struct tcp_sock *tp)
index 7a5afa8..a889249 100644 (file)
@@ -474,7 +474,7 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
        bool cleanup, rx_empty;
 
        cleanup = (space > 0) && (space >= (old_space << 1));
-       rx_empty = !atomic_read(&sk->sk_rmem_alloc);
+       rx_empty = !__mptcp_rmem(sk);
 
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -720,8 +720,10 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
                sk_rbuf = ssk_rbuf;
 
        /* over limit? can't append more skbs to msk, Also, no need to wake-up*/
-       if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
+       if (__mptcp_rmem(sk) > sk_rbuf) {
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
                return;
+       }
 
        /* Wake-up the reader only for in-sequence data */
        mptcp_data_lock(sk);
@@ -1754,7 +1756,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
                if (!(flags & MSG_PEEK)) {
                        /* we will bulk release the skb memory later */
                        skb->destructor = NULL;
-                       msk->rmem_released += skb->truesize;
+                       WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize);
                        __skb_unlink(skb, &msk->receive_queue);
                        __kfree_skb(skb);
                }
@@ -1873,7 +1875,7 @@ static void __mptcp_update_rmem(struct sock *sk)
 
        atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
        sk_mem_uncharge(sk, msk->rmem_released);
-       msk->rmem_released = 0;
+       WRITE_ONCE(msk->rmem_released, 0);
 }
 
 static void __mptcp_splice_receive_queue(struct sock *sk)
@@ -2380,7 +2382,7 @@ static int __mptcp_init_sock(struct sock *sk)
        msk->out_of_order_queue = RB_ROOT;
        msk->first_pending = NULL;
        msk->wmem_reserved = 0;
-       msk->rmem_released = 0;
+       WRITE_ONCE(msk->rmem_released, 0);
        msk->tx_pending_data = 0;
 
        msk->first = NULL;
index 426ed80..0f0c026 100644 (file)
@@ -296,9 +296,17 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
        return (struct mptcp_sock *)sk;
 }
 
+/* the msk socket doesn't use the backlog; also account for the
+ * bulk-freed memory
+ */
+static inline int __mptcp_rmem(const struct sock *sk)
+{
+       return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
+}
+
 static inline int __mptcp_space(const struct sock *sk)
 {
-       return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_released);
+       return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
 }
 
 static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
index 092d1f6..8c03afa 100644 (file)
@@ -157,19 +157,7 @@ static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optnam
                struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
                bool slow = lock_sock_fast(ssk);
 
-               switch (optname) {
-               case SO_TIMESTAMP_OLD:
-               case SO_TIMESTAMP_NEW:
-               case SO_TIMESTAMPNS_OLD:
-               case SO_TIMESTAMPNS_NEW:
-                       sock_set_timestamp(sk, optname, !!val);
-                       break;
-               case SO_TIMESTAMPING_NEW:
-               case SO_TIMESTAMPING_OLD:
-                       sock_set_timestamping(sk, optname, val);
-                       break;
-               }
-
+               sock_set_timestamp(sk, optname, !!val);
                unlock_sock_fast(ssk, slow);
        }
 
@@ -178,7 +166,8 @@ static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optnam
 }
 
 static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
-                                          sockptr_t optval, unsigned int optlen)
+                                          sockptr_t optval,
+                                          unsigned int optlen)
 {
        int val, ret;
 
@@ -205,14 +194,56 @@ static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
-       case SO_TIMESTAMPING_OLD:
-       case SO_TIMESTAMPING_NEW:
                return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val);
        }
 
        return -ENOPROTOOPT;
 }
 
+static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
+                                                   int optname,
+                                                   sockptr_t optval,
+                                                   unsigned int optlen)
+{
+       struct mptcp_subflow_context *subflow;
+       struct sock *sk = (struct sock *)msk;
+       struct so_timestamping timestamping;
+       int ret;
+
+       if (optlen == sizeof(timestamping)) {
+               if (copy_from_sockptr(&timestamping, optval,
+                                     sizeof(timestamping)))
+                       return -EFAULT;
+       } else if (optlen == sizeof(int)) {
+               memset(&timestamping, 0, sizeof(timestamping));
+
+               if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
+                       return -EFAULT;
+       } else {
+               return -EINVAL;
+       }
+
+       ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
+                             KERNEL_SOCKPTR(&timestamping),
+                             sizeof(timestamping));
+       if (ret)
+               return ret;
+
+       lock_sock(sk);
+
+       mptcp_for_each_subflow(msk, subflow) {
+               struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+               bool slow = lock_sock_fast(ssk);
+
+               sock_set_timestamping(sk, optname, timestamping);
+               unlock_sock_fast(ssk, slow);
+       }
+
+       release_sock(sk);
+
+       return 0;
+}
+
 static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
                                              unsigned int optlen)
 {
@@ -299,9 +330,12 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
+               return mptcp_setsockopt_sol_socket_int(msk, optname, optval,
+                                                      optlen);
        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
-               return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen);
+               return mptcp_setsockopt_sol_socket_timestamping(msk, optname,
+                                                               optval, optlen);
        case SO_LINGER:
                return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen);
        case SO_RCVLOWAT:
index 66d0b18..966f777 100644 (file)
@@ -214,11 +214,6 @@ again:
                                 ntohs(inet_sk(sk_listener)->inet_sport),
                                 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
                        if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
-                               sock_put((struct sock *)subflow_req->msk);
-                               mptcp_token_destroy_request(req);
-                               tcp_request_sock_ops.destructor(req);
-                               subflow_req->msk = NULL;
-                               subflow_req->mp_join = 0;
                                SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
                                return -EPERM;
                        }
@@ -230,6 +225,8 @@ again:
                if (unlikely(req->syncookie)) {
                        if (mptcp_can_accept_new_subflow(subflow_req->msk))
                                subflow_init_req_cookie_join_save(subflow_req, skb);
+                       else
+                               return -EPERM;
                }
 
                pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
@@ -269,9 +266,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
                if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
                        return -EINVAL;
 
-               if (mptcp_can_accept_new_subflow(subflow_req->msk))
-                       subflow_req->mp_join = 1;
-
+               subflow_req->mp_join = 1;
                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
        }
 
index abe0fd0..3712778 100644 (file)
@@ -37,7 +37,21 @@ static spinlock_t join_entry_locks[COOKIE_JOIN_SLOTS] __cacheline_aligned_in_smp
 
 static u32 mptcp_join_entry_hash(struct sk_buff *skb, struct net *net)
 {
-       u32 i = skb_get_hash(skb) ^ net_hash_mix(net);
+       static u32 mptcp_join_hash_secret __read_mostly;
+       struct tcphdr *th = tcp_hdr(skb);
+       u32 seq, i;
+
+       net_get_random_once(&mptcp_join_hash_secret,
+                           sizeof(mptcp_join_hash_secret));
+
+       if (th->syn)
+               seq = TCP_SKB_CB(skb)->seq;
+       else
+               seq = TCP_SKB_CB(skb)->seq - 1;
+
+       i = jhash_3words(seq, net_hash_mix(net),
+                        (__force __u32)th->source << 16 | (__force __u32)th->dest,
+                        mptcp_join_hash_secret);
 
        return i % ARRAY_SIZE(join_entries);
 }
index 9330908..ea1dd32 100644 (file)
@@ -17,3 +17,9 @@ config NCSI_OEM_CMD_GET_MAC
        help
          This allows to get MAC address from NCSI firmware and set them back to
                controller.
+config NCSI_OEM_CMD_KEEP_PHY
+       bool "Keep PHY Link up"
+       depends on NET_NCSI
+       help
+         This allows keeping the PHY link up and prevents any channel resets
+         during the host load.
index cbbb0de..0b6cfd3 100644 (file)
@@ -78,6 +78,9 @@ enum {
 /* OEM Vendor Manufacture ID */
 #define NCSI_OEM_MFR_MLX_ID             0x8119
 #define NCSI_OEM_MFR_BCM_ID             0x113d
+#define NCSI_OEM_MFR_INTEL_ID           0x157
+/* Intel specific OEM command */
+#define NCSI_OEM_INTEL_CMD_KEEP_PHY     0x20   /* CMD ID for Keep PHY up */
 /* Broadcom specific OEM Command */
 #define NCSI_OEM_BCM_CMD_GMA            0x01   /* CMD ID for Get MAC */
 /* Mellanox specific OEM Command */
@@ -86,6 +89,7 @@ enum {
 #define NCSI_OEM_MLX_CMD_SMAF           0x01   /* CMD ID for Set MC Affinity */
 #define NCSI_OEM_MLX_CMD_SMAF_PARAM     0x07   /* Parameter for SMAF         */
 /* OEM Command payload lengths*/
+#define NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN 7
 #define NCSI_OEM_BCM_CMD_GMA_LEN        12
 #define NCSI_OEM_MLX_CMD_GMA_LEN        8
 #define NCSI_OEM_MLX_CMD_SMAF_LEN        60
@@ -271,6 +275,7 @@ enum {
        ncsi_dev_state_probe_mlx_gma,
        ncsi_dev_state_probe_mlx_smaf,
        ncsi_dev_state_probe_cis,
+       ncsi_dev_state_probe_keep_phy,
        ncsi_dev_state_probe_gvi,
        ncsi_dev_state_probe_gc,
        ncsi_dev_state_probe_gls,
index ca04b6d..89c7742 100644 (file)
@@ -689,6 +689,35 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
        return 0;
 }
 
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)
+
+static int ncsi_oem_keep_phy_intel(struct ncsi_cmd_arg *nca)
+{
+       unsigned char data[NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN];
+       int ret = 0;
+
+       nca->payload = NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN;
+
+       memset(data, 0, NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN);
+       *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID);
+
+       data[4] = NCSI_OEM_INTEL_CMD_KEEP_PHY;
+
+       /* PHY Link up attribute */
+       data[6] = 0x1;
+
+       nca->data = data;
+
+       ret = ncsi_xmit_cmd(nca);
+       if (ret)
+               netdev_err(nca->ndp->ndev.dev,
+                          "NCSI: Failed to transmit cmd 0x%x during configure\n",
+                          nca->type);
+       return ret;
+}
+
+#endif
+
 #if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
 
 /* NCSI OEM Command APIs */
@@ -700,7 +729,7 @@ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
        nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
 
        memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
-       *(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
+       *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_BCM_ID);
        data[5] = NCSI_OEM_BCM_CMD_GMA;
 
        nca->data = data;
@@ -724,7 +753,7 @@ static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca)
        nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN;
 
        memset(&u, 0, sizeof(u));
-       u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID);
+       u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID);
        u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA;
        u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM;
 
@@ -747,7 +776,7 @@ static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca)
        int ret = 0;
 
        memset(&u, 0, sizeof(u));
-       u.data_u32[0] = ntohl(NCSI_OEM_MFR_MLX_ID);
+       u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID);
        u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF;
        u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM;
        memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET],
@@ -1392,7 +1421,23 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
                }
 
                nd->state = ncsi_dev_state_probe_gvi;
+               if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY))
+                       nd->state = ncsi_dev_state_probe_keep_phy;
+               break;
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY)
+       case ncsi_dev_state_probe_keep_phy:
+               ndp->pending_req_num = 1;
+
+               nca.type = NCSI_PKT_CMD_OEM;
+               nca.package = ndp->active_package->id;
+               nca.channel = 0;
+               ret = ncsi_oem_keep_phy_intel(&nca);
+               if (ret)
+                       goto error;
+
+               nd->state = ncsi_dev_state_probe_gvi;
                break;
+#endif /* CONFIG_NCSI_OEM_CMD_KEEP_PHY */
        case ncsi_dev_state_probe_gvi:
        case ncsi_dev_state_probe_gc:
        case ncsi_dev_state_probe_gls:
index 888ccc2..d483748 100644 (file)
@@ -403,7 +403,7 @@ static int ncsi_rsp_handler_ev(struct ncsi_request *nr)
        /* Update to VLAN mode */
        cmd = (struct ncsi_cmd_ev_pkt *)skb_network_header(nr->cmd);
        ncm->enable = 1;
-       ncm->data[0] = ntohl(cmd->mode);
+       ncm->data[0] = ntohl((__force __be32)cmd->mode);
 
        return 0;
 }
@@ -699,12 +699,19 @@ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr)
        return 0;
 }
 
+/* Response handler for Intel card */
+static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr)
+{
+       return 0;
+}
+
 static struct ncsi_rsp_oem_handler {
        unsigned int    mfr_id;
        int             (*handler)(struct ncsi_request *nr);
 } ncsi_rsp_oem_handlers[] = {
        { NCSI_OEM_MFR_MLX_ID, ncsi_rsp_handler_oem_mlx },
-       { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
+       { NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm },
+       { NCSI_OEM_MFR_INTEL_ID, ncsi_rsp_handler_oem_intel }
 };
 
 /* Response handler for OEM command */
index 96ba19f..83c52df 100644 (file)
@@ -149,7 +149,15 @@ static void nf_conntrack_all_lock(void)
 
        spin_lock(&nf_conntrack_locks_all_lock);
 
-       nf_conntrack_locks_all = true;
+       /* For nf_conntrack_locks_all, only the latest time when another
+        * CPU will see an update is controlled, by the "release" of the
+        * spin_lock below.
+        * The earliest time is not controlled, and thus KCSAN could detect
+        * a race when nf_conntrack_lock() reads the variable.
+        * WRITE_ONCE() is used to ensure the compiler will not
+        * optimize the write.
+        */
+       WRITE_ONCE(nf_conntrack_locks_all, true);
 
        for (i = 0; i < CONNTRACK_LOCKS; i++) {
                spin_lock(&nf_conntrack_locks[i]);
@@ -2457,7 +2465,6 @@ i_see_dead_people:
        }
 
        list_for_each_entry(net, net_exit_list, exit_list) {
-               nf_conntrack_proto_pernet_fini(net);
                nf_conntrack_ecache_pernet_fini(net);
                nf_conntrack_expect_pernet_fini(net);
                free_percpu(net->ct.stat);
index 4e1a9db..e81af33 100644 (file)
@@ -218,6 +218,7 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
        if (!help)
                return 0;
 
+       rcu_read_lock();
        helper = rcu_dereference(help->helper);
        if (!helper)
                goto out;
@@ -233,9 +234,11 @@ static int ctnetlink_dump_helpinfo(struct sk_buff *skb,
 
        nla_nest_end(skb, nest_helper);
 out:
+       rcu_read_unlock();
        return 0;
 
 nla_put_failure:
+       rcu_read_unlock();
        return -1;
 }
 
index 5564740..8f7a983 100644 (file)
@@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #endif
 }
 
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
-#ifdef CONFIG_NF_CT_PROTO_GRE
-       nf_ct_gre_keymap_flush(net);
-#endif
-}
-
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
                  &nf_conntrack_htable_size, 0600);
 
index db11e40..728eeb0 100644 (file)
@@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
        return &net->ct.nf_ct_proto.gre;
 }
 
-void nf_ct_gre_keymap_flush(struct net *net)
-{
-       struct nf_gre_net *net_gre = gre_pernet(net);
-       struct nf_ct_gre_keymap *km, *tmp;
-
-       spin_lock_bh(&keymap_lock);
-       list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
-               list_del_rcu(&km->list);
-               kfree_rcu(km, rcu);
-       }
-       spin_unlock_bh(&keymap_lock);
-}
-
 static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
                                const struct nf_conntrack_tuple *t)
 {
index f7e8baf..3259416 100644 (file)
@@ -823,6 +823,22 @@ static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
        return true;
 }
 
+static bool tcp_can_early_drop(const struct nf_conn *ct)
+{
+       switch (ct->proto.tcp.state) {
+       case TCP_CONNTRACK_FIN_WAIT:
+       case TCP_CONNTRACK_LAST_ACK:
+       case TCP_CONNTRACK_TIME_WAIT:
+       case TCP_CONNTRACK_CLOSE:
+       case TCP_CONNTRACK_CLOSE_WAIT:
+               return true;
+       default:
+               break;
+       }
+
+       return false;
+}
+
 /* Returns verdict for packet, or -1 for invalid. */
 int nf_conntrack_tcp_packet(struct nf_conn *ct,
                            struct sk_buff *skb,
@@ -1030,10 +1046,30 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
                if (index != TCP_RST_SET)
                        break;
 
-               if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) {
+               /* If we are closing, tuple might have been re-used already.
+                * last_index, last_ack, and all other ct fields used for
+                * sequence/window validation are outdated in that case.
+                *
+                * As the conntrack can already be expired by GC under pressure,
+                * just skip validation checks.
+                */
+               if (tcp_can_early_drop(ct))
+                       goto in_window;
+
+               /* td_maxack might be outdated if we let a SYN through earlier */
+               if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) &&
+                   ct->proto.tcp.last_index != TCP_SYN_SET) {
                        u32 seq = ntohl(th->seq);
 
-                       if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
+                       /* If we are not in established state and SEQ=0 this is most
+                        * likely an answer to a SYN we let go through above (last_index
+                        * can be updated due to out-of-order ACKs).
+                        */
+                       if (seq == 0 && !nf_conntrack_tcp_established(ct))
+                               break;
+
+                       if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+                           !tn->tcp_ignore_invalid_rst) {
                                /* Invalid RST  */
                                spin_unlock_bh(&ct->lock);
                                nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
@@ -1134,6 +1170,16 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
                        nf_ct_kill_acct(ct, ctinfo, skb);
                        return NF_ACCEPT;
                }
+
+               if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) {
+                       /* do not renew timeout on SYN retransmit.
+                        *
+                        * Else port reuse by client or NAT middlebox can keep
+                        * entry alive indefinitely (including nat info).
+                        */
+                       return NF_ACCEPT;
+               }
+
                /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
                 * pickup with loose=1. Avoid large ESTABLISHED timeout.
                 */
@@ -1155,22 +1201,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
        return NF_ACCEPT;
 }
 
-static bool tcp_can_early_drop(const struct nf_conn *ct)
-{
-       switch (ct->proto.tcp.state) {
-       case TCP_CONNTRACK_FIN_WAIT:
-       case TCP_CONNTRACK_LAST_ACK:
-       case TCP_CONNTRACK_TIME_WAIT:
-       case TCP_CONNTRACK_CLOSE:
-       case TCP_CONNTRACK_CLOSE_WAIT:
-               return true;
-       default:
-               break;
-       }
-
-       return false;
-}
-
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
@@ -1437,6 +1467,9 @@ void nf_conntrack_tcp_init_net(struct net *net)
         */
        tn->tcp_be_liberal = 0;
 
+       /* If it's non-zero, we turn off RST sequence number check */
+       tn->tcp_ignore_invalid_rst = 0;
+
        /* Max number of the retransmitted packets without receiving an (acceptable)
         * ACK from the destination. If this number is reached, a shorter timer
         * will be started.
index f57a951..214d9f9 100644 (file)
@@ -579,6 +579,7 @@ enum nf_ct_sysctl_index {
 #endif
        NF_SYSCTL_CT_PROTO_TCP_LOOSE,
        NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+       NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
        NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
        NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
        NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
@@ -798,6 +799,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
+       [NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+               .procname       = "nf_conntrack_tcp_ignore_invalid_rst",
+               .maxlen         = sizeof(u8),
+               .mode           = 0644,
+               .proc_handler   = proc_dou8vec_minmax,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_ONE,
+       },
        [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
                .procname       = "nf_conntrack_tcp_max_retrans",
                .maxlen         = sizeof(u8),
@@ -1004,6 +1013,7 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
        XASSIGN(LOOSE, &tn->tcp_loose);
        XASSIGN(LIBERAL, &tn->tcp_be_liberal);
        XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+       XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
 #undef XASSIGN
 
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
index 390d446..de182d1 100644 (file)
@@ -3446,7 +3446,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
        return 0;
 
 err_destroy_flow_rule:
-       nft_flow_rule_destroy(flow);
+       if (flow)
+               nft_flow_rule_destroy(flow);
 err_release_rule:
        nf_tables_rule_release(&ctx, rule);
 err_release_expr:
index 913ac45..8088b99 100644 (file)
@@ -23,15 +23,21 @@ static int nft_last_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 {
        struct nft_last_priv *priv = nft_expr_priv(expr);
        u64 last_jiffies;
+       u32 last_set = 0;
        int err;
 
-       if (tb[NFTA_LAST_MSECS]) {
+       if (tb[NFTA_LAST_SET]) {
+               last_set = ntohl(nla_get_be32(tb[NFTA_LAST_SET]));
+               if (last_set == 1)
+                       priv->last_set = 1;
+       }
+
+       if (last_set && tb[NFTA_LAST_MSECS]) {
                err = nf_msecs_to_jiffies64(tb[NFTA_LAST_MSECS], &last_jiffies);
                if (err < 0)
                        return err;
 
-               priv->last_jiffies = jiffies + (unsigned long)last_jiffies;
-               priv->last_set = 1;
+               priv->last_jiffies = jiffies - (unsigned long)last_jiffies;
        }
 
        return 0;
index d233ac4..380f95a 100644 (file)
@@ -2471,7 +2471,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
 
        nlmsg_end(skb, rep);
 
-       netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT);
+       nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);
 }
 EXPORT_SYMBOL(netlink_ack);
 
index c89c8da..d4a2db0 100644 (file)
@@ -670,13 +670,13 @@ static bool cmp_key(const struct sw_flow_key *key1,
 {
        const long *cp1 = (const long *)((const u8 *)key1 + key_start);
        const long *cp2 = (const long *)((const u8 *)key2 + key_start);
-       long diffs = 0;
        int i;
 
        for (i = key_start; i < key_end; i += sizeof(long))
-               diffs |= *cp1++ ^ *cp2++;
+               if (*cp1++ ^ *cp2++)
+                       return false;
 
-       return diffs == 0;
+       return true;
 }
 
 static bool flow_cmp_masked_key(const struct sw_flow *flow,
index a656baa..1b4b351 100644 (file)
@@ -322,11 +322,22 @@ err_alloc:
 
 static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
 {
+       struct flow_block_cb *block_cb, *tmp_cb;
        struct tcf_ct_flow_table *ct_ft;
+       struct flow_block *block;
 
        ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
                             rwork);
        nf_flow_table_free(&ct_ft->nf_ft);
+
+       /* Remove any remaining callbacks before cleanup */
+       block = &ct_ft->nf_ft.flow_block;
+       down_write(&ct_ft->nf_ft.flow_block_lock);
+       list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) {
+               list_del(&block_cb->list);
+               flow_block_cb_free(block_cb);
+       }
+       up_write(&ct_ft->nf_ft.flow_block_lock);
        kfree(ct_ft);
 
        module_put(THIS_MODULE);
@@ -1026,7 +1037,8 @@ do_nat:
                /* This will take care of sending queued events
                 * even if the connection is already confirmed.
                 */
-               nf_conntrack_confirm(skb);
+               if (nf_conntrack_confirm(skb) != NF_ACCEPT)
+                       goto drop;
        }
 
        if (!skip_add)
index 66fe2b8..07b30d0 100644 (file)
@@ -564,7 +564,7 @@ static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch)
        /* if there's no entry, it means that the schedule didn't
         * start yet, so force all gates to be open, this is in
         * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
-        * "AdminGateSates"
+        * "AdminGateStates"
         */
        gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
 
index 493fc01..760b367 100644 (file)
@@ -284,10 +284,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
                goto out;
        }
 
-       err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-                             MSG_DONTWAIT);
-       if (err > 0)
-               err = 0;
+       err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
        return err;
 }
index 3c1fbf3..ec0f525 100644 (file)
@@ -398,7 +398,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr)
                retval = SCTP_SCOPE_LINK;
        } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) ||
                   ipv4_is_private_172(addr->v4.sin_addr.s_addr) ||
-                  ipv4_is_private_192(addr->v4.sin_addr.s_addr)) {
+                  ipv4_is_private_192(addr->v4.sin_addr.s_addr) ||
+                  ipv4_is_test_198(addr->v4.sin_addr.s_addr)) {
                retval = SCTP_SCOPE_PRIVATE;
        } else {
                retval = SCTP_SCOPE_GLOBAL;
index 6c08e50..b8fa8f1 100644 (file)
@@ -1163,7 +1163,7 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
                                       const struct sctp_transport *transport,
                                       __u32 probe_size)
 {
-       struct sctp_sender_hb_info hbinfo;
+       struct sctp_sender_hb_info hbinfo = {};
        struct sctp_chunk *retval;
 
        retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0,
index 5f23804..397a624 100644 (file)
@@ -335,10 +335,13 @@ void sctp_transport_pl_recv(struct sctp_transport *t)
                        t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
                        sctp_assoc_sync_pmtu(t->asoc);
                }
-       } else if (t->pl.state == SCTP_PL_COMPLETE && ++t->pl.raise_count == 30) {
-               /* Raise probe_size again after 30 * interval in Search Complete */
-               t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
-               t->pl.probe_size += SCTP_PL_MIN_STEP;
+       } else if (t->pl.state == SCTP_PL_COMPLETE) {
+               t->pl.raise_count++;
+               if (t->pl.raise_count == 30) {
+                       /* Raise probe_size again after 30 * interval in Search Complete */
+                       t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
+                       t->pl.probe_size += SCTP_PL_MIN_STEP;
+               }
        }
 }
 
index bd9233d..0b2dad3 100644 (file)
 #include <linux/sockios.h>
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
+#include <linux/ptp_clock_kernel.h>
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
 unsigned int sysctl_net_busy_read __read_mostly;
@@ -873,12 +874,18 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
                empty = 0;
        if (shhwtstamps &&
            (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
-           !skb_is_swtx_tstamp(skb, false_tstamp) &&
-           ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) {
-               empty = 0;
-               if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
-                   !skb_is_err_queue(skb))
-                       put_ts_pktinfo(msg, skb);
+           !skb_is_swtx_tstamp(skb, false_tstamp)) {
+               if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC)
+                       ptp_convert_timestamp(shhwtstamps, sk->sk_bind_phc);
+
+               if (ktime_to_timespec64_cond(shhwtstamps->hwtstamp,
+                                            tss.ts + 2)) {
+                       empty = 0;
+
+                       if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
+                           !skb_is_err_queue(skb))
+                               put_ts_pktinfo(msg, skb);
+               }
        }
        if (!empty) {
                if (sock_flag(sk, SOCK_TSTAMP_NEW))
index 9ff64f9..7e7d7f4 100644 (file)
@@ -295,10 +295,8 @@ again:
 
                goto again;
        }
-       err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-                             MSG_DONTWAIT);
-       if (err > 0)
-               err = 0;
+       err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
 out:
        if (sk)
                sock_put(sk);
index 520434e..036998d 100644 (file)
@@ -331,6 +331,7 @@ $(obj)/%.o: $(src)/%.c
                -Wno-gnu-variable-sized-type-not-at-end \
                -Wno-address-of-packed-member -Wno-tautological-compare \
                -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
+               -fno-asynchronous-unwind-tables \
                -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
                -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
                $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
index 53e300f..33d0bde 100644 (file)
@@ -96,6 +96,7 @@ static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
 static int opt_timeout = 1000;
 static bool opt_need_wakeup = true;
 static u32 opt_num_xsks = 1;
+static u32 prog_id;
 static bool opt_busy_poll;
 static bool opt_reduced_cap;
 
@@ -461,6 +462,23 @@ static void *poller(void *arg)
        return NULL;
 }
 
+static void remove_xdp_program(void)
+{
+       u32 curr_prog_id = 0;
+
+       if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
+               printf("bpf_get_link_xdp_id failed\n");
+               exit(EXIT_FAILURE);
+       }
+
+       if (prog_id == curr_prog_id)
+               bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+       else if (!curr_prog_id)
+               printf("couldn't find a prog id on a given interface\n");
+       else
+               printf("program on interface changed, not removing\n");
+}
+
 static void int_exit(int sig)
 {
        benchmark_done = true;
@@ -471,6 +489,9 @@ static void __exit_with_error(int error, const char *file, const char *func,
 {
        fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
                line, error, strerror(error));
+
+       if (opt_num_xsks > 1)
+               remove_xdp_program();
        exit(EXIT_FAILURE);
 }
 
@@ -490,6 +511,9 @@ static void xdpsock_cleanup(void)
                if (write(sock, &cmd, sizeof(int)) < 0)
                        exit_with_error(errno);
        }
+
+       if (opt_num_xsks > 1)
+               remove_xdp_program();
 }
 
 static void swap_mac_addresses(void *data)
@@ -857,6 +881,10 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
        if (ret)
                exit_with_error(-ret);
 
+       ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
+       if (ret)
+               exit_with_error(-ret);
+
        xsk->app_stats.rx_empty_polls = 0;
        xsk->app_stats.fill_fail_polls = 0;
        xsk->app_stats.copy_tx_sendtos = 0;
index 2b758a1..5b8a274 100644 (file)
@@ -341,6 +341,7 @@ static int set_mtkaif_rx(struct mtk_base_afe *afe)
        case MT8183_MTKAIF_PROTOCOL_1:
                regmap_write(afe->regmap, AFE_AUD_PAD_TOP, 0x31);
                regmap_write(afe->regmap, AFE_ADDA_MTKAIF_CFG0, 0x0);
+               break;
        default:
                break;
        }
index 39bb322..b11cfc8 100644 (file)
@@ -97,7 +97,7 @@ clean: bpftool_clean runqslower_clean resolve_btfids_clean
        $(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpf
        $(Q)$(RM) -r -- $(OUTPUT)feature
 
-install: $(PROGS) bpftool_install runqslower_install
+install: $(PROGS) bpftool_install
        $(call QUIET_INSTALL, bpf_jit_disasm)
        $(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/bin
        $(Q)$(INSTALL) $(OUTPUT)bpf_jit_disasm $(DESTDIR)$(prefix)/bin/bpf_jit_disasm
@@ -118,9 +118,6 @@ bpftool_clean:
 runqslower:
        $(call descend,runqslower)
 
-runqslower_install:
-       $(call descend,runqslower,install)
-
 runqslower_clean:
        $(call descend,runqslower,clean)
 
@@ -131,5 +128,5 @@ resolve_btfids_clean:
        $(call descend,resolve_btfids,clean)
 
 .PHONY: all install clean bpftool bpftool_install bpftool_clean \
-       runqslower runqslower_install runqslower_clean \
+       runqslower runqslower_clean \
        resolve_btfids resolve_btfids_clean
index e7e7eee..24734f2 100644 (file)
@@ -43,11 +43,13 @@ static int fprintf_json(void *out, const char *fmt, ...)
 {
        va_list ap;
        char *s;
+       int err;
 
        va_start(ap, fmt);
-       if (vasprintf(&s, fmt, ap) < 0)
-               return -1;
+       err = vasprintf(&s, fmt, ap);
        va_end(ap);
+       if (err < 0)
+               return -1;
 
        if (!oper_count) {
                int i;
index 645530c..ab9353f 100644 (file)
@@ -74,7 +74,7 @@ int handle__sched_switch(u64 *ctx)
        u32 pid;
 
        /* ivcsw: treat like an enqueue event and store timestamp */
-       if (prev->state == TASK_RUNNING)
+       if (prev->__state == TASK_RUNNING)
                trace_enqueue(prev);
 
        pid = next->pid;
index 1e04ce7..6f5e275 100644 (file)
@@ -10136,7 +10136,7 @@ int bpf_link__unpin(struct bpf_link *link)
 
        err = unlink(link->pin_path);
        if (err != 0)
-               return libbpf_err_errno(err);
+               return -errno;
 
        pr_debug("link fd=%d: unpinned from %s\n", link->fd, link->pin_path);
        zfree(&link->pin_path);
@@ -11197,7 +11197,7 @@ int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms)
 
        cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, timeout_ms);
        if (cnt < 0)
-               return libbpf_err_errno(cnt);
+               return -errno;
 
        for (i = 0; i < cnt; i++) {
                struct perf_cpu_buf *cpu_buf = pb->events[i].data.ptr;
index ee27d68..b5940e6 100644 (file)
@@ -715,6 +715,8 @@ out:
        bpf_object__close(obj);
 }
 
+#include "tailcall_bpf2bpf4.skel.h"
+
 /* test_tailcall_bpf2bpf_4 checks that tailcall counter is correctly preserved
  * across tailcalls combined with bpf2bpf calls. for making sure that tailcall
  * counter behaves correctly, bpf program will go through following flow:
@@ -727,10 +729,15 @@ out:
  * the loop begins. At the end of the test make sure that the global counter is
  * equal to 31, because tailcall counter includes the first two tailcalls
  * whereas global counter is incremented only on loop presented on flow above.
+ *
+ * The noise parameter is used to insert bpf_map_update calls into the logic
+ * to force verifier to patch instructions. This allows us to ensure jump
+ * logic remains correct with instruction movement.
  */
-static void test_tailcall_bpf2bpf_4(void)
+static void test_tailcall_bpf2bpf_4(bool noise)
 {
-       int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+       int err, map_fd, prog_fd, main_fd, data_fd, i;
+       struct tailcall_bpf2bpf4__bss val;
        struct bpf_map *prog_array, *data_map;
        struct bpf_program *prog;
        struct bpf_object *obj;
@@ -774,11 +781,6 @@ static void test_tailcall_bpf2bpf_4(void)
                        goto out;
        }
 
-       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
-                               &duration, &retval, NULL);
-       CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n",
-             err, errno, retval);
-
        data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
        if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
                return;
@@ -788,9 +790,21 @@ static void test_tailcall_bpf2bpf_4(void)
                return;
 
        i = 0;
+       val.noise = noise;
+       val.count = 0;
+       err = bpf_map_update_elem(data_fd, &i, &val, BPF_ANY);
+       if (CHECK_FAIL(err))
+               goto out;
+
+       err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+                               &duration, &retval, NULL);
+       CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n",
+             err, errno, retval);
+
+       i = 0;
        err = bpf_map_lookup_elem(data_fd, &i, &val);
-       CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n",
-             err, errno, val);
+       CHECK(err || val.count != 31, "tailcall count", "err %d errno %d count %d\n",
+             err, errno, val.count);
 
 out:
        bpf_object__close(obj);
@@ -815,5 +829,7 @@ void test_tailcalls(void)
        if (test__start_subtest("tailcall_bpf2bpf_3"))
                test_tailcall_bpf2bpf_3();
        if (test__start_subtest("tailcall_bpf2bpf_4"))
-               test_tailcall_bpf2bpf_4();
+               test_tailcall_bpf2bpf_4(false);
+       if (test__start_subtest("tailcall_bpf2bpf_5"))
+               test_tailcall_bpf2bpf_4(true);
 }
index 77df6d4..e89368a 100644 (file)
@@ -3,6 +3,13 @@
 #include <bpf/bpf_helpers.h>
 
 struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 1);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(__u32));
+} nop_table SEC(".maps");
+
+struct {
        __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
        __uint(max_entries, 3);
        __uint(key_size, sizeof(__u32));
@@ -10,10 +17,21 @@ struct {
 } jmp_table SEC(".maps");
 
 int count = 0;
+int noise = 0;
+
+__always_inline int subprog_noise(void)
+{
+       __u32 key = 0;
+
+       bpf_map_lookup_elem(&nop_table, &key);
+       return 0;
+}
 
 __noinline
 int subprog_tail_2(struct __sk_buff *skb)
 {
+       if (noise)
+               subprog_noise();
        bpf_tail_call_static(skb, &jmp_table, 2);
        return skb->len * 3;
 }
index 615ab25..010b59b 100644 (file)
@@ -45,6 +45,7 @@ enum vm_guest_mode {
        VM_MODE_P40V48_64K,
        VM_MODE_PXXV48_4K,      /* For 48bits VA but ANY bits PA */
        VM_MODE_P47V64_4K,
+       VM_MODE_P44V64_4K,
        NUM_VM_MODES,
 };
 
@@ -62,7 +63,7 @@ enum vm_guest_mode {
 
 #elif defined(__s390x__)
 
-#define VM_MODE_DEFAULT                        VM_MODE_P47V64_4K
+#define VM_MODE_DEFAULT                        VM_MODE_P44V64_4K
 #define MIN_PAGE_SHIFT                 12U
 #define ptes_per_page(page_size)       ((page_size) / 16)
 
index 9f49f6c..632b74d 100644 (file)
@@ -401,7 +401,7 @@ unexpected_exception:
 void vm_init_descriptor_tables(struct kvm_vm *vm)
 {
        vm->handlers = vm_vaddr_alloc(vm, sizeof(struct handlers),
-                       vm->page_size, 0, 0);
+                       vm->page_size);
 
        *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
 }
index 25bff30..c330f41 100644 (file)
@@ -22,6 +22,22 @@ void guest_modes_append_default(void)
                }
        }
 #endif
+#ifdef __s390x__
+       {
+               int kvm_fd, vm_fd;
+               struct kvm_s390_vm_cpu_processor info;
+
+               kvm_fd = open_kvm_dev_path_or_exit();
+               vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
+               kvm_device_access(vm_fd, KVM_S390_VM_CPU_MODEL,
+                                 KVM_S390_VM_CPU_PROCESSOR, &info, false);
+               close(vm_fd);
+               close(kvm_fd);
+               /* Starting with z13 we have 47bits of physical address */
+               if (info.ibc >= 0x30)
+                       guest_mode_append(VM_MODE_P47V64_4K, true, true);
+       }
+#endif
 }
 
 void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg)
index 5b56b57..10a8ed6 100644 (file)
@@ -176,6 +176,7 @@ const char *vm_guest_mode_string(uint32_t i)
                [VM_MODE_P40V48_64K]    = "PA-bits:40,  VA-bits:48, 64K pages",
                [VM_MODE_PXXV48_4K]     = "PA-bits:ANY, VA-bits:48,  4K pages",
                [VM_MODE_P47V64_4K]     = "PA-bits:47,  VA-bits:64,  4K pages",
+               [VM_MODE_P44V64_4K]     = "PA-bits:44,  VA-bits:64,  4K pages",
        };
        _Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
                       "Missing new mode strings?");
@@ -194,6 +195,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
        { 40, 48, 0x10000, 16 },
        {  0,  0,  0x1000, 12 },
        { 47, 64,  0x1000, 12 },
+       { 44, 64,  0x1000, 12 },
 };
 _Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
               "Missing new mode params?");
@@ -282,6 +284,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
        case VM_MODE_P47V64_4K:
                vm->pgtable_levels = 5;
                break;
+       case VM_MODE_P44V64_4K:
+               vm->pgtable_levels = 5;
+               break;
        default:
                TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
        }
index 85b18bb..72a1c9b 100644 (file)
@@ -377,7 +377,8 @@ static void test_add_max_memory_regions(void)
                (max_mem_slots - 1), MEM_REGION_SIZE >> 10);
 
        mem = mmap(NULL, (size_t)max_mem_slots * MEM_REGION_SIZE + alignment,
-                  PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+                  PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
        TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
        mem_aligned = (void *)(((size_t) mem + alignment - 1) & ~(alignment - 1));
 
index 42bd658..af27c7e 100644 (file)
@@ -615,7 +615,7 @@ int main(void)
 
        vm_init_descriptor_tables(vm);
        vcpu_init_descriptor_tables(vm, VCPU_ID);
-       vm_handle_exception(vm, GP_VECTOR, guest_gp_handler);
+       vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
 
        pr_info("Testing access to Hyper-V specific MSRs\n");
        guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva),
index 523371c..da2325f 100644 (file)
@@ -71,7 +71,7 @@ static void mmu_role_test(u32 *cpuid_reg, u32 evil_cpuid_val)
        /* Set up a #PF handler to eat the RSVD #PF and signal all done! */
        vm_init_descriptor_tables(vm);
        vcpu_init_descriptor_tables(vm, VCPU_ID);
-       vm_handle_exception(vm, PF_VECTOR, guest_pf_handler);
+       vm_install_exception_handler(vm, PF_VECTOR, guest_pf_handler);
 
        r = _vcpu_run(vm, VCPU_ID);
        TEST_ASSERT(r == 0, "vcpu_run failed: %d\n", r);
index c1f8318..d0fe2fd 100644 (file)
@@ -53,15 +53,28 @@ static inline void sync_with_host(uint64_t phase)
                     : "+a" (phase));
 }
 
-void self_smi(void)
+static void self_smi(void)
 {
        x2apic_write_reg(APIC_ICR,
                         APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI);
 }
 
-void guest_code(void *arg)
+static void l2_guest_code(void)
 {
+       sync_with_host(8);
+
+       sync_with_host(10);
+
+       vmcall();
+}
+
+static void guest_code(void *arg)
+{
+       #define L2_GUEST_STACK_SIZE 64
+       unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
        uint64_t apicbase = rdmsr(MSR_IA32_APICBASE);
+       struct svm_test_data *svm = arg;
+       struct vmx_pages *vmx_pages = arg;
 
        sync_with_host(1);
 
@@ -74,21 +87,50 @@ void guest_code(void *arg)
        sync_with_host(4);
 
        if (arg) {
-               if (cpu_has_svm())
-                       generic_svm_setup(arg, NULL, NULL);
-               else
-                       GUEST_ASSERT(prepare_for_vmx_operation(arg));
+               if (cpu_has_svm()) {
+                       generic_svm_setup(svm, l2_guest_code,
+                                         &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+               } else {
+                       GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+                       GUEST_ASSERT(load_vmcs(vmx_pages));
+                       prepare_vmcs(vmx_pages, l2_guest_code,
+                                    &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+               }
 
                sync_with_host(5);
 
                self_smi();
 
                sync_with_host(7);
+
+               if (cpu_has_svm()) {
+                       run_guest(svm->vmcb, svm->vmcb_gpa);
+                       svm->vmcb->save.rip += 3;
+                       run_guest(svm->vmcb, svm->vmcb_gpa);
+               } else {
+                       vmlaunch();
+                       vmresume();
+               }
+
+               /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */
+               sync_with_host(12);
        }
 
        sync_with_host(DONE);
 }
 
+void inject_smi(struct kvm_vm *vm)
+{
+       struct kvm_vcpu_events events;
+
+       vcpu_events_get(vm, VCPU_ID, &events);
+
+       events.smi.pending = 1;
+       events.flags |= KVM_VCPUEVENT_VALID_SMM;
+
+       vcpu_events_set(vm, VCPU_ID, &events);
+}
+
 int main(int argc, char *argv[])
 {
        vm_vaddr_t nested_gva = 0;
@@ -147,6 +189,22 @@ int main(int argc, char *argv[])
                            "Unexpected stage: #%x, got %x",
                            stage, stage_reported);
 
+               /*
+                * Enter SMM during L2 execution and check that we correctly
+                * return from it. Do not perform save/restore while in SMM yet.
+                */
+               if (stage == 8) {
+                       inject_smi(vm);
+                       continue;
+               }
+
+               /*
+                * Perform save/restore while the guest is in SMM triggered
+                * during L2 execution.
+                */
+               if (stage == 10)
+                       inject_smi(vm);
+
                state = vcpu_save_state(vm, VCPU_ID);
                kvm_vm_release(vm);
                kvm_vm_restart(vm, O_RDWR);
index c19ecc6..ecbf57f 100755 (executable)
@@ -313,9 +313,10 @@ check_exception()
        fi
        log_test $? 0 "IPv4: ${desc}"
 
-       if [ "$with_redirect" = "yes" ]; then
+       # No PMTU info for test "redirect" and "mtu exception plus redirect"
+       if [ "$with_redirect" = "yes" ] && [ "$desc" != "redirect exception plus mtu" ]; then
                ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
-               grep -q "${H2_N2_IP6} from :: via ${R2_LLADDR} dev br0.*${mtu}"
+               grep -v "mtu" | grep -q "${H2_N2_IP6} .*via ${R2_LLADDR} dev br0"
        elif [ -n "${mtu}" ]; then
                ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
                grep -q "${mtu}"
index 9a191c1..f02f4de 100755 (executable)
@@ -1409,7 +1409,7 @@ syncookies_tests()
        ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
        ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
        run_tests $ns1 $ns2 10.0.1.1
-       chk_join_nr "subflows limited by server w cookies" 2 2 1
+       chk_join_nr "subflows limited by server w cookies" 2 1 1
 
        # test signal address with cookies
        reset_with_cookies
index 21091be..aee631c 100644 (file)
@@ -47,7 +47,7 @@ static void usage(const char *error)
 {
        if (error)
                printf("invalid option: %s\n", error);
-       printf("timestamping interface option*\n\n"
+       printf("timestamping <interface> [bind_phc_index] [option]*\n\n"
               "Options:\n"
               "  IP_MULTICAST_LOOP - looping outgoing multicasts\n"
               "  SO_TIMESTAMP - normal software time stamping, ms resolution\n"
@@ -58,6 +58,7 @@ static void usage(const char *error)
               "  SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n"
               "  SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n"
               "  SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n"
+              "  SOF_TIMESTAMPING_BIND_PHC - request to bind a PHC of PTP vclock\n"
               "  SIOCGSTAMP - check last socket time stamp\n"
               "  SIOCGSTAMPNS - more accurate socket time stamp\n"
               "  PTPV2 - use PTPv2 messages\n");
@@ -311,7 +312,6 @@ static void recvpacket(int sock, int recvmsg_flags,
 
 int main(int argc, char **argv)
 {
-       int so_timestamping_flags = 0;
        int so_timestamp = 0;
        int so_timestampns = 0;
        int siocgstamp = 0;
@@ -325,6 +325,8 @@ int main(int argc, char **argv)
        struct ifreq device;
        struct ifreq hwtstamp;
        struct hwtstamp_config hwconfig, hwconfig_requested;
+       struct so_timestamping so_timestamping_get = { 0, -1 };
+       struct so_timestamping so_timestamping = { 0, -1 };
        struct sockaddr_in addr;
        struct ip_mreq imr;
        struct in_addr iaddr;
@@ -342,7 +344,12 @@ int main(int argc, char **argv)
                exit(1);
        }
 
-       for (i = 2; i < argc; i++) {
+       if (argc >= 3 && sscanf(argv[2], "%d", &so_timestamping.bind_phc) == 1)
+               val = 3;
+       else
+               val = 2;
+
+       for (i = val; i < argc; i++) {
                if (!strcasecmp(argv[i], "SO_TIMESTAMP"))
                        so_timestamp = 1;
                else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS"))
@@ -356,17 +363,19 @@ int main(int argc, char **argv)
                else if (!strcasecmp(argv[i], "PTPV2"))
                        ptpv2 = 1;
                else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE"))
-                       so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE;
+                       so_timestamping.flags |= SOF_TIMESTAMPING_TX_HARDWARE;
                else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE"))
-                       so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
+                       so_timestamping.flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
                else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE"))
-                       so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE;
+                       so_timestamping.flags |= SOF_TIMESTAMPING_RX_HARDWARE;
                else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE"))
-                       so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
+                       so_timestamping.flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
                else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE"))
-                       so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE;
+                       so_timestamping.flags |= SOF_TIMESTAMPING_SOFTWARE;
                else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE"))
-                       so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
+                       so_timestamping.flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
+               else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_BIND_PHC"))
+                       so_timestamping.flags |= SOF_TIMESTAMPING_BIND_PHC;
                else
                        usage(argv[i]);
        }
@@ -385,10 +394,10 @@ int main(int argc, char **argv)
        hwtstamp.ifr_data = (void *)&hwconfig;
        memset(&hwconfig, 0, sizeof(hwconfig));
        hwconfig.tx_type =
-               (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
+               (so_timestamping.flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
                HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
        hwconfig.rx_filter =
-               (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
+               (so_timestamping.flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
                ptpv2 ? HWTSTAMP_FILTER_PTP_V2_L4_SYNC :
                HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE;
        hwconfig_requested = hwconfig;
@@ -413,6 +422,9 @@ int main(int argc, char **argv)
                 sizeof(struct sockaddr_in)) < 0)
                bail("bind");
 
+       if (setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, interface, if_len))
+               bail("bind device");
+
        /* set multicast group for outgoing packets */
        inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */
        addr.sin_addr = iaddr;
@@ -444,10 +456,9 @@ int main(int argc, char **argv)
                           &enabled, sizeof(enabled)) < 0)
                bail("setsockopt SO_TIMESTAMPNS");
 
-       if (so_timestamping_flags &&
-               setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
-                          &so_timestamping_flags,
-                          sizeof(so_timestamping_flags)) < 0)
+       if (so_timestamping.flags &&
+           setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &so_timestamping,
+                      sizeof(so_timestamping)) < 0)
                bail("setsockopt SO_TIMESTAMPING");
 
        /* request IP_PKTINFO for debugging purposes */
@@ -468,14 +479,18 @@ int main(int argc, char **argv)
        else
                printf("SO_TIMESTAMPNS %d\n", val);
 
-       if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) {
+       len = sizeof(so_timestamping_get);
+       if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &so_timestamping_get,
+                      &len) < 0) {
                printf("%s: %s\n", "getsockopt SO_TIMESTAMPING",
                       strerror(errno));
        } else {
-               printf("SO_TIMESTAMPING %d\n", val);
-               if (val != so_timestamping_flags)
-                       printf("   not the expected value %d\n",
-                              so_timestamping_flags);
+               printf("SO_TIMESTAMPING flags %d, bind phc %d\n",
+                      so_timestamping_get.flags, so_timestamping_get.bind_phc);
+               if (so_timestamping_get.flags != so_timestamping.flags ||
+                   so_timestamping_get.bind_phc != so_timestamping.bind_phc)
+                       printf("   not expected, flags %d, bind phc %d\n",
+                              so_timestamping.flags, so_timestamping.bind_phc);
        }
 
        /* send packets forever every five seconds */
index cd6430b..8748199 100644 (file)
@@ -5,7 +5,7 @@ TEST_PROGS := nft_trans_stress.sh nft_fib.sh nft_nat.sh bridge_brouter.sh \
        conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
        nft_concat_range.sh nft_conntrack_helper.sh \
        nft_queue.sh nft_meta.sh nf_nat_edemux.sh \
-       ipip-conntrack-mtu.sh
+       ipip-conntrack-mtu.sh conntrack_tcp_unreplied.sh
 
 LDLIBS = -lmnl
 TEST_GEN_FILES =  nf-queue
diff --git a/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh b/tools/testing/selftests/netfilter/conntrack_tcp_unreplied.sh
new file mode 100755 (executable)
index 0000000..e7d7bf1
--- /dev/null
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that UNREPLIED tcp conntrack will eventually timeout.
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0   # exit status of this script; set to 1 when the final counter check fails
+
+waittime=20   # number of one-second polls for the redirected conntrack entry
+sfx=$(mktemp -u "XXXXXXXX")   # random suffix; mktemp -u only prints a name, nothing is created
+ns1="ns1-$sfx"   # namespace that runs the connecting nc client
+ns2="ns2-$sfx"   # namespace with the nc listener, the nft rules and the conntrack checks
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+       echo "SKIP: Could not run test without nft tool"
+       exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+       echo "SKIP: Could not run test without ip tool"
+       exit $ksft_skip
+fi
+
+cleanup() {
+       ip netns pids $ns1 | xargs kill 2>/dev/null
+       ip netns pids $ns2 | xargs kill 2>/dev/null
+
+       ip netns del $ns1
+       ip netns del $ns2
+}
+
+ipv4() {   # prints 192.168.<$1>.2 with no trailing newline; never called in this script
+    echo -n 192.168.$1.2
+}
+
+check_counter()
+{
+       # Check that nft counter $2 (table inet filter) in netns $1 lists the text $3.
+       # Returns 0 on a match; otherwise dumps the counter to stderr and returns 1.
+       local ns=$1
+       local name=$2
+       local expect=$3
+
+       if ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then
+               return 0
+       fi
+
+       echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2
+       ip netns exec "$ns" nft list counter inet filter "$name" 1>&2
+       return 1
+}
+
+# Create test namespaces
+ip netns add $ns1 || exit 1
+
+trap cleanup EXIT   # installed before $ns2 exists, so cleanup also runs if the next command fails
+
+ip netns add $ns2 || exit 1
+
+# Connect the two namespaces with a veth pair (veth1 in $ns1, veth2 in $ns2)
+ip -net $ns1 link add name veth1 type veth peer name veth2
+ip -net $ns1 link set netns $ns2 dev veth2   # move the peer end into $ns2
+
+ip -net $ns1 link set up dev lo
+ip -net $ns2 link set up dev lo
+ip -net $ns1 link set up dev veth1
+ip -net $ns2 link set up dev veth2
+
+ip -net $ns2 addr add 10.11.11.2/24 dev veth2
+ip -net $ns2 route add default via 10.11.11.1
+
+ip netns exec $ns2 sysctl -q net.ipv4.conf.veth2.forwarding=1
+
+# add a rule inside NS so we enable conntrack
+ip netns exec $ns1 iptables -A INPUT -m state --state established,related -j ACCEPT
+
+ip -net $ns1 addr add 10.11.11.1/24 dev veth1
+ip -net $ns1 route add 10.99.99.99 via 10.11.11.2   # 10.99.99.99 is never assigned to an interface; it is only routed towards $ns2
+
+# Check connectivity works
+ip netns exec $ns1 ping -q -c 2 10.11.11.2 >/dev/null || exit 1
+
+ip netns exec $ns2 nc -l -p 8080 < /dev/null &   # background listener on the port the nat rule added later redirects to
+
+# Count new SYNs to the virtual ip ("connreq") and new dnat'ed connections to port 8080 ("redir")
+
+ip netns exec $ns2 nft -f - <<EOF
+table inet filter {
+       counter connreq { }
+       counter redir { }
+       chain input {
+               type filter hook input priority 0; policy accept;
+               ct state new tcp flags syn ip daddr 10.99.99.99 tcp dport 80 counter name "connreq" accept
+               ct state new ct status dnat tcp dport 8080 counter name "redir" accept
+       }
+}
+EOF
+if [ $? -ne 0 ]; then   # exit status of the nft -f command above
+       echo "ERROR: Could not load nft rules"
+       exit 1
+fi
+
+ip netns exec $ns2 sysctl -q net.netfilter.nf_conntrack_tcp_timeout_syn_sent=10   # presumably kept below $waittime so the unreplied entry can expire during the wait loop
+
+echo "INFO: connect $ns1 -> $ns2 to the virtual ip"
+ip netns exec $ns1 bash -c 'while true ; do
+       nc -p 60000 10.99.99.99 80
+       sleep 1
+       done' &
+
+sleep 1   # presumably lets the background client send its first SYN before the nat rule exists
+
+ip netns exec $ns2 nft -f - <<EOF
+table inet nat {
+       chain prerouting {
+               type nat hook prerouting priority 0; policy accept;
+               ip daddr 10.99.99.99 tcp dport 80 redirect to :8080
+       }
+}
+EOF
+if [ $? -ne 0 ]; then   # exit status of the nft -f command above
+       echo "ERROR: Could not load nat redirect"
+       exit 1
+fi
+
+count=$(ip netns exec $ns2 conntrack -L -p tcp --dport 80 2>/dev/null | wc -l)
+if [ $count -eq 0 ]; then
+       echo "ERROR: $ns2 did not pick up tcp connection from peer"
+       exit 1
+fi
+
+echo "INFO: NAT redirect added in ns $ns2, waiting for $waittime seconds for nat to take effect"
+for i in $(seq 1 $waittime); do
+       echo -n "."
+
+       sleep 1
+
+       count=$(ip netns exec $ns2 conntrack -L -p tcp --reply-port-src 8080 2>/dev/null | wc -l)
+       if [ $count -gt 0 ]; then
+               echo
+               echo "PASS: redirection took effect after $i seconds"
+               break
+       fi
+
+       m=$((i%20))
+       if [ $m -eq 0 ]; then
+               echo " waited for $i seconds"
+       fi
+done
+
+expect="packets 1 bytes 60"   # text check_counter greps for in the counter listing (substring match)
+check_counter "$ns2" "redir" "$expect"
+if [ $? -ne 0 ]; then
+       ret=1
+fi
+
+if [ $ret -eq 0 ];then
+       echo "PASS: redirection counter has expected values"
+else
+       echo "ERROR: no tcp connection was redirected"
+fi
+
+exit $ret   # 0 on success, 1 when the redir counter did not match
index f08f5e8..0be80c2 100644 (file)
@@ -186,7 +186,6 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                    coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
                        r = kvm_io_bus_unregister_dev(kvm,
                                zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
-                       kvm_iodevice_destructor(&dev->dev);
 
                        /*
                         * On failure, unregister destroys all devices on the
@@ -196,6 +195,7 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                         */
                        if (r)
                                break;
+                       kvm_iodevice_destructor(&dev->dev);
                }
        }
 
index 7d95126..9869598 100644 (file)
@@ -935,7 +935,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
                stat_data->kvm = kvm;
                stat_data->desc = pdesc;
                stat_data->kind = KVM_STAT_VCPU;
-               kvm->debugfs_stat_data[i] = stat_data;
+               kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);