Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

author David S. Miller <davem@davemloft.net>

Thu, 23 May 2019 21:45:36 +0000 (14:45 -0700)

committer David S. Miller <davem@davemloft.net>

Thu, 23 May 2019 21:45:36 +0000 (14:45 -0700)
author David S. Miller <davem@davemloft.net>
Thu, 23 May 2019 21:45:36 +0000 (14:45 -0700)
committer David S. Miller <davem@davemloft.net>
Thu, 23 May 2019 21:45:36 +0000 (14:45 -0700)
diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h

index a88f927..e4c4d8e 100644 (file)
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -34,5 +34,5 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
                    const struct nft_pktinfo *pkt);
  
  void nft_fib_store_result(void *reg, const struct nft_fib *priv,
-                         const struct nft_pktinfo *pkt, int index);
+                         const struct net_device *dev);
  #endif
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c

index 94eb25b..c8888e5 100644 (file)
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -58,11 +58,6 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
  }
  EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
  
-static int get_ifindex(const struct net_device *dev)
-{
-       return dev ? dev->ifindex : 0;
-}
-
  void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
                    const struct nft_pktinfo *pkt)
  {
@@ -94,8 +89,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
  
         if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
             nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
-               nft_fib_store_result(dest, priv, pkt,
-                                    nft_in(pkt)->ifindex);
+               nft_fib_store_result(dest, priv, nft_in(pkt));
                 return;
         }
  
@@ -108,8 +102,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
         if (ipv4_is_zeronet(iph->saddr)) {
                 if (ipv4_is_lbcast(iph->daddr) ||
                     ipv4_is_local_multicast(iph->daddr)) {
-                       nft_fib_store_result(dest, priv, pkt,
-                                            get_ifindex(pkt->skb->dev));
+                       nft_fib_store_result(dest, priv, pkt->skb->dev);
                         return;
                 }
         }
@@ -150,17 +143,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
                 found = oif;
         }
  
-       switch (priv->result) {
-       case NFT_FIB_RESULT_OIF:
-               *dest = found->ifindex;
-               break;
-       case NFT_FIB_RESULT_OIFNAME:
-               strncpy((char *)dest, found->name, IFNAMSIZ);
-               break;
-       default:
-               WARN_ON_ONCE(1);
-               break;
-       }
+       nft_fib_store_result(dest, priv, found);
  }
  EXPORT_SYMBOL_GPL(nft_fib4_eval);
  
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c

index 73cdc0b..ec068b0 100644 (file)
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -169,8 +169,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
  
         if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
             nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
-               nft_fib_store_result(dest, priv, pkt,
-                                    nft_in(pkt)->ifindex);
+               nft_fib_store_result(dest, priv, nft_in(pkt));
                 return;
         }
  
@@ -187,18 +186,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
         if (oif && oif != rt->rt6i_idev->dev)
                 goto put_rt_err;
  
-       switch (priv->result) {
-       case NFT_FIB_RESULT_OIF:
-               *dest = rt->rt6i_idev->dev->ifindex;
-               break;
-       case NFT_FIB_RESULT_OIFNAME:
-               strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ);
-               break;
-       default:
-               WARN_ON_ONCE(1);
-               break;
-       }
-
+       nft_fib_store_result(dest, priv, rt->rt6i_idev->dev);
   put_rt_err:
         ip6_rt_put(rt);
  }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c

index 1445755..8ebf211 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2312,7 +2312,6 @@ static void __net_exit __ip_vs_cleanup(struct net *net)
  {
         struct netns_ipvs *ipvs = net_ipvs(net);
  
-       nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
         ip_vs_service_net_cleanup(ipvs);        /* ip_vs_flush() with locks */
         ip_vs_conn_net_cleanup(ipvs);
         ip_vs_app_net_cleanup(ipvs);
@@ -2327,6 +2326,7 @@ static void __net_exit __ip_vs_dev_cleanup(struct net *net)
  {
         struct netns_ipvs *ipvs = net_ipvs(net);
         EnterFunction(2);
+       nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
         ipvs->enable = 0;       /* Disable packet reception */
         smp_wmb();
         ip_vs_sync_net_cleanup(ipvs);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c

index 96825e2..2413174 100644 (file)
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -244,8 +244,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
         rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
         outdev = rt->dst.dev;
  
-       if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
-           (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
+       if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
                 return NF_ACCEPT;
  
         if (skb_try_make_writable(skb, sizeof(*iph)))
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c

index ccc06f7..53aeb12 100644 (file)
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -170,7 +170,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
         if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
                 return true;
  
-       nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_TCP,
+       nf_nat_csum_recalc(skb, nf_ct_l3num(ct), IPPROTO_UDP,
                            udph, &udph->check, datalen, oldlen);
  
         return true;
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c

index 9dc1d6e..b5b2be5 100644 (file)
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -255,6 +255,7 @@ static unsigned int nf_iterate(struct sk_buff *skb,
  repeat:
                 verdict = nf_hook_entry_hookfn(hook, skb, state);
                 if (verdict != NF_ACCEPT) {
+                       *index = i;
                         if (verdict != NF_REPEAT)
                                 return verdict;
                         goto repeat;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c

index 28241e8..4b51599 100644 (file)
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2270,13 +2270,13 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
                                     u32 flags, int family,
                                     const struct nft_table *table,
                                     const struct nft_chain *chain,
-                                   const struct nft_rule *rule)
+                                   const struct nft_rule *rule,
+                                   const struct nft_rule *prule)
  {
         struct nlmsghdr *nlh;
         struct nfgenmsg *nfmsg;
         const struct nft_expr *expr, *next;
         struct nlattr *list;
-       const struct nft_rule *prule;
         u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
  
         nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags);
@@ -2296,8 +2296,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
                          NFTA_RULE_PAD))
                 goto nla_put_failure;
  
-       if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) {
-               prule = list_prev_entry(rule, list);
+       if (event != NFT_MSG_DELRULE && prule) {
                 if (nla_put_be64(skb, NFTA_RULE_POSITION,
                                  cpu_to_be64(prule->handle),
                                  NFTA_RULE_PAD))
@@ -2344,7 +2343,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx,
  
         err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
                                        event, 0, ctx->family, ctx->table,
-                                      ctx->chain, rule);
+                                      ctx->chain, rule, NULL);
         if (err < 0) {
                 kfree_skb(skb);
                 goto err;
@@ -2369,12 +2368,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
                                   const struct nft_chain *chain)
  {
         struct net *net = sock_net(skb->sk);
+       const struct nft_rule *rule, *prule;
         unsigned int s_idx = cb->args[0];
-       const struct nft_rule *rule;
  
+       prule = NULL;
         list_for_each_entry_rcu(rule, &chain->rules, list) {
                 if (!nft_is_active(net, rule))
-                       goto cont;
+                       goto cont_skip;
                 if (*idx < s_idx)
                         goto cont;
                 if (*idx > s_idx) {
@@ -2386,11 +2386,13 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
                                         NFT_MSG_NEWRULE,
                                         NLM_F_MULTI | NLM_F_APPEND,
                                         table->family,
-                                       table, chain, rule) < 0)
+                                       table, chain, rule, prule) < 0)
                         return 1;
  
                 nl_dump_check_consistent(cb, nlmsg_hdr(skb));
  cont:
+               prule = rule;
+cont_skip:
                 (*idx)++;
         }
         return 0;
@@ -2546,7 +2548,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
  
         err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
                                        nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
-                                      family, table, chain, rule);
+                                      family, table, chain, rule, NULL);
         if (err < 0)
                 goto err;
  
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c

index 21df8cc..77f00a9 100644 (file)
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -135,17 +135,17 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr)
  EXPORT_SYMBOL_GPL(nft_fib_dump);
  
  void nft_fib_store_result(void *reg, const struct nft_fib *priv,
-                         const struct nft_pktinfo *pkt, int index)
+                         const struct net_device *dev)
  {
-       struct net_device *dev;
         u32 *dreg = reg;
+       int index;
  
         switch (priv->result) {
         case NFT_FIB_RESULT_OIF:
+               index = dev ? dev->ifindex : 0;
                 *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index;
                 break;
         case NFT_FIB_RESULT_OIFNAME:
-               dev = dev_get_by_index_rcu(nft_net(pkt), index);
                 if (priv->flags & NFTA_FIB_F_PRESENT)
                         *dreg = !!dev;
                 else
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c

index ffb25d5..aa5f571 100644 (file)
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -13,7 +13,6 @@
  #include <net/netfilter/nf_conntrack_core.h>
  #include <linux/netfilter/nf_conntrack_common.h>
  #include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_conntrack_helper.h>
  
  struct nft_flow_offload {
         struct nft_flowtable    *flowtable;
@@ -50,15 +49,20 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
         return 0;
  }
  
-static bool nft_flow_offload_skip(struct sk_buff *skb)
+static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
  {
-       struct ip_options *opt  = &(IPCB(skb)->opt);
-
-       if (unlikely(opt->optlen))
-               return true;
         if (skb_sec_path(skb))
                 return true;
  
+       if (family == NFPROTO_IPV4) {
+               const struct ip_options *opt;
+
+               opt = &(IPCB(skb)->opt);
+
+               if (unlikely(opt->optlen))
+                       return true;
+       }
+
         return false;
  }
  
@@ -68,15 +72,15 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
  {
         struct nft_flow_offload *priv = nft_expr_priv(expr);
         struct nf_flowtable *flowtable = &priv->flowtable->data;
-       const struct nf_conn_help *help;
         enum ip_conntrack_info ctinfo;
         struct nf_flow_route route;
         struct flow_offload *flow;
         enum ip_conntrack_dir dir;
+       bool is_tcp = false;
         struct nf_conn *ct;
         int ret;
  
-       if (nft_flow_offload_skip(pkt->skb))
+       if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt)))
                 goto out;
  
         ct = nf_ct_get(pkt->skb, &ctinfo);
@@ -85,14 +89,16 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
  
         switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
         case IPPROTO_TCP:
+               is_tcp = true;
+               break;
         case IPPROTO_UDP:
                 break;
         default:
                 goto out;
         }
  
-       help = nfct_help(ct);
-       if (help)
+       if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
+           ct->status & IPS_SEQ_ADJUST)
                 goto out;
  
         if (!nf_ct_is_confirmed(ct))
@@ -109,6 +115,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr,
         if (!flow)
                 goto err_flow_alloc;
  
+       if (is_tcp) {
+               ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+               ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+       }
+
         ret = flow_offload_add(flowtable, flow);
         if (ret < 0)
                 goto err_flow_add;
diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile

index 3e6d1bc..4144984 100644 (file)
--- a/tools/testing/selftests/netfilter/Makefile
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -2,6 +2,6 @@
  # Makefile for netfilter selftests
  
  TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \
-       conntrack_icmp_related.sh
+       conntrack_icmp_related.sh nft_flowtable.sh
  
  include ../lib.mk
diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh

new file mode 100755 (executable)

index 0000000..fe52488
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_flowtable.sh
@@ -0,0 +1,324 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This tests basic flowtable functionality.
+# Creates following topology:
+#
+# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
+# Router1 is the one doing flow offloading. Router2 has no special
+# purpose other than having a link whose MTU is smaller than that of both
+# Originator and Responder, i.e. TCPMSS announced values are too large and
+# will still result in fragmentation and/or PMTU discovery.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+ns1in=""
+ns2in=""
+ns1out=""
+ns2out=""
+
+log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+       echo "SKIP: Could not run test without nft tool"
+       exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+       echo "SKIP: Could not run test without ip tool"
+       exit $ksft_skip
+fi
+
+which nc > /dev/null 2>&1
+if [ $? -ne 0 ];then
+       echo "SKIP: Could not run test without nc (netcat)"
+       exit $ksft_skip
+fi
+
+ip netns add nsr1
+if [ $? -ne 0 ];then
+       echo "SKIP: Could not create net namespace"
+       exit $ksft_skip
+fi
+
+ip netns add ns1
+ip netns add ns2
+
+ip netns add nsr2
+
+cleanup() {
+       for i in 1 2; do
+               ip netns del ns$i
+               ip netns del nsr$i
+       done
+
+       rm -f "$ns1in" "$ns1out"
+       rm -f "$ns2in" "$ns2out"
+
+       [ $log_netns -eq 0 ] && sysctl -q net.netfilter.nf_log_all_netns=$log_netns
+}
+
+trap cleanup EXIT
+
+sysctl -q net.netfilter.nf_log_all_netns=1
+
+ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1
+ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2
+
+ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2
+
+for dev in lo veth0 veth1; do
+  for i in 1 2; do
+    ip -net nsr$i link set $dev up
+  done
+done
+
+ip -net nsr1 addr add 10.0.1.1/24 dev veth0
+ip -net nsr1 addr add dead:1::1/64 dev veth0
+
+ip -net nsr2 addr add 10.0.2.1/24 dev veth1
+ip -net nsr2 addr add dead:2::1/64 dev veth1
+
+# set different MTUs so we need to push packets coming from ns1 (large MTU)
+# to ns2 (smaller MTU) to the stack, either to perform fragmentation
+# (ip_no_pmtu_disc=1) or to do PMTU discovery (send ICMP error back to originator).
+# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers
+# is NOT the lowest link mtu.
+
+ip -net nsr1 link set veth0 mtu 9000
+ip -net ns1 link set eth0 mtu 9000
+
+ip -net nsr2 link set veth1 mtu 2000
+ip -net ns2 link set eth0 mtu 2000
+
+# transfer-net between nsr1 and nsr2.
+# these addresses are not used for connections.
+ip -net nsr1 addr add 192.168.10.1/24 dev veth1
+ip -net nsr1 addr add fee1:2::1/64 dev veth1
+
+ip -net nsr2 addr add 192.168.10.2/24 dev veth0
+ip -net nsr2 addr add fee1:2::2/64 dev veth0
+
+for i in 1 2; do
+  ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+  ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+  ip -net ns$i link set lo up
+  ip -net ns$i link set eth0 up
+  ip -net ns$i addr add 10.0.$i.99/24 dev eth0
+  ip -net ns$i route add default via 10.0.$i.1
+  ip -net ns$i addr add dead:$i::99/64 dev eth0
+  ip -net ns$i route add default via dead:$i::1
+  ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null
+
+  # don't set ip DF bit for first two tests
+  ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
+done
+
+ip -net nsr1 route add default via 192.168.10.2
+ip -net nsr2 route add default via 192.168.10.1
+
+ip netns exec nsr1 nft -f - <<EOF
+table inet filter {
+  flowtable f1 {
+     hook ingress priority 0
+     devices = { veth0, veth1 }
+   }
+
+   chain forward {
+      type filter hook forward priority 0; policy drop;
+
+      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
+      meta oif "veth1" tcp dport 12345 flow offload @f1 counter
+
+      # use packet size to trigger 'should be offloaded by now'.
+      # otherwise, if 'flow offload' expression never offloads, the
+      # test will pass.
+      tcp dport 12345 meta length gt 200 ct mark set 1 counter
+
+      # this turns off flow offloading internally, so expect packets again
+      tcp flags fin,rst ct mark set 0 accept
+
+      # this allows large packets from responder, we need this as long
+      # as PMTUd is off.
+      # This rule is deleted for the last test, when we expect PMTUd
+      # to kick in and ensure all packets meet mtu requirements.
+      meta length gt 1500 accept comment something-to-grep-for
+
+      # next line blocks connection w.o. working offload.
+      # we only do this for reverse dir, because we expect packets to
+      # enter slow path due to MTU mismatch of veth0 and veth1.
+      tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop
+
+      ct state established,related accept
+
+      # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
+      meta length lt 200 oif "veth1" tcp dport 12345 counter accept
+
+      meta nfproto ipv4 meta l4proto icmp accept
+      meta nfproto ipv6 meta l4proto icmpv6 accept
+   }
+}
+EOF
+
+if [ $? -ne 0 ]; then
+       echo "SKIP: Could not load nft ruleset"
+       exit $ksft_skip
+fi
+
+# test basic connectivity
+ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null
+if [ $? -ne 0 ];then
+  echo "ERROR: ns1 cannot reach ns2" 1>&2
+  bash
+  exit 1
+fi
+
+ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null
+if [ $? -ne 0 ];then
+  echo "ERROR: ns2 cannot reach ns1" 1>&2
+  exit 1
+fi
+
+if [ $ret -eq 0 ];then
+       echo "PASS: netns routing/connectivity: ns1 can reach ns2"
+fi
+
+ns1in=$(mktemp)
+ns1out=$(mktemp)
+ns2in=$(mktemp)
+ns2out=$(mktemp)
+
+make_file()
+{
+       name=$1
+       who=$2
+
+       SIZE=$((RANDOM % (1024 * 8)))
+       TSIZE=$((SIZE * 1024))
+
+       dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
+
+       SIZE=$((RANDOM % 1024))
+       SIZE=$((SIZE + 128))
+       TSIZE=$((TSIZE + SIZE))
+       dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null
+}
+
+check_transfer()
+{
+       in=$1
+       out=$2
+       what=$3
+
+       cmp "$in" "$out" > /dev/null 2>&1
+       if [ $? -ne 0 ] ;then
+               echo "FAIL: file mismatch for $what" 1>&2
+               ls -l "$in"
+               ls -l "$out"
+               return 1
+       fi
+
+       return 0
+}
+
+test_tcp_forwarding()
+{
+       local nsa=$1
+       local nsb=$2
+       local lret=0
+
+       ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" &
+       lpid=$!
+
+       sleep 1
+       ip netns exec $nsa nc -w 4 10.0.2.99 12345 < "$ns1in" > "$ns1out" &
+       cpid=$!
+
+       sleep 3
+
+       kill $lpid
+       kill $cpid
+       wait
+
+       check_transfer "$ns1in" "$ns2out" "ns1 -> ns2"
+       if [ $? -ne 0 ];then
+               lret=1
+       fi
+
+       check_transfer "$ns2in" "$ns1out" "ns1 <- ns2"
+       if [ $? -ne 0 ];then
+               lret=1
+       fi
+
+       return $lret
+}
+
+make_file "$ns1in" "ns1"
+make_file "$ns2in" "ns2"
+
+# First test:
+# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
+test_tcp_forwarding ns1 ns2
+if [ $? -eq 0 ] ;then
+       echo "PASS: flow offloaded for ns1/ns2"
+else
+       echo "FAIL: flow offload for ns1/ns2:" 1>&2
+       ip netns exec nsr1 nft list ruleset
+       ret=1
+fi
+
+# delete default route, i.e. ns2 won't be able to reach ns1 and
+# will depend on ns1 being masqueraded in nsr1.
+# expect ns1 has nsr1 address.
+ip -net ns2 route del default via 10.0.2.1
+ip -net ns2 route del default via dead:2::1
+ip -net ns2 route add 192.168.10.1 via 10.0.2.1
+
+# Second test:
+# Same, but with NAT enabled.
+ip netns exec nsr1 nft -f - <<EOF
+table ip nat {
+   chain postrouting {
+      type nat hook postrouting priority 0; policy accept;
+      meta oifname "veth1" masquerade
+   }
+}
+EOF
+
+test_tcp_forwarding ns1 ns2
+
+if [ $? -eq 0 ] ;then
+       echo "PASS: flow offloaded for ns1/ns2 with NAT"
+else
+       echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
+       ip netns exec nsr1 nft list ruleset
+       ret=1
+fi
+
+# Third test:
+# Same as second test, but with PMTU discovery enabled.
+handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2)
+
+ip netns exec nsr1 nft delete rule inet filter forward $handle
+if [ $? -ne 0 ] ;then
+       echo "FAIL: Could not delete large-packet accept rule"
+       exit 1
+fi
+
+ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+
+test_tcp_forwarding ns1 ns2
+if [ $? -eq 0 ] ;then
+       echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
+else
+       echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
+       ip netns exec nsr1 nft list ruleset
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh

index 14fcf31..1be55e7 100755 (executable)
--- a/tools/testing/selftests/netfilter/nft_nat.sh
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -36,7 +36,11 @@ trap cleanup EXIT
  ip netns add ns1
  ip netns add ns2
  
-ip link add veth0 netns ns0 type veth peer name eth0 netns ns1
+ip link add veth0 netns ns0 type veth peer name eth0 netns ns1 > /dev/null 2>&1
+if [ $? -ne 0 ];then
+    echo "SKIP: No virtual ethernet pair device support in kernel"
+    exit $ksft_skip
+fi
  ip link add veth1 netns ns0 type veth peer name eth0 netns ns2
  
  ip -net ns0 link set lo up
author	David S. Miller <davem@davemloft.net>
	Thu, 23 May 2019 21:45:36 +0000 (14:45 -0700)
committer	David S. Miller <davem@davemloft.net>
	Thu, 23 May 2019 21:45:36 +0000 (14:45 -0700)
include/net/netfilter/nft_fib.h		patch \| blob \| history
net/ipv4/netfilter/nft_fib_ipv4.c		patch \| blob \| history
net/ipv6/netfilter/nft_fib_ipv6.c		patch \| blob \| history
net/netfilter/ipvs/ip_vs_core.c		patch \| blob \| history
net/netfilter/nf_flow_table_ip.c		patch \| blob \| history
net/netfilter/nf_nat_helper.c		patch \| blob \| history
net/netfilter/nf_queue.c		patch \| blob \| history
net/netfilter/nf_tables_api.c		patch \| blob \| history
net/netfilter/nft_fib.c		patch \| blob \| history
net/netfilter/nft_flow_offload.c		patch \| blob \| history
tools/testing/selftests/netfilter/Makefile		patch \| blob \| history
tools/testing/selftests/netfilter/nft_flowtable.sh	[new file with mode: 0755]	patch \| blob
tools/testing/selftests/netfilter/nft_nat.sh		patch \| blob \| history