tcp: TCP experimental option for SMC
author Ursula Braun <ubraun@linux.vnet.ibm.com>
Wed, 25 Oct 2017 09:01:45 +0000 (11:01 +0200)
committer David S. Miller <davem@davemloft.net>
Thu, 26 Oct 2017 09:00:29 +0000 (18:00 +0900)
The SMC protocol [1] relies on the use of a new TCP experimental
option [2, 3]. With this option, SMC capabilities are exchanged
between peers during the TCP three-way handshake. This patch adds
support for this experimental option to TCP.

References:
[1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609
[2] Shared Use of TCP Experimental Options RFC 6994:
    https://tools.ietf.org/rfc/rfc6994.txt
[3] IANA ExID SMCR:
http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/linux/tcp.h
include/net/inet_sock.h
include/net/tcp.h
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c

index 173a7c2..8c43138 100644 (file)
@@ -98,7 +98,8 @@ struct tcp_options_received {
                tstamp_ok : 1,  /* TIMESTAMP seen on SYN packet         */
                dsack : 1,      /* D-SACK is scheduled                  */
                wscale_ok : 1,  /* Wscale seen on SYN packet            */
-               sack_ok : 4,    /* SACK seen on SYN packet              */
+               sack_ok : 3,    /* SACK seen on SYN packet              */
+               smc_ok : 1,     /* SMC seen on SYN packet               */
                snd_wscale : 4, /* Window scaling received from sender  */
                rcv_wscale : 4; /* Window scaling to send to receiver   */
        u8      num_sacks;      /* Number of SACK blocks                */
@@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
        rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
        rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+#if IS_ENABLED(CONFIG_SMC)
+       rx_opt->smc_ok = 0;
+#endif
 }
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
@@ -229,7 +233,8 @@ struct tcp_sock {
                syn_fastopen_ch:1, /* Active TFO re-enabling probe */
                syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
                save_syn:1,     /* Save headers of SYN packet */
-               is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
+               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
+               syn_smc:1;      /* SYN includes SMC */
        u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
index 425752f..c49938d 100644 (file)
@@ -92,7 +92,8 @@ struct inet_request_sock {
                                wscale_ok  : 1,
                                ecn_ok     : 1,
                                acked      : 1,
-                               no_srccheck: 1;
+                               no_srccheck: 1,
+                               smc_ok     : 1;
        kmemcheck_bitfield_end(flags);
        u32                     ir_mark;
        union {
index 2392f74..285bc82 100644 (file)
@@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
  */
 #define TCPOPT_FASTOPEN_MAGIC  0xF989
+#define TCPOPT_SMC_MAGIC       0xE2D4C3D9
 
 /*
  *     TCP option lengths
@@ -203,6 +204,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_MD5SIG         18
 #define TCPOLEN_FASTOPEN_BASE  2
 #define TCPOLEN_EXP_FASTOPEN_BASE  4
+#define TCPOLEN_EXP_SMC_BASE   6
 
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED         12
@@ -213,6 +215,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERBLOCK          8
 #define TCPOLEN_MD5SIG_ALIGNED         20
 #define TCPOLEN_MSS_ALIGNED            4
+#define TCPOLEN_EXP_SMC_BASE_ALIGNED   8
 
 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF          1       /* Nagle's algo is disabled */
@@ -2108,4 +2111,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 {
        return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
 }
+
+#if IS_ENABLED(CONFIG_SMC)
+extern struct static_key_false tcp_have_smc;
+#endif
 #endif /* _TCP_H */
index 8f36277..f6e1c00 100644 (file)
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/errqueue.h>
+#include <linux/static_key.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -302,6 +303,11 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
 atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
+#if IS_ENABLED(CONFIG_SMC)
+DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
+EXPORT_SYMBOL(tcp_have_smc);
+#endif
+
 /*
  * Current number of TCP sockets.
  */
index 893286d..337f601 100644 (file)
@@ -76,6 +76,8 @@
 #include <asm/unaligned.h>
 #include <linux/errqueue.h>
 #include <trace/events/tcp.h>
+#include <linux/unaligned/access_ok.h>
+#include <linux/static_key.h>
 
 int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
@@ -3737,6 +3739,21 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
        foc->exp = exp_opt;
 }
 
+static void smc_parse_options(const struct tcphdr *th,
+                             struct tcp_options_received *opt_rx,
+                             const unsigned char *ptr,
+                             int opsize)
+{
+#if IS_ENABLED(CONFIG_SMC)
+       if (static_branch_unlikely(&tcp_have_smc)) {
+               if (th->syn && !(opsize & 1) &&
+                   opsize >= TCPOLEN_EXP_SMC_BASE &&
+                   get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+                       opt_rx->smc_ok = 1;
+       }
+#endif
+}
+
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
  * But, this can also be called on packets in the established flow when
  * the fast version below fails.
@@ -3844,6 +3861,9 @@ void tcp_parse_options(const struct net *net,
                                        tcp_parse_fastopen_option(opsize -
                                                TCPOLEN_EXP_FASTOPEN_BASE,
                                                ptr + 2, th->syn, foc, true);
+                               else
+                                       smc_parse_options(th, opt_rx, ptr,
+                                                         opsize);
                                break;
 
                        }
@@ -5598,6 +5618,16 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
        return false;
 }
 
+/* Clear tp->syn_smc when it is set but the received options did not
+ * set rx_opt.smc_ok. No-op without CONFIG_SMC or while the
+ * tcp_have_smc static key is disabled.
+ */
+static void smc_check_reset_syn(struct tcp_sock *tp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+       if (!static_branch_unlikely(&tcp_have_smc))
+               return;
+       if (tp->rx_opt.smc_ok)
+               return;
+       /* Write only when the flag is actually set, as before. */
+       if (tp->syn_smc)
+               tp->syn_smc = 0;
+#endif
+}
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                                         const struct tcphdr *th)
 {
@@ -5704,6 +5734,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 * is initialized. */
                tp->copied_seq = tp->rcv_nxt;
 
+               smc_check_reset_syn(tp);
+
                smp_mb();
 
                tcp_finish_connect(sk, skb);
@@ -6157,6 +6189,9 @@ static void tcp_openreq_init(struct request_sock *req,
        ireq->ir_rmt_port = tcp_hdr(skb)->source;
        ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
        ireq->ir_mark = inet_request_mark(sk, skb);
+#if IS_ENABLED(CONFIG_SMC)
+       ireq->smc_ok = rx_opt->smc_ok;
+#endif
 }
 
 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
index a952357..056009f 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/workqueue.h>
+#include <linux/static_key.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
@@ -416,6 +417,21 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
 
+static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
+                                   struct request_sock *req,
+                                   struct tcp_sock *newtp)
+{
+#if IS_ENABLED(CONFIG_SMC)
+       struct inet_request_sock *ireq;
+
+       if (static_branch_unlikely(&tcp_have_smc)) {
+               ireq = inet_rsk(req);
+               if (oldtp->syn_smc && !ireq->smc_ok)
+                       newtp->syn_smc = 0;
+       }
+#endif
+}
+
 /* This is not only more efficient than what we used to do, it eliminates
  * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
  *
@@ -433,6 +449,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
                struct tcp_request_sock *treq = tcp_rsk(req);
                struct inet_connection_sock *newicsk = inet_csk(newsk);
                struct tcp_sock *newtp = tcp_sk(newsk);
+               struct tcp_sock *oldtp = tcp_sk(sk);
+
+               smc_check_reset_syn_req(oldtp, req, newtp);
 
                /* Now setup tcp_sock */
                newtp->pred_flags = 0;
index 1f01f4c..c8fc512 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
+#include <linux/static_key.h>
 
 #include <trace/events/tcp.h>
 
@@ -422,6 +423,22 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5             (1 << 2)
 #define OPTION_WSCALE          (1 << 3)
 #define OPTION_FAST_OPEN_COOKIE        (1 << 8)
+#define OPTION_SMC             (1 << 9)
+
+/* Emit the SMC experimental option at ptr when OPTION_SMC is set in
+ * *options: one 32-bit word holding NOP, NOP, TCPOPT_EXP and the
+ * length TCPOLEN_EXP_SMC_BASE, followed by TCPOPT_SMC_MAGIC, both in
+ * network byte order. No-op without CONFIG_SMC or while the
+ * tcp_have_smc static key is disabled.
+ */
+static void smc_options_write(__be32 *ptr, u16 *options)
+{
+#if IS_ENABLED(CONFIG_SMC)
+       if (!static_branch_unlikely(&tcp_have_smc))
+               return;
+       if (likely(!(*options & OPTION_SMC)))
+               return;
+
+       ptr[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+                      (TCPOPT_EXP << 8) | TCPOLEN_EXP_SMC_BASE);
+       ptr[1] = htonl(TCPOPT_SMC_MAGIC);
+#endif
+}
 
 struct tcp_out_options {
        u16 options;            /* bit field of OPTION_* */
@@ -540,6 +557,41 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
                }
                ptr += (len + 3) >> 2;
        }
+
+       smc_options_write(ptr, &options);
+}
+
+static void smc_set_option(const struct tcp_sock *tp,
+                          struct tcp_out_options *opts,
+                          unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+       if (static_branch_unlikely(&tcp_have_smc)) {
+               if (tp->syn_smc) {
+                       if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+                               opts->options |= OPTION_SMC;
+                               *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+                       }
+               }
+       }
+#endif
+}
+
+static void smc_set_option_cond(const struct tcp_sock *tp,
+                               const struct inet_request_sock *ireq,
+                               struct tcp_out_options *opts,
+                               unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_SMC)
+       if (static_branch_unlikely(&tcp_have_smc)) {
+               if (tp->syn_smc && ireq->smc_ok) {
+                       if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+                               opts->options |= OPTION_SMC;
+                               *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
+                       }
+               }
+       }
+#endif
 }
 
 /* Compute TCP options for SYN packets. This is not the final
@@ -607,11 +659,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
                }
        }
 
+       smc_set_option(tp, opts, &remaining);
+
        return MAX_TCP_OPTION_SPACE - remaining;
 }
 
 /* Set up TCP options for SYN-ACKs. */
-static unsigned int tcp_synack_options(struct request_sock *req,
+static unsigned int tcp_synack_options(const struct sock *sk,
+                                      struct request_sock *req,
                                       unsigned int mss, struct sk_buff *skb,
                                       struct tcp_out_options *opts,
                                       const struct tcp_md5sig_key *md5,
@@ -667,6 +722,8 @@ static unsigned int tcp_synack_options(struct request_sock *req,
                }
        }
 
+       smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+
        return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -3195,8 +3252,8 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
        md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 #endif
        skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
-       tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
-                         sizeof(*th);
+       tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
+                                            foc) + sizeof(*th);
 
        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);