mptcp: add MPTCP protocol
author Jiung <jiung.yu@samsung.com>
Wed, 18 Apr 2018 08:56:33 +0000 (17:56 +0900)
committer Seung-Woo Kim <sw0312.kim@samsung.com>
Mon, 3 Sep 2018 06:33:13 +0000 (15:33 +0900)
Merge MPTCP v0.94 code into rpi3 4.14 kernel
MultiPath TCP (MPTCP) [1] is an effort towards enabling the simultaneous
use of several IP-addresses/interfaces by a modification of TCP that
presents a regular TCP interface to applications, while in fact spreading
data across several subflows. Benefits of this include better resource
utilization, better throughput and smoother reaction to failures.

[1] MPTCP official website: https://www.multipath-tcp.org/

Change-Id: I7fe9f0f8449db32b63430397981859b136587dc9
Signed-off-by: Yu Jiung <jiung.yu@samsung.com>
58 files changed:
Documentation/networking/ip-sysctl.txt
drivers/infiniband/hw/cxgb4/cm.c
include/linux/skbuff.h
include/linux/tcp.h
include/net/inet_common.h
include/net/inet_connection_sock.h
include/net/inet_sock.h
include/net/mptcp.h [new file with mode: 0644]
include/net/mptcp_v4.h [new file with mode: 0644]
include/net/mptcp_v6.h [new file with mode: 0644]
include/net/net_namespace.h
include/net/netns/mptcp.h [new file with mode: 0644]
include/net/sock.h
include/net/tcp.h
include/net/tcp_states.h
include/net/transp_v6.h
include/uapi/linux/if.h
include/uapi/linux/tcp.h
net/Kconfig
net/Makefile
net/core/dev.c
net/core/skbuff.c
net/core/sock.c
net/ipv4/Kconfig
net/ipv4/af_inet.c
net/ipv4/inet_connection_sock.c
net/ipv4/ip_sockglue.c
net/ipv4/syncookies.c
net/ipv4/tcp.c
net/ipv4/tcp_fastopen.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c
net/ipv4/tcp_timer.c
net/ipv6/addrconf.c
net/ipv6/af_inet6.c
net/ipv6/ipv6_sockglue.c
net/ipv6/syncookies.c
net/ipv6/tcp_ipv6.c
net/mptcp/Kconfig [new file with mode: 0644]
net/mptcp/Makefile [new file with mode: 0644]
net/mptcp/mptcp_balia.c [new file with mode: 0644]
net/mptcp/mptcp_binder.c [new file with mode: 0644]
net/mptcp/mptcp_coupled.c [new file with mode: 0644]
net/mptcp/mptcp_ctrl.c [new file with mode: 0644]
net/mptcp/mptcp_fullmesh.c [new file with mode: 0644]
net/mptcp/mptcp_input.c [new file with mode: 0644]
net/mptcp/mptcp_ipv4.c [new file with mode: 0644]
net/mptcp/mptcp_ipv6.c [new file with mode: 0644]
net/mptcp/mptcp_ndiffports.c [new file with mode: 0644]
net/mptcp/mptcp_olia.c [new file with mode: 0644]
net/mptcp/mptcp_output.c [new file with mode: 0644]
net/mptcp/mptcp_pm.c [new file with mode: 0644]
net/mptcp/mptcp_redundant.c [new file with mode: 0644]
net/mptcp/mptcp_rr.c [new file with mode: 0644]
net/mptcp/mptcp_sched.c [new file with mode: 0644]
net/mptcp/mptcp_wvegas.c [new file with mode: 0644]

index d499676890d8ecf71de4b02f724c011132145f4f..e4541d4fb9e3d93cd5c4f5851beefd2a316c55cf 100644 (file)
@@ -728,6 +728,18 @@ tcp_challenge_ack_limit - INTEGER
        in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
        Default: 100
 
+MPTCP variables:
+
+mptcp_enabled - INTEGER
+       Enable or disable Multipath TCP for new connections.
+       Possible values are:
+
+       0: Multipath TCP is disabled on all TCP-sockets that are newly created.
+       1: Multipath TCP is enabled by default on all new TCP-sockets. Note that
+          existing sockets in LISTEN-state will still use regular TCP.
+       2: Enables Multipath TCP only upon the request of the application
+          through the socket-option MPTCP_ENABLED.
+
 UDP variables:
 
 udp_l3mdev_accept - BOOLEAN
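
A minimal userspace sketch for the mptcp_enabled=2 case documented above:
the application opts in per socket before connect()/listen(). MPTCP_ENABLED
is the socket option added by this patch set in include/uapi/linux/tcp.h;
the fallback value below is an assumption for illustration only.

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef MPTCP_ENABLED
	#define MPTCP_ENABLED 42	/* assumed value; see the patched uapi header */
	#endif

	int open_mptcp_socket(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
		int on = 1;

		/* Must be set before connect()/listen(); with mptcp_enabled=2
		 * the kernel negotiates MP_CAPABLE only when asked to.
		 */
		if (fd >= 0 && setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED,
					  &on, sizeof(on)) < 0)
			perror("setsockopt(MPTCP_ENABLED)");
		return fd;
	}
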
index daf7a56e5d7ebe4a7b6dffd93245d5a1655b3805..2c17c2d5ac12ca622b07209a1228aec0b3174dee 100644 (file)
@@ -3752,7 +3752,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
         */
        memset(&tmp_opt, 0, sizeof(tmp_opt));
        tcp_clear_options(&tmp_opt);
-       tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
+       tcp_parse_options(&init_net, skb, &tmp_opt, NULL, 0, NULL, NULL);
 
        req = __skb_push(skb, sizeof(*req));
        memset(req, 0, sizeof(*req));
index 6dd77767fd5b85ae6a1c1ce3dc3cbb2c4de9a329..77085987219c9d066568a442436779d3f86dcabe 100644 (file)
@@ -685,7 +685,7 @@ struct sk_buff {
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
-       char                    cb[48] __aligned(8);
+       char                    cb[80] __aligned(8);
 
        unsigned long           _skb_refdst;
        void                    (*destructor)(struct sk_buff *skb);
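
The control buffer grows from 48 to 80 bytes because MPTCP stores its
per-skb state (DSS mapping, MPTCP flags) in skb->cb alongside the enlarged
tcp_skb_cb. A sketch of the usual compile-time guard (tcp_init() performs
the equivalent check), assuming the 4.14 FIELD_SIZEOF() helper:

	/* Fails the build if tcp_skb_cb ever outgrows skb->cb. */
	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
		     FIELD_SIZEOF(struct sk_buff, cb));
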
index fe322fa611e663e50a67f5153f7a1d683e573d34..d00325e9787734109a2744d3c0ed7d0e64bd90bb 100644 (file)
@@ -58,7 +58,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 /* TCP Fast Open */
 #define TCP_FASTOPEN_COOKIE_MIN        4       /* Min Fast Open Cookie size in bytes */
 #define TCP_FASTOPEN_COOKIE_MAX        16      /* Max Fast Open Cookie size in bytes */
-#define TCP_FASTOPEN_COOKIE_SIZE 8     /* the size employed by this impl. */
+#define TCP_FASTOPEN_COOKIE_SIZE 4     /* the size employed by this impl. */
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
@@ -83,6 +83,56 @@ struct tcp_sack_block {
        u32     end_seq;
 };
 
+struct tcp_out_options {
+       u16     options;        /* bit field of OPTION_* */
+       u8      ws;             /* window scale, 0 to disable */
+       u8      num_sack_blocks;/* number of SACK blocks to include */
+       u8      hash_size;      /* bytes in hash_location */
+       u16     mss;            /* 0 to disable */
+       __u8    *hash_location; /* temporary pointer, overloaded */
+       __u32   tsval, tsecr;   /* need to include OPTION_TS */
+       struct tcp_fastopen_cookie *fastopen_cookie;    /* Fast open cookie */
+#ifdef CONFIG_MPTCP
+       u16     mptcp_options;  /* bit field of MPTCP related OPTION_* */
+       u8      dss_csum:1,     /* dss-checksum required? */
+               add_addr_v4:1,
+               add_addr_v6:1,
+               mptcp_ver:4;
+
+       union {
+               struct {
+                       __u64   sender_key;     /* sender's key for mptcp */
+                       __u64   receiver_key;   /* receiver's key for mptcp */
+               } mp_capable;
+
+               struct {
+                       __u64   sender_truncated_mac;
+                       __u32   sender_nonce;
+                                       /* random number of the sender */
+                       __u32   token;  /* token for mptcp */
+                       u8      low_prio:1;
+               } mp_join_syns;
+       };
+
+       struct {
+               __u64 trunc_mac;
+               struct in_addr addr;
+               u16 port;
+               u8 addr_id;
+       } add_addr4;
+
+       struct {
+               __u64 trunc_mac;
+               struct in6_addr addr;
+               u16 port;
+               u8 addr_id;
+       } add_addr6;
+
+       u16     remove_addrs;   /* list of address id */
+       u8      addr_id;        /* address id (mp_join or add_address) */
+#endif /* CONFIG_MPTCP */
+};
+
 /*These are used to set the sack_ok field in struct tcp_options_received */
 #define TCP_SACK_SEEN     (1 << 0)   /*1 = peer is SACK capable, */
 #define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/
@@ -106,6 +156,9 @@ struct tcp_options_received {
        u16     mss_clamp;      /* Maximal mss, negotiated at connection setup */
 };
 
+struct mptcp_cb;
+struct mptcp_tcp_sock;
+
 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
        rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
@@ -141,6 +194,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
        return (struct tcp_request_sock *)req;
 }
 
+struct tcp_md5sig_key;
+
 struct tcp_sock {
        /* inet_connection_sock has to be the first member of tcp_sock */
        struct inet_connection_sock     inet_conn;
@@ -363,6 +418,44 @@ struct tcp_sock {
         */
        struct request_sock *fastopen_rsk;
        u32     *saved_syn;
+
+       /* MPTCP/TCP-specific callbacks */
+       const struct tcp_sock_ops       *ops;
+
+       struct mptcp_cb         *mpcb;
+       struct sock             *meta_sk;
+       /* We keep these flags even if CONFIG_MPTCP is not set, because
+        * they allow checking MPTCP capability just by testing the mpc flag,
+        * rather than adding ifdefs everywhere.
+        */
+       u32     mpc:1,          /* Other end is multipath capable */
+               inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
+               send_mp_fclose:1,
+               request_mptcp:1, /* Did we send out an MP_CAPABLE?
+                                 * (this speeds up mptcp_doit() in tcp_recvmsg)
+                                 */
+               pf:1, /* Potentially Failed state: when this flag is set, we
+                      * stop using the subflow
+                      */
+               mp_killed:1, /* Killed with a tcp_done in mptcp? */
+               was_meta_sk:1,  /* This was a meta sk (in case of reuse) */
+               is_master_sk:1,
+               close_it:1,     /* Must close socket in mptcp_data_ready? */
+               closing:1,
+               mptcp_ver:4,
+               mptcp_sched_setsockopt:1,
+               mptcp_pm_setsockopt:1,
+               record_master_info:1;
+       struct mptcp_tcp_sock *mptcp;
+#ifdef CONFIG_MPTCP
+#define MPTCP_SCHED_NAME_MAX 16
+#define MPTCP_PM_NAME_MAX 16
+       struct hlist_nulls_node tk_table;
+       u32             mptcp_loc_token;
+       u64             mptcp_loc_key;
+       char            mptcp_sched_name[MPTCP_SCHED_NAME_MAX];
+       char            mptcp_pm_name[MPTCP_PM_NAME_MAX];
+#endif /* CONFIG_MPTCP */
 };
 
 enum tsq_enum {
@@ -374,6 +467,8 @@ enum tsq_enum {
        TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
                                    * tcp_v{4|6}_mtu_reduced()
                                    */
+       MPTCP_PATH_MANAGER_DEFERRED, /* MPTCP deferred creation of new subflows */
+       MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
 };
 
 enum tsq_flags {
@@ -383,6 +478,8 @@ enum tsq_flags {
        TCPF_WRITE_TIMER_DEFERRED       = (1UL << TCP_WRITE_TIMER_DEFERRED),
        TCPF_DELACK_TIMER_DEFERRED      = (1UL << TCP_DELACK_TIMER_DEFERRED),
        TCPF_MTU_REDUCED_DEFERRED       = (1UL << TCP_MTU_REDUCED_DEFERRED),
+       TCPF_PATH_MANAGER_DEFERRED      = (1UL << MPTCP_PATH_MANAGER_DEFERRED),
+       TCPF_SUB_DEFERRED               = (1UL << MPTCP_SUB_DEFERRED),
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -405,6 +502,7 @@ struct tcp_timewait_sock {
 #ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key     *tw_md5_key;
 #endif
+       struct mptcp_tw           *mptcp_tw;
 };
 
 static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
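
Since mpc and the surrounding bitfield stay outside the CONFIG_MPTCP guard,
callers can test multipath capability without ifdefs, as the comment above
notes. A minimal sketch of the idiom (hypothetical helper name; the tree's
own predicate is the mptcp() test used throughout include/net/mptcp.h):

	static inline bool sk_is_mptcp(const struct sock *sk)
	{
		/* Compiles identically with or without CONFIG_MPTCP. */
		return sk->sk_protocol == IPPROTO_TCP && tcp_sk(sk)->mpc;
	}
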
index 5a54c9570977ea871c85847315b546e0addc4cb4..646c9baa50a4836af25244aee97f552a1bee6ed1 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef _INET_COMMON_H
 #define _INET_COMMON_H
 
+#include <net/sock.h>
+
 extern const struct proto_ops inet_stream_ops;
 extern const struct proto_ops inet_dgram_ops;
 
@@ -14,6 +16,8 @@ struct sock;
 struct sockaddr;
 struct socket;
 
+int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
 int inet_release(struct socket *sock);
 int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
                        int addr_len, int flags);
index 13e4c89a8231bf953f057676700d1ab8409da20c..25c47b8bd2f05146e0a33dc5d65a421c0552699e 100644 (file)
@@ -30,6 +30,7 @@
 
 struct inet_bind_bucket;
 struct tcp_congestion_ops;
+struct tcp_options_received;
 
 /*
  * Pointers to address related TCP functions
index 8e51b4a69088c211f79b1d5e26029c56df93b99a..0db104abc4365266ace46183a2de11518857349b 100644 (file)
@@ -90,7 +90,9 @@ struct inet_request_sock {
                                wscale_ok  : 1,
                                ecn_ok     : 1,
                                acked      : 1,
-                               no_srccheck: 1;
+                               no_srccheck: 1,
+                               mptcp_rqsk : 1,
+                               saw_mpc    : 1;
        u32                     ir_mark;
        union {
                struct ip_options_rcu __rcu     *ireq_opt;
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
new file mode 100644 (file)
index 0000000..ae0ef83
--- /dev/null
@@ -0,0 +1,1511 @@
+/*
+ *     MPTCP implementation
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _MPTCP_H
+#define _MPTCP_H
+
+#include <linux/inetdevice.h>
+#include <linux/ipv6.h>
+#include <linux/list.h>
+#include <linux/net.h>
+#include <linux/netpoll.h>
+#include <linux/siphash.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+#include <crypto/hash.h>
+#include <net/tcp.h>
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       #define ntohll(x)  be64_to_cpu(x)
+       #define htonll(x)  cpu_to_be64(x)
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       #define ntohll(x) (x)
+       #define htonll(x) (x)
+#endif
+
+struct mptcp_loc4 {
+       u8              loc4_id;
+       u8              low_prio:1;
+       int             if_idx;
+       struct in_addr  addr;
+};
+
+struct mptcp_rem4 {
+       u8              rem4_id;
+       __be16          port;
+       struct in_addr  addr;
+};
+
+struct mptcp_loc6 {
+       u8              loc6_id;
+       u8              low_prio:1;
+       int             if_idx;
+       struct in6_addr addr;
+};
+
+struct mptcp_rem6 {
+       u8              rem6_id;
+       __be16          port;
+       struct in6_addr addr;
+};
+
+struct mptcp_request_sock {
+       struct tcp_request_sock         req;
+       struct hlist_nulls_node         hash_entry;
+
+       union {
+               struct {
+                       /* Only on initial subflows */
+                       u64             mptcp_loc_key;
+                       u64             mptcp_rem_key;
+                       u32             mptcp_loc_token;
+               };
+
+               struct {
+                       /* Only on additional subflows */
+                       u32             mptcp_rem_nonce;
+                       u32             mptcp_loc_nonce;
+                       u64             mptcp_hash_tmac;
+               };
+       };
+
+       u8                              loc_id;
+       u8                              rem_id; /* Address-id in the MP_JOIN */
+       u8                              dss_csum:1,
+                                       is_sub:1, /* Is this a new subflow? */
+                                       low_prio:1, /* Interface set to low-prio? */
+                                       rcv_low_prio:1,
+                                       mptcp_ver:4;
+};
+
+struct mptcp_options_received {
+       u16     saw_mpc:1,
+               dss_csum:1,
+               drop_me:1,
+
+               is_mp_join:1,
+               join_ack:1,
+
+               saw_low_prio:2, /* 0x1 - low-prio set for this subflow
+                                * 0x2 - low-prio set for another subflow
+                                */
+               low_prio:1,
+
+               saw_add_addr:2, /* Saw at least one add_addr option:
+                                * 0x1: IPv4 - 0x2: IPv6
+                                */
+               more_add_addr:1, /* Saw one more add-addr. */
+
+               saw_rem_addr:1, /* Saw at least one rem_addr option */
+               more_rem_addr:1, /* Saw one more rem-addr. */
+
+               mp_fail:1,
+               mp_fclose:1;
+       u8      rem_id;         /* Address-id in the MP_JOIN */
+       u8      prio_addr_id;   /* Address-id in the MP_PRIO */
+
+       const unsigned char *add_addr_ptr; /* Pointer to add-address option */
+       const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
+
+       u32     data_ack;
+       u32     data_seq;
+       u16     data_len;
+
+       u8      mptcp_ver; /* MPTCP version */
+
+       /* Key inside the option (from mp_capable or fast_close) */
+       u64     mptcp_sender_key;
+       u64     mptcp_receiver_key;
+
+       u32     mptcp_rem_token; /* Remote token */
+
+       u32     mptcp_recv_nonce;
+       u64     mptcp_recv_tmac;
+       u8      mptcp_recv_mac[20];
+};
+
+struct mptcp_tcp_sock {
+       struct tcp_sock *next;          /* Next subflow socket */
+       struct hlist_node cb_list;
+       struct mptcp_options_received rx_opt;
+
+       /* These three fields record the current mapping */
+       u64     map_data_seq;
+       u32     map_subseq;
+       u16     map_data_len;
+       u16     slave_sk:1,
+               fully_established:1,
+               establish_increased:1,
+               second_packet:1,
+               attached:1,
+               send_mp_fail:1,
+               include_mpc:1,
+               mapping_present:1,
+               map_data_fin:1,
+               low_prio:1, /* use this socket as backup */
+               rcv_low_prio:1, /* Peer sent low-prio option to us */
+               send_mp_prio:1, /* Trigger to send mp_prio on this socket */
+               pre_established:1; /* State between sending 3rd ACK and
+                                   * receiving the fourth ack of new subflows.
+                                   */
+
+       /* isn: needed to translate abs to relative subflow seqnums */
+       u32     snt_isn;
+       u32     rcv_isn;
+       u8      path_index;
+       u8      loc_id;
+       u8      rem_id;
+
+#define MPTCP_SCHED_SIZE 16
+       u8      mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
+
+       int     init_rcv_wnd;
+       u32     infinite_cutoff_seq;
+       struct delayed_work work;
+       u32     mptcp_loc_nonce;
+       struct tcp_sock *tp; /* Where is my daddy? */
+       u32     last_end_data_seq;
+
+       /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
+       struct timer_list mptcp_ack_timer;
+
+       /* HMAC of the third ack */
+       char sender_mac[20];
+};
+
+struct mptcp_tw {
+       struct list_head list;
+       u64 loc_key;
+       u64 rcv_nxt;
+       struct mptcp_cb __rcu *mpcb;
+       u8 meta_tw:1,
+          in_list:1;
+};
+
+#define MPTCP_PM_NAME_MAX 16
+struct mptcp_pm_ops {
+       struct list_head list;
+
+       /* Signal the creation of a new MPTCP-session. */
+       void (*new_session)(const struct sock *meta_sk);
+       void (*release_sock)(struct sock *meta_sk);
+       void (*fully_established)(struct sock *meta_sk);
+       void (*new_remote_address)(struct sock *meta_sk);
+       void (*subflow_error)(struct sock *meta_sk, struct sock *sk);
+       int  (*get_local_id)(sa_family_t family, union inet_addr *addr,
+                            struct net *net, bool *low_prio);
+       void (*addr_signal)(struct sock *sk, unsigned int *size,
+                           struct tcp_out_options *opts, struct sk_buff *skb);
+       void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
+                         sa_family_t family, __be16 port, u8 id);
+       void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
+       void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
+       void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
+       void (*delete_subflow)(struct sock *sk);
+
+       char            name[MPTCP_PM_NAME_MAX];
+       struct module   *owner;
+};
+
+#define MPTCP_SCHED_NAME_MAX 16
+struct mptcp_sched_ops {
+       struct list_head list;
+
+       struct sock *           (*get_subflow)(struct sock *meta_sk,
+                                              struct sk_buff *skb,
+                                              bool zero_wnd_test);
+       struct sk_buff *        (*next_segment)(struct sock *meta_sk,
+                                               int *reinject,
+                                               struct sock **subsk,
+                                               unsigned int *limit);
+       void                    (*init)(struct sock *sk);
+       void                    (*release)(struct sock *sk);
+
+       char                    name[MPTCP_SCHED_NAME_MAX];
+       struct module           *owner;
+};
+
+struct mptcp_cb {
+       /* list of sockets in this multipath connection */
+       struct tcp_sock *connection_list;
+       /* list of sockets that need a call to release_cb */
+       struct hlist_head callback_list;
+
+       /* High-order bits of 64-bit sequence numbers */
+       u32 snd_high_order[2];
+       u32 rcv_high_order[2];
+
+       u16     send_infinite_mapping:1,
+               in_time_wait:1,
+               list_rcvd:1, /* XXX TO REMOVE */
+               addr_signal:1, /* Path-manager wants us to call addr_signal */
+               dss_csum:1,
+               server_side:1,
+               infinite_mapping_rcv:1,
+               infinite_mapping_snd:1,
+               dfin_combined:1,   /* Was the DFIN combined with subflow-fin? */
+               passive_close:1,
+               snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
+               rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
+
+       /* socket count in this connection */
+       u8 cnt_subflows;
+       u8 cnt_established;
+
+#define MPTCP_SCHED_DATA_SIZE 8
+       u8 mptcp_sched[MPTCP_SCHED_DATA_SIZE] __aligned(8);
+       struct mptcp_sched_ops *sched_ops;
+
+       struct sk_buff_head reinject_queue;
+       /* First cache-line boundary is here minus 8 bytes. But from the
+        * reinject-queue only the next and prev pointers are regularly
+        * accessed. Thus, the whole data-path is on a single cache-line.
+        */
+
+       u64     csum_cutoff_seq;
+       u64     infinite_rcv_seq;
+
+       /***** Start of fields, used for connection closure */
+       spinlock_t       tw_lock;
+       unsigned char    mptw_state;
+       u8               dfin_path_index;
+
+       struct list_head tw_list;
+
+       /***** Start of fields, used for subflow establishment and closure */
+       atomic_t        mpcb_refcnt;
+
+       /* Mutex needed, because otherwise mptcp_close will complain that the
+        * socket is owned by the user.
+        * E.g., mptcp_sub_close_wq is taking the meta-lock.
+        */
+       struct mutex    mpcb_mutex;
+
+       /***** Start of fields, used for subflow establishment */
+       struct sock *meta_sk;
+
+       /* Master socket, also part of the connection_list, this
+        * socket is the one that the application sees.
+        */
+       struct sock *master_sk;
+
+       __u64   mptcp_loc_key;
+       __u64   mptcp_rem_key;
+       __u32   mptcp_loc_token;
+       __u32   mptcp_rem_token;
+
+#define MPTCP_PM_SIZE 608
+       u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
+       struct mptcp_pm_ops *pm_ops;
+
+       u32 path_index_bits;
+       /* Next pi to pick up in case a new path becomes available */
+       u8 next_path_index;
+
+       __u8    mptcp_ver;
+
+       /* Original snd/rcvbuf of the initial subflow.
+        * Used for the new subflows on the server-side to allow correct
+        * autotuning
+        */
+       int orig_sk_rcvbuf;
+       int orig_sk_sndbuf;
+       u32 orig_window_clamp;
+
+       struct tcp_info *master_info;
+};
+
+#define MPTCP_VERSION_0 0
+#define MPTCP_VERSION_1 1
+
+#define MPTCP_SUB_CAPABLE                      0
+#define MPTCP_SUB_LEN_CAPABLE_SYN              12
+#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN                12
+#define MPTCP_SUB_LEN_CAPABLE_ACK              20
+#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN                20
+
+#define MPTCP_SUB_JOIN                 1
+#define MPTCP_SUB_LEN_JOIN_SYN         12
+#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN   12
+#define MPTCP_SUB_LEN_JOIN_SYNACK      16
+#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN        16
+#define MPTCP_SUB_LEN_JOIN_ACK         24
+#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN   24
+
+#define MPTCP_SUB_DSS          2
+#define MPTCP_SUB_LEN_DSS      4
+#define MPTCP_SUB_LEN_DSS_ALIGN        4
+
+/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
+ * as they are part of the DSS-option.
+ * To get the total length, just add the different options together.
+ */
+#define MPTCP_SUB_LEN_SEQ      10
+#define MPTCP_SUB_LEN_SEQ_CSUM 12
+#define MPTCP_SUB_LEN_SEQ_ALIGN        12
+
+#define MPTCP_SUB_LEN_SEQ_64           14
+#define MPTCP_SUB_LEN_SEQ_CSUM_64      16
+#define MPTCP_SUB_LEN_SEQ_64_ALIGN     16
+
+#define MPTCP_SUB_LEN_ACK      4
+#define MPTCP_SUB_LEN_ACK_ALIGN        4
+
+#define MPTCP_SUB_LEN_ACK_64           8
+#define MPTCP_SUB_LEN_ACK_64_ALIGN     8
+
+/* This is the "default" option-length we will send out most often.
+ * MPTCP DSS-header
+ * 32-bit data sequence number
+ * 32-bit data ack
+ *
+ * It is necessary to calculate the effective MSS we will be using when
+ * sending data.
+ */
+#define MPTCP_SUB_LEN_DSM_ALIGN  (MPTCP_SUB_LEN_DSS_ALIGN +            \
+                                 MPTCP_SUB_LEN_SEQ_ALIGN +             \
+                                 MPTCP_SUB_LEN_ACK_ALIGN)
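+
+/* With the values above this works out to 4 + 12 + 4 = 20 bytes of option
+ * space for the common DSS-header + data-seq + data-ack combination.
+ */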
+
+#define MPTCP_SUB_ADD_ADDR             3
+#define MPTCP_SUB_LEN_ADD_ADDR4                8
+#define MPTCP_SUB_LEN_ADD_ADDR4_VER1   16
+#define MPTCP_SUB_LEN_ADD_ADDR6                20
+#define MPTCP_SUB_LEN_ADD_ADDR6_VER1   28
+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN  8
+#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1     16
+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN  20
+#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1     28
+
+#define MPTCP_SUB_REMOVE_ADDR  4
+#define MPTCP_SUB_LEN_REMOVE_ADDR      4
+
+#define MPTCP_SUB_PRIO         5
+#define MPTCP_SUB_LEN_PRIO     3
+#define MPTCP_SUB_LEN_PRIO_ADDR        4
+#define MPTCP_SUB_LEN_PRIO_ALIGN       4
+
+#define MPTCP_SUB_FAIL         6
+#define MPTCP_SUB_LEN_FAIL     12
+#define MPTCP_SUB_LEN_FAIL_ALIGN       12
+
+#define MPTCP_SUB_FCLOSE       7
+#define MPTCP_SUB_LEN_FCLOSE   12
+#define MPTCP_SUB_LEN_FCLOSE_ALIGN     12
+
+
+#define OPTION_MPTCP           (1 << 5)
+
+/* Max number of fastclose retransmissions */
+#define MPTCP_FASTCLOSE_RETRIES 3
+
+#ifdef CONFIG_MPTCP
+
+/* Used for checking if the mptcp initialization has been successful */
+extern bool mptcp_init_failed;
+
+/* MPTCP options */
+#define OPTION_TYPE_SYN                (1 << 0)
+#define OPTION_TYPE_SYNACK     (1 << 1)
+#define OPTION_TYPE_ACK                (1 << 2)
+#define OPTION_MP_CAPABLE      (1 << 3)
+#define OPTION_DATA_ACK                (1 << 4)
+#define OPTION_ADD_ADDR                (1 << 5)
+#define OPTION_MP_JOIN         (1 << 6)
+#define OPTION_MP_FAIL         (1 << 7)
+#define OPTION_MP_FCLOSE       (1 << 8)
+#define OPTION_REMOVE_ADDR     (1 << 9)
+#define OPTION_MP_PRIO         (1 << 10)
+
+/* MPTCP flags: both TX and RX */
+#define MPTCPHDR_SEQ           0x01 /* DSS.M option is present */
+#define MPTCPHDR_FIN           0x02 /* DSS.F option is present */
+#define MPTCPHDR_SEQ64_INDEX   0x04 /* index of seq in mpcb->snd_high_order */
+/* MPTCP flags: RX only */
+#define MPTCPHDR_ACK           0x08
+#define MPTCPHDR_SEQ64_SET     0x10 /* Did we receive a 64-bit seq number? */
+#define MPTCPHDR_SEQ64_OFO     0x20 /* Is it not in our circular array? */
+#define MPTCPHDR_DSS_CSUM      0x40
+/* MPTCP flags: TX only */
+#define MPTCPHDR_INF           0x08
+#define MPTCP_REINJECT         0x10 /* Did we reinject this segment? */
+
+struct mptcp_option {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8    ver:4,
+               sub:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u8    sub:4,
+               ver:4;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+struct mp_capable {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8    ver:4,
+               sub:4;
+       __u8    h:1,
+               rsv:5,
+               b:1,
+               a:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u8    sub:4,
+               ver:4;
+       __u8    a:1,
+               b:1,
+               rsv:5,
+               h:1;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       __u64   sender_key;
+       __u64   receiver_key;
+} __attribute__((__packed__));
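+
+/* Illustrative wire view (the MPTCP option kind is 30, RFC 6824): on a SYN,
+ * MP_CAPABLE is kind=30, len=MPTCP_SUB_LEN_CAPABLE_SYN (12), sub=0 and
+ * carries only sender_key; both keys (len 20) travel in the third ACK.
+ */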
+
+struct mp_join {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8    b:1,
+               rsv:3,
+               sub:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u8    sub:4,
+               rsv:3,
+               b:1;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       __u8    addr_id;
+       union {
+               struct {
+                       u32     token;
+                       u32     nonce;
+               } syn;
+               struct {
+                       __u64   mac;
+                       u32     nonce;
+               } synack;
+               struct {
+                       __u8    mac[20];
+               } ack;
+       } u;
+} __attribute__((__packed__));
+
+struct mp_dss {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u16   rsv1:4,
+               sub:4,
+               A:1,
+               a:1,
+               M:1,
+               m:1,
+               F:1,
+               rsv2:3;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u16   sub:4,
+               rsv1:4,
+               rsv2:3,
+               F:1,
+               m:1,
+               M:1,
+               a:1,
+               A:1;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+};
+
+struct mp_add_addr {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8    ipver:4,
+               sub:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u8    sub:4,
+               ipver:4;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       __u8    addr_id;
+       union {
+               struct {
+                       struct in_addr  addr;
+                       __be16          port;
+                       __u8            mac[8];
+               } v4;
+               struct {
+                       struct in6_addr addr;
+                       __be16          port;
+                       __u8            mac[8];
+               } v6;
+       } u;
+} __attribute__((__packed__));
+
+struct mp_remove_addr {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8    rsv:4,
+               sub:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u8    sub:4,
+               rsv:4;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       /* list of addr_id */
+       __u8    addrs_id;
+};
+
+struct mp_fail {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u16   rsv1:4,
+               sub:4,
+               rsv2:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u16   sub:4,
+               rsv1:4,
+               rsv2:8;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       __be64  data_seq;
+} __attribute__((__packed__));
+
+struct mp_fclose {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u16   rsv1:4,
+               sub:4,
+               rsv2:8;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u16   sub:4,
+               rsv1:4,
+               rsv2:8;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       __u64   key;
+} __attribute__((__packed__));
+
+struct mp_prio {
+       __u8    kind;
+       __u8    len;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u8    b:1,
+               rsv:3,
+               sub:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+       __u8    sub:4,
+               rsv:3,
+               b:1;
+#else
+#error "Adjust your <asm/byteorder.h> defines"
+#endif
+       __u8    addr_id;
+} __attribute__((__packed__));
+
+static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
+{
+       return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
+}
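+
+/* Worked example (illustrative): a DSS option with a 4-byte data ACK
+ * (A=1, a=0) and a 32-bit mapping (M=1, m=0), no checksum, measures
+ * 4 + (4 + 0) + (10 + 0 + 0) = 18 bytes on the wire.
+ */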
+
+#define MPTCP_SYSCTL   1
+
+extern int sysctl_mptcp_enabled;
+extern int sysctl_mptcp_version;
+extern int sysctl_mptcp_checksum;
+extern int sysctl_mptcp_debug;
+extern int sysctl_mptcp_syn_retries;
+
+extern struct workqueue_struct *mptcp_wq;
+
+#define mptcp_debug(fmt, args...)                                      \
+       do {                                                            \
+               if (unlikely(sysctl_mptcp_debug))                       \
+                       pr_err(__FILE__ ": " fmt, ##args);      \
+       } while (0)
+
+/* Iterates over all subflows */
+#define mptcp_for_each_tp(mpcb, tp)                                    \
+       for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
+
+#define mptcp_for_each_sk(mpcb, sk)                                    \
+       for ((sk) = (struct sock *)(mpcb)->connection_list;             \
+            sk;                                                        \
+            sk = (struct sock *)tcp_sk(sk)->mptcp->next)
+
+#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)                   \
+       for (__sk = (struct sock *)(__mpcb)->connection_list,           \
+            __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
+            __sk;                                                      \
+            __sk = __temp,                                             \
+            __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
+
+/* Iterates over all bits set to 1 in a bitset */
+#define mptcp_for_each_bit_set(b, i)                                   \
+       for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
+
+#define mptcp_for_each_bit_unset(b, i)                                 \
+       mptcp_for_each_bit_set(~b, i)
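+
+/* Usage sketch: with b == 0x16 (bits 1, 2 and 4 set), the loop visits
+ * i = 1, 2, 4 in that order:
+ *
+ *	int i;
+ *
+ *	mptcp_for_each_bit_set(0x16, i)
+ *		pr_debug("bit %d is set\n", i);
+ */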
+
+#define MPTCP_INC_STATS(net, field)    SNMP_INC_STATS((net)->mptcp.mptcp_statistics, field)
+#define MPTCP_INC_STATS_BH(net, field) __SNMP_INC_STATS((net)->mptcp.mptcp_statistics, field)
+
+enum {
+       MPTCP_MIB_NUM = 0,
+       MPTCP_MIB_MPCAPABLEPASSIVE,     /* Received SYN with MP_CAPABLE */
+       MPTCP_MIB_MPCAPABLEACTIVE,      /* Sent SYN with MP_CAPABLE */
+       MPTCP_MIB_MPCAPABLEACTIVEACK,   /* Received SYN/ACK with MP_CAPABLE */
+       MPTCP_MIB_MPCAPABLEPASSIVEACK,  /* Received third ACK with MP_CAPABLE */
+       MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
+       MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+       MPTCP_MIB_MPCAPABLERETRANSFALLBACK,/* Client-side stopped sending MP_CAPABLE after too many SYN-retransmissions */
+       MPTCP_MIB_CSUMENABLED,          /* Created MPTCP-connection with DSS-checksum enabled */
+       MPTCP_MIB_RETRANSSEGS,          /* Segments retransmitted at the MPTCP-level */
+       MPTCP_MIB_MPFAILRX,             /* Received an MP_FAIL */
+       MPTCP_MIB_CSUMFAIL,             /* Received segment with invalid checksum */
+       MPTCP_MIB_FASTCLOSERX,          /* Received a FAST_CLOSE */
+       MPTCP_MIB_FASTCLOSETX,          /* Sent a FAST_CLOSE */
+       MPTCP_MIB_FBACKSUB,             /* Fallback upon ack without data-ack on new subflow */
+       MPTCP_MIB_FBACKINIT,            /* Fallback upon ack without data-ack on initial subflow */
+       MPTCP_MIB_FBDATASUB,            /* Fallback upon data without DSS at the beginning on new subflow */
+       MPTCP_MIB_FBDATAINIT,           /* Fallback upon data without DSS at the beginning on initial subflow */
+       MPTCP_MIB_REMADDRSUB,           /* Remove subflow due to REMOVE_ADDR */
+       MPTCP_MIB_JOINNOTOKEN,          /* Received MP_JOIN but the token was not found */
+       MPTCP_MIB_JOINFALLBACK,         /* Received MP_JOIN on session that has fallen back to reg. TCP */
+       MPTCP_MIB_JOINSYNTX,            /* Sent a SYN + MP_JOIN */
+       MPTCP_MIB_JOINSYNRX,            /* Received a SYN + MP_JOIN */
+       MPTCP_MIB_JOINSYNACKRX,         /* Received a SYN/ACK + MP_JOIN */
+       MPTCP_MIB_JOINSYNACKMAC,        /* HMAC was wrong on SYN/ACK + MP_JOIN */
+       MPTCP_MIB_JOINACKRX,            /* Received an ACK + MP_JOIN */
+       MPTCP_MIB_JOINACKMAC,           /* HMAC was wrong on ACK + MP_JOIN */
+       MPTCP_MIB_JOINACKFAIL,          /* Third ACK on new subflow did not contain an MP_JOIN */
+       MPTCP_MIB_JOINACKRTO,           /* Retransmission timer for third ACK + MP_JOIN timed out */
+       MPTCP_MIB_JOINACKRXMIT,         /* Retransmitted an ACK + MP_JOIN */
+       MPTCP_MIB_NODSSWINDOW,          /* Received too many packets without a DSS-option */
+       MPTCP_MIB_DSSNOMATCH,           /* Received a new mapping that did not match the previous one */
+       MPTCP_MIB_INFINITEMAPRX,        /* Received an infinite mapping */
+       MPTCP_MIB_DSSTCPMISMATCH,       /* DSS-mapping did not map with TCP's sequence numbers */
+       MPTCP_MIB_DSSTRIMHEAD,          /* Trimmed segment at the head (coalescing middlebox) */
+       MPTCP_MIB_DSSSPLITTAIL,         /* Trimmed segment at the tail (coalescing middlebox) */
+       MPTCP_MIB_PURGEOLD,             /* Removed old skb from the rcv-queue due to missing DSS-mapping */
+       MPTCP_MIB_ADDADDRRX,            /* Received an ADD_ADDR */
+       MPTCP_MIB_ADDADDRTX,            /* Sent an ADD_ADDR */
+       MPTCP_MIB_REMADDRRX,            /* Received a REMOVE_ADDR */
+       MPTCP_MIB_REMADDRTX,            /* Sent a REMOVE_ADDR */
+       __MPTCP_MIB_MAX
+};
+
+#define MPTCP_MIB_MAX __MPTCP_MIB_MAX
+struct mptcp_mib {
+       unsigned long   mibs[MPTCP_MIB_MAX];
+};
+
+extern struct lock_class_key meta_key;
+extern char *meta_key_name;
+extern struct lock_class_key meta_slock_key;
+extern char *meta_slock_key_name;
+
+extern siphash_key_t mptcp_secret;
+
+/* This is needed to ensure that two subsequent key/nonce generations result
+ * in different keys/nonces if the IPs and ports are the same.
+ */
+extern u32 mptcp_seed;
+
+#define MPTCP_HASH_SIZE                1024
+
+extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
+
+/* Request-sockets can be hashed in the tk_htb for collision-detection or in
+ * the regular htb for join-connections. We need to define different NULLS
+ * values so that we can correctly detect a request-socket that has been
+ * recycled. See also c25eb3bfb9729.
+ */
+#define MPTCP_REQSK_NULLS_BASE (1U << 29)
+
+
+void mptcp_data_ready(struct sock *sk);
+void mptcp_write_space(struct sock *sk);
+
+void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
+                             struct sock *sk);
+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
+                  gfp_t flags);
+void mptcp_del_sock(struct sock *sk);
+void mptcp_update_metasocket(const struct sock *meta_sk);
+void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
+void mptcp_update_sndbuf(const struct tcp_sock *tp);
+void mptcp_send_fin(struct sock *meta_sk);
+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
+bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+                     int push_one, gfp_t gfp);
+void tcp_parse_mptcp_options(const struct sk_buff *skb,
+                            struct mptcp_options_received *mopt);
+void mptcp_parse_options(const uint8_t *ptr, int opsize,
+                        struct mptcp_options_received *mopt,
+                        const struct sk_buff *skb,
+                        struct tcp_sock *tp);
+void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
+                      unsigned int *remaining);
+void mptcp_synack_options(struct request_sock *req,
+                         struct tcp_out_options *opts,
+                         unsigned int *remaining);
+void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+                              struct tcp_out_options *opts, unsigned int *size);
+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+                        const struct tcp_out_options *opts,
+                        struct sk_buff *skb);
+void mptcp_close(struct sock *meta_sk, long timeout);
+int mptcp_doit(struct sock *sk);
+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key,
+                          __u8 mptcp_ver, u32 window);
+int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
+int mptcp_check_req_master(struct sock *sk, struct sock *child,
+                          struct request_sock *req, const struct sk_buff *skb,
+                          int drop);
+struct sock *mptcp_check_req_child(struct sock *meta_sk,
+                                  struct sock *child,
+                                  struct request_sock *req,
+                                  struct sk_buff *skb,
+                                  const struct mptcp_options_received *mopt);
+u32 __mptcp_select_window(struct sock *sk);
+void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
+                                       __u32 *window_clamp, int wscale_ok,
+                                       __u8 *rcv_wscale, __u32 init_rcv_wnd,
+                                       const struct sock *sk);
+unsigned int mptcp_current_mss(struct sock *meta_sk);
+int mptcp_select_size(const struct sock *meta_sk, bool sg, bool first_skb);
+void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
+void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out,
+                    int arg_num, ...);
+void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
+void mptcp_fin(struct sock *meta_sk);
+void mptcp_meta_retransmit_timer(struct sock *meta_sk);
+void mptcp_sub_retransmit_timer(struct sock *sk);
+int mptcp_write_wakeup(struct sock *meta_sk, int mib);
+void mptcp_sub_close_wq(struct work_struct *work);
+void mptcp_sub_close(struct sock *sk, unsigned long delay);
+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
+void mptcp_fallback_meta_sk(struct sock *meta_sk);
+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
+void mptcp_ack_handler(unsigned long data);
+bool mptcp_check_rtt(const struct tcp_sock *tp, int time);
+int mptcp_check_snd_buf(const struct tcp_sock *tp);
+bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
+                         const struct sk_buff *skb);
+void __init mptcp_init(void);
+void mptcp_destroy_sock(struct sock *sk);
+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
+                                   const struct sk_buff *skb,
+                                   const struct mptcp_options_received *mopt);
+unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
+                                 int large_allowed);
+int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
+void mptcp_time_wait(struct sock *sk, int state, int timeo);
+void mptcp_disconnect(struct sock *sk);
+bool mptcp_should_expand_sndbuf(const struct sock *sk);
+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
+void mptcp_tsq_flags(struct sock *sk);
+void mptcp_tsq_sub_deferred(struct sock *meta_sk);
+struct mp_join *mptcp_find_join(const struct sk_buff *skb);
+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
+struct sock *mptcp_hash_find(const struct net *net, const u32 token);
+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
+int mptcp_do_join_short(struct sk_buff *skb,
+                       const struct mptcp_options_received *mopt,
+                       struct net *net);
+void mptcp_reqsk_destructor(struct request_sock *req);
+void mptcp_connect_init(struct sock *sk);
+void mptcp_sub_force_close(struct sock *sk);
+int mptcp_sub_len_remove_addr_align(u16 bitfield);
+void mptcp_init_buffer_space(struct sock *sk);
+void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb,
+                          const struct request_sock *req,
+                          struct sk_buff *skb);
+void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk,
+                     const struct sk_buff *skb, bool want_cookie);
+int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
+void mptcp_enable_sock(struct sock *sk);
+void mptcp_disable_sock(struct sock *sk);
+void mptcp_enable_static_key(void);
+void mptcp_disable_static_key(void);
+void mptcp_cookies_reqsk_init(struct request_sock *req,
+                             struct mptcp_options_received *mopt,
+                             struct sk_buff *skb);
+void mptcp_sock_destruct(struct sock *sk);
+int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb);
+int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen);
+void mptcp_clear_sk(struct sock *sk, int size);
+
+/* MPTCP-path-manager registration/initialization functions */
+int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
+void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
+void mptcp_init_path_manager(struct mptcp_cb *mpcb);
+void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
+void mptcp_fallback_default(struct mptcp_cb *mpcb);
+void mptcp_get_default_path_manager(char *name);
+int mptcp_set_scheduler(struct sock *sk, const char *name);
+int mptcp_set_path_manager(struct sock *sk, const char *name);
+int mptcp_set_default_path_manager(const char *name);
+extern struct mptcp_pm_ops mptcp_pm_default;
+
+/* MPTCP-scheduler registration/initialization functions */
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_init_scheduler(struct mptcp_cb *mpcb);
+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
+void mptcp_get_default_scheduler(char *name);
+int mptcp_set_default_scheduler(const char *name);
+bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
+                       bool zero_wnd_test);
+bool mptcp_is_def_unavailable(struct sock *sk);
+bool subflow_is_active(const struct tcp_sock *tp);
+bool subflow_is_backup(const struct tcp_sock *tp);
+struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
+                                  bool zero_wnd_test);
+extern struct mptcp_sched_ops mptcp_sched_default;
+
+/* Initializes function-pointers and MPTCP-flags */
+static inline void mptcp_init_tcp_sock(struct sock *sk)
+{
+       if (!mptcp_init_failed && sysctl_mptcp_enabled == MPTCP_SYSCTL)
+               mptcp_enable_sock(sk);
+}
+
+static inline int mptcp_pi_to_flag(int pi)
+{
+       return 1 << (pi - 1);
+}
+
+static inline
+struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
+{
+       return (struct mptcp_request_sock *)req;
+}
+
+static inline
+struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
+{
+       return (struct request_sock *)req;
+}
+
+static inline bool mptcp_can_sendpage(struct sock *sk)
+{
+       struct sock *sk_it;
+
+       if (tcp_sk(sk)->mpcb->dss_csum)
+               return false;
+
+       mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
+               if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
+                   !sk_check_csum_caps(sk_it))
+                       return false;
+       }
+
+       return true;
+}
+
+static inline void mptcp_push_pending_frames(struct sock *meta_sk)
+{
+       /* We check packets out and send-head here. TCP only checks the
+        * send-head. But, MPTCP also checks packets_out, as this is an
+        * indication that we might want to do opportunistic reinjection.
+        */
+       if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
+               struct tcp_sock *tp = tcp_sk(meta_sk);
+
+               /* We don't care about the MSS, because it will be set in
+                * mptcp_write_xmit.
+                */
+               __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
+       }
+}
+
+static inline void mptcp_send_reset(struct sock *sk)
+{
+       if (tcp_need_reset(sk->sk_state))
+               tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
+       mptcp_sub_force_close(sk);
+}
+
+static inline void mptcp_sub_force_close_all(struct mptcp_cb *mpcb,
+                                            struct sock *except)
+{
+       struct sock *sk_it, *tmp;
+
+       mptcp_for_each_sk_safe(mpcb, sk_it, tmp) {
+               if (sk_it != except)
+                       mptcp_send_reset(sk_it);
+       }
+}
+
+static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
+{
+       return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
+}
+
+static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
+{
+       return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
+}
+
+/* Is it a data-fin while in infinite mapping mode?
+ * In infinite mode, a subflow-fin is in fact a data-fin.
+ */
+static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
+                                    const struct tcp_sock *tp)
+{
+       return mptcp_is_data_fin(skb) ||
+              (tp->mpcb->infinite_mapping_rcv &&
+               (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN));
+}
+
+static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
+{
+       u64 data_seq_high = (u32)(data_seq >> 32);
+
+       if (mpcb->rcv_high_order[0] == data_seq_high)
+               return 0;
+       else if (mpcb->rcv_high_order[1] == data_seq_high)
+               return MPTCPHDR_SEQ64_INDEX;
+       else
+               return MPTCPHDR_SEQ64_OFO;
+}
+
+/* Sets the data_seq and returns a pointer to the in-skb field of the data_seq.
+ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
+ */
+static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
+                                           u32 *data_seq,
+                                           struct mptcp_cb *mpcb)
+{
+       __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
+
+       if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
+               u64 data_seq64 = get_unaligned_be64(ptr);
+
+               if (mpcb)
+                       TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
+
+               *data_seq = (u32)data_seq64;
+               ptr++;
+       } else {
+               *data_seq = get_unaligned_be32(ptr);
+       }
+
+       return ptr;
+}
+
+static inline struct sock *mptcp_meta_sk(const struct sock *sk)
+{
+       return tcp_sk(sk)->meta_sk;
+}
+
+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
+{
+       return tcp_sk(tp->meta_sk);
+}
+
+static inline int is_meta_tp(const struct tcp_sock *tp)
+{
+       return tp->mpcb && mptcp_meta_tp(tp) == tp;
+}
+
+static inline int is_meta_sk(const struct sock *sk)
+{
+       return sk->sk_state != TCP_NEW_SYN_RECV &&
+              sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
+              mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
+}
+
+static inline int is_master_tp(const struct tcp_sock *tp)
+{
+       return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
+}
+
+static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
+{
+       mopt->saw_mpc = 0;
+       mopt->dss_csum = 0;
+       mopt->drop_me = 0;
+
+       mopt->is_mp_join = 0;
+       mopt->join_ack = 0;
+
+       mopt->saw_low_prio = 0;
+       mopt->low_prio = 0;
+
+       mopt->saw_add_addr = 0;
+       mopt->more_add_addr = 0;
+
+       mopt->saw_rem_addr = 0;
+       mopt->more_rem_addr = 0;
+
+       mopt->mp_fail = 0;
+       mopt->mp_fclose = 0;
+}
+
+static inline void mptcp_reset_mopt(struct tcp_sock *tp)
+{
+       struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
+
+       mopt->saw_low_prio = 0;
+       mopt->saw_add_addr = 0;
+       mopt->more_add_addr = 0;
+       mopt->saw_rem_addr = 0;
+       mopt->more_rem_addr = 0;
+       mopt->join_ack = 0;
+       mopt->mp_fail = 0;
+       mopt->mp_fclose = 0;
+}
+
+static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
+                                                const struct mptcp_cb *mpcb)
+{
+       return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
+                       MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
+}
+
+static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
+                                       u32 data_seq_32)
+{
+       return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
+}
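+
+/* Example (illustrative): with rcv_high_order[index] == 0x1 and a 32-bit
+ * data_seq_32 of 0x10, the reconstructed 64-bit data sequence number is
+ * 0x0000000100000010.
+ */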
+
+static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
+{
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+
+       return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
+                                    meta_tp->rcv_nxt);
+}
+
+static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
+{
+       if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
+               struct mptcp_cb *mpcb = meta_tp->mpcb;
+
+               mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
+               mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
+       }
+}
+
+static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
+                                          u32 old_rcv_nxt)
+{
+       if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
+               struct mptcp_cb *mpcb = meta_tp->mpcb;
+
+               mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
+               mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
+       }
+}
+
+static inline int mptcp_sk_can_send(const struct sock *sk)
+{
+       return tcp_passive_fastopen(sk) ||
+              ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+               !tcp_sk(sk)->mptcp->pre_established);
+}
+
+static inline int mptcp_sk_can_recv(const struct sock *sk)
+{
+       return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
+}
+
+static inline int mptcp_sk_can_send_ack(const struct sock *sk)
+{
+       return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
+                                       TCPF_CLOSE | TCPF_LISTEN)) &&
+              !tcp_sk(sk)->mptcp->pre_established;
+}
+
+/* Only support GSO if all subflows support it */
+static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
+{
+       struct sock *sk;
+
+       if (tcp_sk(meta_sk)->mpcb->dss_csum)
+               return false;
+
+       mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
+               if (!mptcp_sk_can_send(sk))
+                       continue;
+               if (!sk_can_gso(sk))
+                       return false;
+       }
+       return true;
+}
+
+static inline bool mptcp_can_sg(const struct sock *meta_sk)
+{
+       struct sock *sk;
+
+       if (tcp_sk(meta_sk)->mpcb->dss_csum)
+               return false;
+
+       mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
+               if (!mptcp_sk_can_send(sk))
+                       continue;
+               if (!(sk->sk_route_caps & NETIF_F_SG))
+                       return false;
+       }
+       return true;
+}
+
+static inline void mptcp_set_rto(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *sk_it;
+       struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
+       __u32 max_rto = 0;
+
+       /* We are in recovery-phase on the MPTCP-level. Do not update the
+        * RTO, because this would kill exponential backoff.
+        */
+       if (micsk->icsk_retransmits)
+               return;
+
+       mptcp_for_each_sk(tp->mpcb, sk_it) {
+               if ((mptcp_sk_can_send(sk_it) || sk->sk_state == TCP_SYN_RECV) &&
+                   inet_csk(sk_it)->icsk_rto > max_rto)
+                       max_rto = inet_csk(sk_it)->icsk_rto;
+       }
+       if (max_rto) {
+               micsk->icsk_rto = max_rto << 1;
+
+               /* A successful rto-measurement - reset backoff counter */
+               micsk->icsk_backoff = 0;
+       }
+}
+
+static inline void mptcp_sub_close_passive(struct sock *sk)
+{
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
+
+       /* Only close if the app did a send-shutdown (passive close) and we
+        * received the data-ack of the data-fin.
+        */
+       if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
+               mptcp_sub_close(sk, 0);
+}
+
+static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_cb *mpcb = tp->mpcb;
+
+       /* If data has been acknowledged on the meta-level, fully_established
+        * will have been set before and thus we will not fall back to infinite
+        * mapping.
+        */
+       if (likely(tp->mptcp->fully_established))
+               return false;
+
+       if (!(flag & MPTCP_FLAG_DATA_ACKED))
+               return false;
+
+       /* Don't fall back twice ;) */
+       if (mpcb->infinite_mapping_snd)
+               return false;
+
+       pr_err("%s %#x will fallback - pi %d, src %pI4:%u dst %pI4:%u rcv_nxt %u from %pS\n",
+              __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
+              &inet_sk(sk)->inet_saddr, ntohs(inet_sk(sk)->inet_sport),
+              &inet_sk(sk)->inet_daddr, ntohs(inet_sk(sk)->inet_dport),
+              tp->rcv_nxt, __builtin_return_address(0));
+       if (!is_master_tp(tp)) {
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKSUB);
+               return true;
+       }
+
+       mpcb->infinite_mapping_snd = 1;
+       mpcb->infinite_mapping_rcv = 1;
+       mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
+       tp->mptcp->fully_established = 1;
+
+       mptcp_sub_force_close_all(mpcb, sk);
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBACKINIT);
+
+       return false;
+}
+
+/* Allocate a new path-index: find the first index whose bit in the bit-field is 0 */
+static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
+{
+       u8 base = mpcb->next_path_index;
+       int i;
+
+       /* Start at 1, because 0 is reserved for the meta-sk */
+       mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
+               if (i + base < 1)
+                       continue;
+               if (i + base >= sizeof(mpcb->path_index_bits) * 8)
+                       break;
+               i += base;
+               mpcb->path_index_bits |= (1 << i);
+               mpcb->next_path_index = i + 1;
+               return i;
+       }
+       mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
+               if (i >= sizeof(mpcb->path_index_bits) * 8)
+                       break;
+               if (i < 1)
+                       continue;
+               mpcb->path_index_bits |= (1 << i);
+               mpcb->next_path_index = i + 1;
+               return i;
+       }
+
+       return 0;
+}
+
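mptcp_set_new_pathindex() performs a two-pass search over path_index_bits: first upward from the rotating next_path_index cursor, then wrapping around from the bottom, always skipping index 0 (reserved for the meta-sk). A stand-alone model of the same allocation policy (a sketch, not the kernel's mptcp_for_each_bit_unset() macros):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t path_index_bits = 1;    /* bit 0 = meta-sk, always set */
    static uint8_t next_path_index = 1;

    static uint8_t alloc_path_index(void)
    {
            unsigned int i;

            for (i = next_path_index; i < 32; i++)  /* pass 1: from the cursor up */
                    if (i >= 1 && !(path_index_bits & (1u << i)))
                            goto found;
            for (i = 1; i < 32; i++)                /* pass 2: wrap to the bottom */
                    if (!(path_index_bits & (1u << i)))
                            goto found;
            return 0;                               /* all 31 indices in use */
    found:
            path_index_bits |= 1u << i;
            next_path_index = i + 1;
            return i;
    }

    int main(void)
    {
            int n;

            for (n = 0; n < 3; n++)
                    printf("%u ", alloc_path_index());      /* prints "1 2 3 " */
            printf("\n");
            return 0;
    }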
+static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
+{
+       return sk->sk_family == AF_INET6 &&
+              ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
+}
+
+static inline bool mptcp_can_new_subflow(const struct sock *meta_sk)
+{
+       /* If the meta-sk has been removed from the tk-table, no new subflows
+        * may be created.
+        *
+        * The check for the close-state is necessary, because we may have
+        * been closed without passing through mptcp_close().
+        *
+        * When falling back, no new subflows are allowed either.
+        */
+       return meta_sk->sk_state != TCP_CLOSE &&
+              tcp_sk(meta_sk)->inside_tk_table &&
+              !tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv &&
+              !tcp_sk(meta_sk)->mpcb->send_infinite_mapping;
+}
+
+/* TCP and MPTCP mpc flag-depending functions */
+u16 mptcp_select_window(struct sock *sk);
+void mptcp_init_buffer_space(struct sock *sk);
+void mptcp_tcp_set_rto(struct sock *sk);
+
+/* TCP and MPTCP flag-depending functions */
+bool mptcp_prune_ofo_queue(struct sock *sk);
+
+#else /* CONFIG_MPTCP */
+#define mptcp_debug(fmt, args...)      \
+       do {                            \
+       } while (0)
+
+/* Without MPTCP, the for-each macros below expand to nothing, so the loop
+ * body runs exactly once, on the only socket available: the sk/tp argument
+ * already in scope at the call site.
+ */
+#define mptcp_for_each_sk(mpcb, sk)
+#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
+
+#define MPTCP_INC_STATS(net, field)    \
+       do {                            \
+       } while (0)
+
+static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
+{
+       return false;
+}
+static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
+{
+       return false;
+}
+static inline struct sock *mptcp_meta_sk(const struct sock *sk)
+{
+       return NULL;
+}
+static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
+{
+       return NULL;
+}
+static inline int is_meta_sk(const struct sock *sk)
+{
+       return 0;
+}
+static inline int is_master_tp(const struct tcp_sock *tp)
+{
+       return 0;
+}
+static inline void mptcp_del_sock(const struct sock *sk) {}
+static inline void mptcp_update_metasocket(const struct sock *meta_sk) {}
+static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
+static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
+static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
+                                           const struct sock *sk) {}
+static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
+static inline void mptcp_set_rto(const struct sock *sk) {}
+static inline void mptcp_send_fin(const struct sock *meta_sk) {}
+static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
+                                      struct mptcp_options_received *mopt,
+                                      const struct sk_buff *skb,
+                                      const struct tcp_sock *tp) {}
+static inline void mptcp_syn_options(const struct sock *sk,
+                                    struct tcp_out_options *opts,
+                                    unsigned int *remaining) {}
+static inline void mptcp_synack_options(struct request_sock *req,
+                                       struct tcp_out_options *opts,
+                                       unsigned int *remaining) {}
+
+static inline void mptcp_established_options(struct sock *sk,
+                                            struct sk_buff *skb,
+                                            struct tcp_out_options *opts,
+                                            unsigned int *size) {}
+static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+                                      const struct tcp_out_options *opts,
+                                      struct sk_buff *skb) {}
+static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
+static inline int mptcp_doit(struct sock *sk)
+{
+       return 0;
+}
+static inline int mptcp_check_req_fastopen(struct sock *child,
+                                          struct request_sock *req)
+{
+       return 1;
+}
+static inline int mptcp_check_req_master(const struct sock *sk,
+                                        const struct sock *child,
+                                        const struct request_sock *req,
+                                        const struct sk_buff *skb,
+                                        int drop)
+{
+       return 1;
+}
+static inline struct sock *mptcp_check_req_child(const struct sock *meta_sk,
+                                                const struct sock *child,
+                                                const struct request_sock *req,
+                                                struct sk_buff *skb,
+                                                const struct mptcp_options_received *mopt)
+{
+       return NULL;
+}
+static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
+{
+       return 0;
+}
+static inline void mptcp_sub_close_passive(struct sock *sk) {}
+static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
+{
+       return false;
+}
+static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
+static inline bool mptcp_check_rtt(const struct tcp_sock *tp, int time)
+{
+       return false;
+}
+static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
+{
+       return 0;
+}
+static inline void mptcp_send_reset(const struct sock *sk) {}
+static inline bool mptcp_handle_options(struct sock *sk,
+                                       const struct tcphdr *th,
+                                       struct sk_buff *skb)
+{
+       return false;
+}
+static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
+static inline void __init mptcp_init(void) {}
+static inline bool mptcp_sk_can_gso(const struct sock *sk)
+{
+       return false;
+}
+static inline bool mptcp_can_sg(const struct sock *meta_sk)
+{
+       return false;
+}
+static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
+                                               u32 mss_now, int large_allowed)
+{
+       return 0;
+}
+static inline void mptcp_destroy_sock(struct sock *sk) {}
+static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
+                                                 struct sock **skptr,
+                                                 struct sk_buff *skb,
+                                                 const struct mptcp_options_received *mopt)
+{
+       return 0;
+}
+static inline bool mptcp_can_sendpage(struct sock *sk)
+{
+       return false;
+}
+static inline int mptcp_init_tw_sock(struct sock *sk,
+                                    struct tcp_timewait_sock *tw)
+{
+       return 0;
+}
+static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
+static inline void mptcp_disconnect(struct sock *sk) {}
+static inline void mptcp_tsq_flags(struct sock *sk) {}
+static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
+static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
+static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
+                                         const struct sk_buff *skb) {}
+static inline void mptcp_init_tcp_sock(struct sock *sk) {}
+static inline void mptcp_disable_static_key(void) {}
+static inline void mptcp_cookies_reqsk_init(struct request_sock *req,
+                                           struct mptcp_options_received *mopt,
+                                           struct sk_buff *skb) {}
+static inline void mptcp_fin(struct sock *meta_sk) {}
+static inline bool mptcp_can_new_subflow(const struct sock *meta_sk)
+{
+       return false;
+}
+
+#endif /* CONFIG_MPTCP */
+
+#endif /* _MPTCP_H */
diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
new file mode 100644 (file)
index 0000000..c83dca0
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ *     MPTCP implementation
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef MPTCP_V4_H_
+#define MPTCP_V4_H_
+
+
+#include <linux/in.h>
+#include <linux/skbuff.h>
+#include <net/mptcp.h>
+#include <net/request_sock.h>
+#include <net/sock.h>
+
+extern struct request_sock_ops mptcp_request_sock_ops;
+extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
+extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
+extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
+
+#ifdef CONFIG_MPTCP
+
+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
+struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
+                                const __be32 laddr, const struct net *net);
+int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
+                          struct mptcp_rem4 *rem);
+int mptcp_pm_v4_init(void);
+void mptcp_pm_v4_undo(void);
+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
+                    u32 seed);
+
+#else
+
+static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
+                                 const struct sk_buff *skb)
+{
+       return 0;
+}
+
+#endif /* CONFIG_MPTCP */
+
+#endif /* MPTCP_V4_H_ */
diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
new file mode 100644 (file)
index 0000000..a6257fb
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ *     MPTCP implementation
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _MPTCP_V6_H
+#define _MPTCP_V6_H
+
+#include <linux/in6.h>
+#include <net/if_inet6.h>
+
+#include <net/mptcp.h>
+
+
+#ifdef CONFIG_MPTCP
+extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
+extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
+extern struct request_sock_ops mptcp6_request_sock_ops;
+extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
+extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
+
+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
+struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
+                                const struct in6_addr *laddr, const struct net *net);
+int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
+                          struct mptcp_rem6 *rem);
+int mptcp_pm_v6_init(void);
+void mptcp_pm_v6_undo(void);
+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
+                        __be16 sport, __be16 dport);
+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
+                    __be16 sport, __be16 dport, u32 seed);
+
+#else /* CONFIG_MPTCP */
+
+#define mptcp_v6_mapped ipv6_mapped
+
+static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
+{
+       return 0;
+}
+
+#endif /* CONFIG_MPTCP */
+
+#endif /* _MPTCP_V6_H */
index f4bf75fac349ca66f1345854406c64bb71a74af4..87dd9c8b19575f7ca77d1853f9a8b0dd3e7314d4 100644 (file)
@@ -18,6 +18,7 @@
 #include <net/netns/packet.h>
 #include <net/netns/ipv4.h>
 #include <net/netns/ipv6.h>
+#include <net/netns/mptcp.h>
 #include <net/netns/ieee802154_6lowpan.h>
 #include <net/netns/sctp.h>
 #include <net/netns/dccp.h>
@@ -100,6 +101,9 @@ struct net {
 #if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6       ipv6;
 #endif
+#if IS_ENABLED(CONFIG_MPTCP)
+       struct netns_mptcp      mptcp;
+#endif
 #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan  ieee802154_lowpan;
 #endif
diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
new file mode 100644 (file)
index 0000000..6680f3b
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ *     MPTCP implementation - MPTCP namespace
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef __NETNS_MPTCP_H__
+#define __NETNS_MPTCP_H__
+
+#include <linux/compiler.h>
+
+enum {
+       MPTCP_PM_FULLMESH = 0,
+       MPTCP_PM_MAX
+};
+
+struct mptcp_mib;
+
+struct netns_mptcp {
+       DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics);
+
+#ifdef CONFIG_PROC_FS
+       struct proc_dir_entry *proc_net_mptcp;
+#endif
+
+       void *path_managers[MPTCP_PM_MAX];
+};
+
+#endif /* __NETNS_MPTCP_H__ */
index 9bd5d68076d9f84fa053bf41a3f343670f7efda9..b26cae08a0cff5b22660048bc055fee28430f5b2 100644 (file)
@@ -771,6 +771,7 @@ enum sock_flags {
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
+       SOCK_MPTCP, /* MPTCP set on this socket */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1071,6 +1072,7 @@ struct proto {
        void                    (*unhash)(struct sock *sk);
        void                    (*rehash)(struct sock *sk);
        int                     (*get_port)(struct sock *sk, unsigned short snum);
+       void                    (*clear_sk)(struct sock *sk, int size);
 
        /* Keeping track of sockets in use */
 #ifdef CONFIG_PROC_FS
index eca8d65cad1e9ccd2ca7b28058e23b1d052750a9..8d9de37e5c6c847d00f497dc59191676cf6c5701 100644 (file)
@@ -185,6 +185,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOPT_SACK             5       /* SACK Block */
 #define TCPOPT_TIMESTAMP       8       /* Better RTT estimations/PAWS */
 #define TCPOPT_MD5SIG          19      /* MD5 Signature (RFC2385) */
+#define TCPOPT_MPTCP           30
 #define TCPOPT_FASTOPEN                34      /* Fast open (RFC7413) */
 #define TCPOPT_EXP             254     /* Experimental */
 /* Magic number to be after the option value for sharing TCP
@@ -238,6 +239,30 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define        TFO_SERVER_WO_SOCKOPT1  0x400
 
+/* Flags from tcp_input.c for tcp_ack */
+#define FLAG_DATA              0x01 /* Incoming frame contained data.          */
+#define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
+#define FLAG_DATA_ACKED                0x04 /* This ACK acknowledged new data.         */
+#define FLAG_RETRANS_DATA_ACKED        0x08 /* "" "" some of which was retransmitted.  */
+#define FLAG_SYN_ACKED         0x10 /* This ACK acknowledged SYN.              */
+#define FLAG_DATA_SACKED       0x20 /* New SACK.                               */
+#define FLAG_ECE               0x40 /* ECE in this ACK                         */
+#define FLAG_LOST_RETRANS      0x80 /* This ACK marks some retransmission lost */
+#define FLAG_SLOWPATH          0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ORIG_SACK_ACKED   0x200 /* Never retransmitted data are (s)acked  */
+#define FLAG_SND_UNA_ADVANCED  0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
+#define FLAG_DSACKING_ACK      0x800 /* SACK blocks contained D-SACK info */
+#define FLAG_SET_XMIT_TIMER    0x1000 /* Set TLP or RTO timer */
+#define FLAG_SACK_RENEGING     0x2000 /* snd_una advanced to a sacked seq */
+#define FLAG_UPDATE_TS_RECENT  0x4000 /* tcp_replace_ts_recent() */
+#define FLAG_NO_CHALLENGE_ACK  0x8000 /* do not call tcp_send_challenge_ack()  */
+
+#define MPTCP_FLAG_DATA_ACKED  0x10000
+
+#define FLAG_ACKED             (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP           (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT          (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
+#define FLAG_FORWARD_PROGRESS  (FLAG_ACKED|FLAG_DATA_SACKED)
 
 /* sysctl variables for tcp */
 extern int sysctl_tcp_fastopen;
@@ -339,6 +364,96 @@ extern struct proto tcp_prot;
 #define TCP_DEC_STATS(net, field)      SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
 #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
 
+/**** START - Exports needed for MPTCP ****/
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
+extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
+
+struct mptcp_options_received;
+
+void tcp_cleanup_rbuf(struct sock *sk, int copied);
+void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited);
+int tcp_close_state(struct sock *sk);
+void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+                        const struct sk_buff *skb);
+int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib);
+void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+                    gfp_t gfp_mask);
+unsigned int tcp_mss_split_point(const struct sock *sk,
+                                const struct sk_buff *skb,
+                                unsigned int mss_now,
+                                unsigned int max_segs,
+                                int nonagle);
+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+                   unsigned int cur_mss, int nonagle);
+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+                     unsigned int cur_mss);
+unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
+int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now);
+int __pskb_trim_head(struct sk_buff *skb, int len);
+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
+void tcp_reset(struct sock *sk);
+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+                          const u32 ack_seq, const u32 nwin);
+bool tcp_urg_mode(const struct tcp_sock *tp);
+void tcp_ack_probe(struct sock *sk);
+void tcp_rearm_rto(struct sock *sk);
+int tcp_write_timeout(struct sock *sk);
+bool retransmits_timed_out(struct sock *sk,
+                          unsigned int boundary,
+                          unsigned int timeout);
+void tcp_write_err(struct sock *sk);
+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now);
+
+void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+                          struct request_sock *req);
+void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb);
+struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb);
+void tcp_v4_reqsk_destructor(struct request_sock *req);
+
+void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+                          struct request_sock *req);
+void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
+struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb);
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+void tcp_v6_destroy_sock(struct sock *sk);
+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
+void tcp_v6_hash(struct sock *sk);
+struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
+struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
+                                 struct request_sock *req,
+                                 struct dst_entry *dst,
+                                 struct request_sock *req_unhash,
+                                 bool *own_req);
+void tcp_v6_reqsk_destructor(struct request_sock *req);
+
+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
+                                      int large_allowed);
+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
+
+void skb_clone_fraglist(struct sk_buff *skb);
+void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
+
+void inet_twsk_free(struct inet_timewait_sock *tw);
+int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
+/* These states need RST on ABORT according to RFC793 */
+static inline bool tcp_need_reset(int state)
+{
+       return (1 << state) &
+              (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+               TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+}
+
+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+                              bool *fragstolen);
+void tcp_ofo_queue(struct sock *sk);
+void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb);
+int linear_payload_sz(bool first_skb);
+/**** END - Exports needed for MPTCP ****/
+
 void tcp_tasklet_init(void);
 
 void tcp_v4_err(struct sk_buff *skb, u32);
@@ -434,7 +549,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                int flags, int *addr_len);
 void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
                       struct tcp_options_received *opt_rx,
-                      int estab, struct tcp_fastopen_cookie *foc);
+                      struct mptcp_options_received *mopt_rx,
+                      int estab, struct tcp_fastopen_cookie *foc,
+                      struct tcp_sock *tp);
 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 /*
@@ -443,6 +560,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
 
 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
 void tcp_v4_mtu_reduced(struct sock *sk);
+void tcp_v6_mtu_reduced(struct sock *sk);
 void tcp_req_err(struct sock *sk, u32 seq, bool abort);
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_create_openreq_child(const struct sock *sk,
@@ -523,7 +641,8 @@ static inline u32 tcp_cookie_time(void)
 
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
                              u16 *mssp);
-__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
+__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk,
+                             const struct sk_buff *skb, __u16 *mss);
 u64 cookie_init_timestamp(struct request_sock *req);
 bool cookie_timestamp_decode(const struct net *net,
                             struct tcp_options_received *opt);
@@ -537,7 +656,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
 
 u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
                              const struct tcphdr *th, u16 *mssp);
-__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
+__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk,
+                             const struct sk_buff *skb, __u16 *mss);
 #endif
 /* tcp_output.c */
 
@@ -569,10 +689,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
 void tcp_skb_collapse_tstamp(struct sk_buff *skb,
                             const struct sk_buff *next_skb);
 
+u16 tcp_select_window(struct sock *sk);
+int select_size(const struct sock *sk, bool sg, bool first_skb);
+bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+               int push_one, gfp_t gfp);
+
 /* tcp_input.c */
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
 void tcp_reset(struct sock *sk);
+void tcp_set_rto(struct sock *sk);
+bool tcp_should_expand_sndbuf(const struct sock *sk);
 void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);
 
@@ -805,6 +932,12 @@ struct tcp_skb_cb {
                 */
                ktime_t         swtstamp;
        };
+
+#ifdef CONFIG_MPTCP
+       __u8            mptcp_flags;    /* flags for the MPTCP layer    */
+       __u8            dss_off;        /* Number of 4-byte words until
+                                        * seq-number */
+#endif
        __u8            tcp_flags;      /* TCP header flags. (tcp[13])  */
 
        __u8            sacked;         /* State flags for SACK/FACK.   */
@@ -823,6 +956,14 @@ struct tcp_skb_cb {
                        has_rxtstamp:1, /* SKB has a RX timestamp       */
                        unused:5;
        __u32           ack_seq;        /* Sequence number ACK'd        */
+
+#ifdef CONFIG_MPTCP
+       union {                 /* For MPTCP outgoing frames */
+               __u32 path_mask; /* paths that tried to send this skb */
+               __u32 dss[6];   /* DSS options */
+       };
+#endif
+
        union {
                struct {
                        /* There is space for up to 24 bytes */
@@ -1324,7 +1465,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
 /* Determine a window scaling and initial window to offer. */
 void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
                               __u32 *window_clamp, int wscale_ok,
-                              __u8 *rcv_wscale, __u32 init_rcv_wnd);
+                              __u8 *rcv_wscale, __u32 init_rcv_wnd,
+                              const struct sock *sk);
 
 static inline int tcp_win_from_space(int space)
 {
@@ -1335,6 +1477,19 @@ static inline int tcp_win_from_space(int space)
                space - (space>>tcp_adv_win_scale);
 }
 
+#ifdef CONFIG_MPTCP
+extern struct static_key mptcp_static_key;
+static inline bool mptcp(const struct tcp_sock *tp)
+{
+       return static_key_false(&mptcp_static_key) && tp->mpc;
+}
+#else
+static inline bool mptcp(const struct tcp_sock *tp)
+{
+       return 0;
+}
+#endif
+
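The mptcp() helper above is the single predicate behind all TCP/MPTCP branching: the static key keeps the test a patched-out no-op until the first MPTCP connection flips it, and tp->mpc then separates MPTCP sockets from plain ones. The intended call pattern looks roughly like this (tcp_do_something() is a hypothetical caller, shown only to illustrate the pattern):

    static void tcp_do_something(struct sock *sk)
    {
            struct tcp_sock *tp = tcp_sk(sk);

            if (mptcp(tp)) {
                    /* Taken only on MPTCP sockets; with the static key off,
                     * the test costs a single patched nop on the fast path.
                     */
                    mptcp_update_sndbuf(tp);
                    return;
            }
            /* plain-TCP path */
    }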
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
@@ -1885,6 +2040,30 @@ struct tcp_sock_af_ops {
 #endif
 };
 
+/* TCP/MPTCP-specific functions */
+struct tcp_sock_ops {
+       u32 (*__select_window)(struct sock *sk);
+       u16 (*select_window)(struct sock *sk);
+       void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
+                                     __u32 *window_clamp, int wscale_ok,
+                                     __u8 *rcv_wscale, __u32 init_rcv_wnd,
+                                     const struct sock *sk);
+       int (*select_size)(const struct sock *sk, bool sg, bool first_skb);
+       void (*init_buffer_space)(struct sock *sk);
+       void (*set_rto)(struct sock *sk);
+       bool (*should_expand_sndbuf)(const struct sock *sk);
+       void (*send_fin)(struct sock *sk);
+       bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
+                          int push_one, gfp_t gfp);
+       void (*send_active_reset)(struct sock *sk, gfp_t priority);
+       int (*write_wakeup)(struct sock *sk, int mib);
+       void (*retransmit_timer)(struct sock *sk);
+       void (*time_wait)(struct sock *sk, int state, int timeo);
+       void (*cleanup_rbuf)(struct sock *sk, int copied);
+       void (*cwnd_validate)(struct sock *sk, bool is_cwnd_limited);
+};
+extern const struct tcp_sock_ops tcp_specific;
+
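struct tcp_sock_ops turns each spot where MPTCP must override stock TCP behaviour into an indirect call: regular sockets get the tcp_specific table (defined in net/ipv4/tcp.c below), while mptcp_init_tcp_sock() can install an MPTCP-aware table. Shared code then dispatches without #ifdefs; a hypothetical caller, mirroring the converted call sites later in this patch:

    static u16 example_pick_window(struct sock *sk)
    {
            struct tcp_sock *tp = tcp_sk(sk);

            /* The same call site serves plain TCP and MPTCP meta-sockets;
             * only the table behind tp->ops differs.
             */
            return tp->ops->select_window(sk);
    }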
 struct tcp_request_sock_ops {
        u16 mss_clamp;
 #ifdef CONFIG_TCP_MD5SIG
@@ -1895,12 +2074,13 @@ struct tcp_request_sock_ops {
                                          const struct sock *sk,
                                          const struct sk_buff *skb);
 #endif
-       void (*init_req)(struct request_sock *req,
-                        const struct sock *sk_listener,
-                        struct sk_buff *skb);
+       int (*init_req)(struct request_sock *req,
+                       const struct sock *sk_listener,
+                       struct sk_buff *skb,
+                       bool want_cookie);
 #ifdef CONFIG_SYN_COOKIES
-       __u32 (*cookie_init_seq)(const struct sk_buff *skb,
-                                __u16 *mss);
+       __u32 (*cookie_init_seq)(struct request_sock *req, const struct sock *sk,
+                                const struct sk_buff *skb, __u16 *mss);
 #endif
        struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
                                       const struct request_sock *req);
@@ -1914,15 +2094,17 @@ struct tcp_request_sock_ops {
 
 #ifdef CONFIG_SYN_COOKIES
 static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+                                        struct request_sock *req,
                                         const struct sock *sk, struct sk_buff *skb,
                                         __u16 *mss)
 {
        tcp_synq_overflow(sk);
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);
-       return ops->cookie_init_seq(skb, mss);
+       return ops->cookie_init_seq(req, sk, skb, mss);
 }
 #else
 static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+                                        struct request_sock *req,
                                         const struct sock *sk, struct sk_buff *skb,
                                         __u16 *mss)
 {
index 50e78a74d0dfa957394e68da9aa636dad4b08a05..aa761531b2f71455643a28a01163c968bbc898f2 100644 (file)
@@ -26,6 +26,7 @@ enum {
        TCP_LISTEN,
        TCP_CLOSING,    /* Now a valid state */
        TCP_NEW_SYN_RECV,
+       TCP_RST_WAIT,
 
        TCP_MAX_STATES  /* Leave at the end! */
 };
@@ -47,6 +48,7 @@ enum {
        TCPF_LISTEN      = (1 << 10),
        TCPF_CLOSING     = (1 << 11),
        TCPF_NEW_SYN_RECV = (1 << 12),
+       TCPF_RST_WAIT    = (1 << 13),
 };
 
 #endif /* _LINUX_TCP_STATES_H */
index f6a3543e52477d0b3ec6c883fff554c4e924e0d6..93bd36e125a5355e584b8805d14c0d932e97a62d 100644 (file)
@@ -59,6 +59,8 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp,
 
 /* address family specific functions */
 extern const struct inet_connection_sock_af_ops ipv4_specific;
+extern const struct inet_connection_sock_af_ops ipv6_mapped;
+extern const struct inet_connection_sock_af_ops ipv6_specific;
 
 void inet6_destroy_sock(struct sock *sk);
 
index 7fea0fd7d6f54debe3cd4356a1e7ca04ef9c9c7d..7255e08393dbf6dc5e229027502b3fd834be004c 100644 (file)
@@ -132,6 +132,9 @@ enum net_device_flags {
 #define IFF_ECHO                       IFF_ECHO
 #endif /* __UAPI_DEF_IF_NET_DEVICE_FLAGS_LOWER_UP_DORMANT_ECHO */
 
+#define IFF_NOMULTIPATH        0x80000         /* Disable for MPTCP            */
+#define IFF_MPBACKUP   0x100000        /* Use as backup path for MPTCP */
+
 #define IFF_VOLATILE   (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
                IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
 
index 6a64beeecfad8d26d651f653e361fb56e6b623a6..d9abf0d559dec3703034d8bb3228d1866daf8e83 100644 (file)
 #ifndef _UAPI_LINUX_TCP_H
 #define _UAPI_LINUX_TCP_H
 
-#include <linux/types.h>
+#ifndef __KERNEL__
+#include <sys/socket.h>
+#endif
+
 #include <asm/byteorder.h>
+#include <linux/in.h>
+#include <linux/in6.h>
 #include <linux/socket.h>
+#include <linux/types.h>
 
 struct tcphdr {
        __be16  source;
@@ -120,6 +126,12 @@ enum {
 #define TCP_FASTOPEN_CONNECT   30      /* Attempt FastOpen with connect */
 #define TCP_ULP                        31      /* Attach a ULP to a TCP connection */
 #define TCP_MD5SIG_EXT         32      /* TCP MD5 Signature with extensions */
+#define MPTCP_ENABLED          42
+#define MPTCP_SCHEDULER                43
+#define MPTCP_PATH_MANAGER     44
+#define MPTCP_INFO             45
+
+#define MPTCP_INFO_FLAG_SAVE_MASTER    0x01
 
 struct tcp_repair_opt {
        __u32   opt_code;
@@ -242,6 +254,53 @@ enum {
 
 };
 
+struct mptcp_meta_info {
+       __u8    mptcpi_state;
+       __u8    mptcpi_retransmits;
+       __u8    mptcpi_probes;
+       __u8    mptcpi_backoff;
+
+       __u32   mptcpi_rto;
+       __u32   mptcpi_unacked;
+
+       /* Times. */
+       __u32   mptcpi_last_data_sent;
+       __u32   mptcpi_last_data_recv;
+       __u32   mptcpi_last_ack_recv;
+
+       __u32   mptcpi_total_retrans;
+
+       __u64   mptcpi_bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
+       __u64   mptcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
+};
+
+struct mptcp_sub_info {
+       union {
+               struct sockaddr src;
+               struct sockaddr_in src_v4;
+               struct sockaddr_in6 src_v6;
+       };
+
+       union {
+               struct sockaddr dst;
+               struct sockaddr_in dst_v4;
+               struct sockaddr_in6 dst_v6;
+       };
+};
+
+struct mptcp_info {
+       __u32   tcp_info_len;   /* Length of each struct tcp_info in the subflows array */
+       __u32   sub_len;        /* Total length of the memory pointed to by subflows */
+       __u32   meta_len;       /* Length of the memory pointed to by meta_info */
+       __u32   sub_info_len;   /* Length of each struct mptcp_sub_info in the subflow_info array */
+       __u32   total_sub_info_len;     /* Total length of the memory pointed to by subflow_info */
+
+       struct mptcp_meta_info  *meta_info;
+       struct tcp_info         *initial;
+       struct tcp_info         *subflows;      /* Pointer to array of tcp_info structs */
+       struct mptcp_sub_info   *subflow_info;
+};
+
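struct mptcp_info acts as a gather header: user space supplies the buffers and per-entry sizes, and the kernel copies out the meta-socket state plus one tcp_info/mptcp_sub_info pair per subflow. A hedged getsockopt() sketch (sizing for 8 subflows is arbitrary; the exact copy-out behaviour is an assumption):

    #include <sys/socket.h>
    #include <linux/tcp.h>          /* patched header: MPTCP_INFO and structs */

    static int query_mptcp(int fd, struct mptcp_meta_info *meta,
                           struct tcp_info subs[8],
                           struct mptcp_sub_info addrs[8])
    {
            struct mptcp_info minfo = {
                    .tcp_info_len       = sizeof(subs[0]),
                    .sub_len            = 8 * sizeof(subs[0]),
                    .meta_len           = sizeof(*meta),
                    .sub_info_len       = sizeof(addrs[0]),
                    .total_sub_info_len = 8 * sizeof(addrs[0]),
                    .meta_info          = meta,
                    .initial            = NULL, /* see MPTCP_INFO_FLAG_SAVE_MASTER */
                    .subflows           = subs,
                    .subflow_info       = addrs,
            };
            socklen_t optlen = sizeof(minfo);

            return getsockopt(fd, IPPROTO_TCP, MPTCP_INFO, &minfo, &optlen);
    }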
 /* for TCP_MD5SIG socket option */
 #define TCP_MD5SIG_MAXKEYLEN   80
 
index 9dba2715919d7668af369a34ed2d67089b112cd4..009ec219ebb09677ad7423cde21ae0a5e20fe490 100644 (file)
@@ -88,6 +88,7 @@ if INET
 source "net/ipv4/Kconfig"
 source "net/ipv6/Kconfig"
 source "net/netlabel/Kconfig"
+source "net/mptcp/Kconfig"
 
 endif # if INET
 
index 14fede520840e25aa2db28c5d1aeb02a8fc1f2d9..4d9fb91d9ada413bdbd9871c1069f2e328ef90b1 100644 (file)
@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS)             += tls/
 obj-$(CONFIG_XFRM)             += xfrm/
 obj-$(CONFIG_UNIX)             += unix/
 obj-$(CONFIG_NET)              += ipv6/
+obj-$(CONFIG_MPTCP)            += mptcp/
 obj-$(CONFIG_PACKET)           += packet/
 obj-$(CONFIG_NET_KEY)          += key/
 obj-$(CONFIG_BRIDGE)           += bridge/
index 85f4a1047707522ae16775b58131be74a03ea863..9ca5bb0fe68ac5e27941492123f7083a2f08dd57 100644 (file)
@@ -6740,7 +6740,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
 
        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
-                              IFF_AUTOMEDIA)) |
+                              IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));
 
index 2e5eeba97de9e0f8590ab06fe45b653ce32efc84..5ea73c6919d2bfb894596d9d6e2fd8b69df96576 100644 (file)
@@ -532,7 +532,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
        skb_drop_list(&skb_shinfo(skb)->frag_list);
 }
 
-static void skb_clone_fraglist(struct sk_buff *skb)
+void skb_clone_fraglist(struct sk_buff *skb)
 {
        struct sk_buff *list;
 
@@ -1304,7 +1304,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
        skb->inner_mac_header += off;
 }
 
-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
+void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 {
        __copy_skb_header(new, old);
 
index 68d08ed5521e5ce808c582b91b22246bae68ff78..bd3940d7da8551a55eec5cda0fe5f8f6ee0fa86c 100644 (file)
 
 #include <trace/events/sock.h>
 
+#ifdef CONFIG_MPTCP
+#include <net/mptcp.h>
+#include <net/inet_common.h>
+#endif
+
 #include <net/tcp.h>
 #include <net/busy_poll.h>
 
@@ -1415,6 +1420,23 @@ lenout:
  */
 static inline void sock_lock_init(struct sock *sk)
 {
+#ifdef CONFIG_MPTCP
+       /* Reclassify the lock-class for subflows */
+       if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
+               if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
+                       sock_lock_init_class_and_name(sk, meta_slock_key_name,
+                                                     &meta_slock_key,
+                                                     meta_key_name,
+                                                     &meta_key);
+
+                       /* The mptcp-pointer (tp->mptcp) is not set up yet,
+                        * so we still need inet_sock_destruct
+                        */
+                       sk->sk_destruct = inet_sock_destruct;
+                       return;
+               }
+#endif
+
        if (sk->sk_kern_sock)
                sock_lock_init_class_and_name(
                        sk,
@@ -1463,8 +1485,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
                if (!sk)
                        return sk;
-               if (priority & __GFP_ZERO)
-                       sk_prot_clear_nulls(sk, prot->obj_size);
+               if (priority & __GFP_ZERO) {
+                       if (prot->clear_sk)
+                               prot->clear_sk(sk, prot->obj_size);
+                       else
+                               sk_prot_clear_nulls(sk, prot->obj_size);
+               }
        } else
                sk = kmalloc(prot->obj_size, priority);
 
@@ -1681,6 +1707,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
                atomic_set(&newsk->sk_zckey, 0);
 
                sock_reset_flag(newsk, SOCK_DONE);
+               sock_reset_flag(newsk, SOCK_MPTCP);
                mem_cgroup_sk_alloc(newsk);
                cgroup_sk_alloc(&newsk->sk_cgrp_data);
 
index f48fe6fc7e8c413d7d7e4d7d37d1d859a566e8fb..93def62ca73eb328f5227c083aaca9ffcc970919 100644 (file)
@@ -675,6 +675,38 @@ config TCP_CONG_BBR
        bufferbloat, policers, or AQM schemes that do not provide a delay
        signal. It requires the fq ("Fair Queue") pacing packet scheduler.
 
+config TCP_CONG_LIA
+       tristate "MPTCP Linked Increase"
+       depends on MPTCP
+       default n
+       ---help---
+       MultiPath TCP Linked Increase Congestion Control
+       To enable it, just put 'lia' in tcp_congestion_control
+
+config TCP_CONG_OLIA
+       tristate "MPTCP Opportunistic Linked Increase"
+       depends on MPTCP
+       default n
+       ---help---
+       MultiPath TCP Opportunistic Linked Increase Congestion Control
+       To enable it, just put 'olia' in tcp_congestion_control
+
+config TCP_CONG_WVEGAS
+       tristate "MPTCP WVEGAS CONGESTION CONTROL"
+       depends on MPTCP
+       default n
+       ---help---
+       wVegas congestion control for MPTCP
+       To enable it, just put 'wvegas' in tcp_congestion_control
+
+config TCP_CONG_BALIA
+       tristate "MPTCP BALIA CONGESTION CONTROL"
+       depends on MPTCP
+       default n
+       ---help---
+       Multipath TCP Balanced Linked Adaptation Congestion Control
+       To enable it, just put 'balia' in tcp_congestion_control
+
 choice
        prompt "Default TCP congestion control"
        default DEFAULT_CUBIC
@@ -712,6 +744,18 @@ choice
        config DEFAULT_BBR
                bool "BBR" if TCP_CONG_BBR=y
 
+       config DEFAULT_LIA
+               bool "Lia" if TCP_CONG_LIA=y
+
+       config DEFAULT_OLIA
+               bool "Olia" if TCP_CONG_OLIA=y
+
+       config DEFAULT_WVEGAS
+               bool "Wvegas" if TCP_CONG_WVEGAS=y
+
+       config DEFAULT_BALIA
+               bool "Balia" if TCP_CONG_BALIA=y
+
        config DEFAULT_RENO
                bool "Reno"
 endchoice
@@ -732,6 +776,10 @@ config DEFAULT_TCP_CONG
        default "vegas" if DEFAULT_VEGAS
        default "westwood" if DEFAULT_WESTWOOD
        default "veno" if DEFAULT_VENO
+       default "lia" if DEFAULT_LIA
+       default "olia" if DEFAULT_OLIA
+       default "wvegas" if DEFAULT_WVEGAS
+       default "balia" if DEFAULT_BALIA
        default "reno" if DEFAULT_RENO
        default "dctcp" if DEFAULT_DCTCP
        default "cdg" if DEFAULT_CDG
index b9d9a2b8792c7a9aa6744f80b55f7b5a727a5cef..884e505f34a75207f6fa61f8524ed59662dc1302 100644 (file)
 #include <net/ip_fib.h>
 #include <net/inet_connection_sock.h>
 #include <net/tcp.h>
+#include <net/mptcp.h>
 #include <net/udp.h>
 #include <net/udplite.h>
 #include <net/ping.h>
@@ -149,6 +150,9 @@ void inet_sock_destruct(struct sock *sk)
                return;
        }
 
+       if (sock_flag(sk, SOCK_MPTCP))
+               mptcp_disable_static_key();
+
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(refcount_read(&sk->sk_wmem_alloc));
        WARN_ON(sk->sk_wmem_queued);
@@ -241,8 +245,7 @@ EXPORT_SYMBOL(inet_listen);
  *     Create an inet socket.
  */
 
-static int inet_create(struct net *net, struct socket *sock, int protocol,
-                      int kern)
+int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
 {
        struct sock *sk;
        struct inet_protosw *answer;
@@ -702,6 +705,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags,
        lock_sock(sk2);
 
        sock_rps_record_flow(sk2);
+
+       if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
+               struct sock *sk_it = sk2;
+
+               mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
+                       sock_rps_record_flow(sk_it);
+
+               if (tcp_sk(sk2)->mpcb->master_sk) {
+                       sk_it = tcp_sk(sk2)->mpcb->master_sk;
+
+                       write_lock_bh(&sk_it->sk_callback_lock);
+                       sk_it->sk_wq = newsock->wq;
+                       sk_it->sk_socket = newsock;
+                       write_unlock_bh(&sk_it->sk_callback_lock);
+               }
+       }
+
        WARN_ON(!((1 << sk2->sk_state) &
                  (TCPF_ESTABLISHED | TCPF_SYN_RECV |
                  TCPF_CLOSE_WAIT | TCPF_CLOSE)));
@@ -1882,6 +1902,9 @@ static int __init inet_init(void)
 
        ip_init();
 
+       /* We must initialize MPTCP before TCP. */
+       mptcp_init();
+
        /* Setup TCP slab cache for open requests. */
        tcp_init();
 
index 0cc08c51220230be1daf2f444f73230103d1e5d4..fcc90f8e125909cefdd0df441f3d5d6ed51d4e30 100644 (file)
@@ -23,6 +23,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
+#include <net/mptcp.h>
 #include <net/tcp.h>
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
@@ -686,7 +687,10 @@ static void reqsk_timer_handler(unsigned long data)
        int max_retries, thresh;
        u8 defer_accept;
 
-       if (sk_state_load(sk_listener) != TCP_LISTEN)
+       if (sk_state_load(sk_listener) != TCP_LISTEN && !is_meta_sk(sk_listener))
+               goto drop;
+
+       if (is_meta_sk(sk_listener) && !mptcp_can_new_subflow(sk_listener))
                goto drop;
 
        max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
@@ -780,7 +784,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
                                 const struct request_sock *req,
                                 const gfp_t priority)
 {
-       struct sock *newsk = sk_clone_lock(sk, priority);
+       struct sock *newsk;
+
+       newsk = sk_clone_lock(sk, priority);
 
        if (newsk) {
                struct inet_connection_sock *newicsk = inet_csk(newsk);
@@ -980,7 +986,12 @@ void inet_csk_listen_stop(struct sock *sk)
         */
        while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
                struct sock *child = req->sk;
+               bool mutex_taken = false;
 
+               if (is_meta_sk(child)) {
+                       mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
+                       mutex_taken = true;
+               }
                local_bh_disable();
                bh_lock_sock(child);
                WARN_ON(sock_owned_by_user(child));
@@ -990,6 +1001,8 @@ void inet_csk_listen_stop(struct sock *sk)
                reqsk_put(req);
                bh_unlock_sock(child);
                local_bh_enable();
+               if (mutex_taken)
+                       mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
                sock_put(child);
 
                cond_resched();
index 048d5f6dd320e1f5cbf32438b572b02630387666..19224be59cb4d9ee514699dc6733342184bec5b7 100644 (file)
@@ -44,6 +44,8 @@
 #endif
 #include <net/ip_fib.h>
 
+#include <net/mptcp.h>
+
 #include <linux/errqueue.h>
 #include <linux/uaccess.h>
 
@@ -755,6 +757,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
                        inet->tos = val;
                        sk->sk_priority = rt_tos2priority(val);
                        sk_dst_reset(sk);
+                       /* Propagate the TOS change to all mptcp subflows */
+                       if (is_meta_sk(sk)) {
+                               struct sock *sk_it;
+
+                               mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
+                                       if (inet_sk(sk_it)->tos != inet_sk(sk)->tos) {
+                                               inet_sk(sk_it)->tos = inet_sk(sk)->tos;
+                                               sk_it->sk_priority = sk->sk_priority;
+                                               sk_dst_reset(sk_it);
+                                       }
+                               }
+                       }
                }
                break;
        case IP_TTL:
index 77cf32a80952fcf3ceff4ada946cc2d0df2411d9..230202cb8ad334e3bc3ef013ff120985b4d16e9e 100644 (file)
@@ -16,6 +16,8 @@
 #include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
 #include <net/secure_seq.h>
 #include <net/tcp.h>
 #include <net/route.h>
@@ -179,7 +181,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 }
 EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
 
-__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v4_init_sequence(struct request_sock *req, const struct sock *sk,
+                             const struct sk_buff *skb, __u16 *mssp)
 {
        const struct iphdr *iph = ip_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
@@ -209,9 +212,27 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct sock *child;
        bool own_req;
+#ifdef CONFIG_MPTCP
+       int ret;
+#endif
 
        child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
                                                 NULL, &own_req);
+
+#ifdef CONFIG_MPTCP
+       if (!child)
+               goto listen_overflow;
+
+       ret = mptcp_check_req_master(sk, child, req, skb, 0);
+       if (ret < 0)
+               return NULL;
+
+       if (!ret)
+               return tcp_sk(child)->mpcb->master_sk;
+
+listen_overflow:
+#endif
+
        if (child) {
                refcount_set(&req->rsk_refcnt, 1);
                tcp_sk(child)->tsoffset = tsoff;
@@ -284,6 +305,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 {
        struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
        struct tcp_options_received tcp_opt;
+       struct mptcp_options_received mopt;
        struct inet_request_sock *ireq;
        struct tcp_request_sock *treq;
        struct tcp_sock *tp = tcp_sk(sk);
@@ -313,7 +335,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 
        /* check for timestamp cookie support */
        memset(&tcp_opt, 0, sizeof(tcp_opt));
-       tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+       mptcp_init_mp_opt(&mopt);
+       tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL);
 
        if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
                tsoff = secure_tcp_ts_off(sock_net(sk),
@@ -326,7 +349,12 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
                goto out;
 
        ret = NULL;
-       req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
+#ifdef CONFIG_MPTCP
+       if (mopt.saw_mpc)
+               req = inet_reqsk_alloc(&mptcp_request_sock_ops, sk, false); /* for safety */
+       else
+#endif
+               req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
        if (!req)
                goto out;
 
@@ -346,12 +374,17 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
        ireq->sack_ok           = tcp_opt.sack_ok;
        ireq->wscale_ok         = tcp_opt.wscale_ok;
        ireq->tstamp_ok         = tcp_opt.saw_tstamp;
+       ireq->mptcp_rqsk        = 0;
+       ireq->saw_mpc           = 0;
        req->ts_recent          = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
        treq->snt_synack        = 0;
        treq->tfo_listener      = false;
 
        ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
 
+       if (mopt.saw_mpc)
+               mptcp_cookies_reqsk_init(req, &mopt, skb);
+
        /* We throwed the options of the initial SYN away, so we hope
         * the ACK carries the same options again (see RFC1122 4.2.3.8)
         */
@@ -385,10 +418,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
        /* Try to redo what tcp_v4_send_synack did. */
        req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
 
-       tcp_select_initial_window(tcp_full_space(sk), req->mss,
-                                 &req->rsk_rcv_wnd, &req->rsk_window_clamp,
-                                 ireq->wscale_ok, &rcv_wscale,
-                                 dst_metric(&rt->dst, RTAX_INITRWND));
+       tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
+                                      &req->rsk_rcv_wnd, &req->rsk_window_clamp,
+                                      ireq->wscale_ok, &rcv_wscale,
+                                      dst_metric(&rt->dst, RTAX_INITRWND), sk);
 
        ireq->rcv_wscale  = rcv_wscale;
        ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
index 7462ec7587ced26dd31b14a51b2083e63b1149dd..6de5caaa20d26b1762da04e93ce917caa37fc5b6 100644 (file)
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
+#include <net/mptcp.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -402,6 +403,24 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
        return rate64;
 }
 
+const struct tcp_sock_ops tcp_specific = {
+       .__select_window                = __tcp_select_window,
+       .select_window                  = tcp_select_window,
+       .select_initial_window          = tcp_select_initial_window,
+       .select_size                    = select_size,
+       .init_buffer_space              = tcp_init_buffer_space,
+       .set_rto                        = tcp_set_rto,
+       .should_expand_sndbuf           = tcp_should_expand_sndbuf,
+       .send_fin                       = tcp_send_fin,
+       .write_xmit                     = tcp_write_xmit,
+       .send_active_reset              = tcp_send_active_reset,
+       .write_wakeup                   = tcp_write_wakeup,
+       .retransmit_timer               = tcp_retransmit_timer,
+       .time_wait                      = tcp_time_wait,
+       .cleanup_rbuf                   = tcp_cleanup_rbuf,
+       .cwnd_validate                  = tcp_cwnd_validate,
+};
+
 /* Address-family independent initialization for a tcp_sock.
  *
  * NOTE: A lot of things set to zero explicitly by call to
@@ -452,6 +471,11 @@ void tcp_init_sock(struct sock *sk)
        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
+       tp->ops = &tcp_specific;
+
+       /* Initialize MPTCP-specific state and function pointers */
+       mptcp_init_tcp_sock(sk);
+
        sk_sockets_allocated_inc(sk);
 }
 EXPORT_SYMBOL(tcp_init_sock);
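
The tcp_sock_ops table above converts formerly hard-wired calls (tcp_select_window(), tcp_send_fin(), and so on) into per-socket tp->ops-> indirections, so that MPTCP can substitute its own implementations without touching every call site; mptcp_init_tcp_sock() is where an MPTCP-capable socket would swap the table. A minimal, self-contained user-space sketch of the pattern (all names below are illustrative, not the kernel's):

        #include <stdio.h>

        /* Illustrative stand-in for struct tcp_sock_ops: a per-object vtable. */
        struct conn_ops {
                void (*send_fin)(const char *who);
        };

        static void tcp_like_send_fin(const char *who)
        {
                printf("%s: plain TCP FIN\n", who);
        }

        static void mptcp_like_send_fin(const char *who)
        {
                printf("%s: DATA_FIN at the meta level, FIN per subflow\n", who);
        }

        static const struct conn_ops tcp_like = { .send_fin = tcp_like_send_fin };
        static const struct conn_ops mptcp_like = { .send_fin = mptcp_like_send_fin };

        struct conn {
                const struct conn_ops *ops;     /* like tp->ops */
        };

        int main(void)
        {
                struct conn plain = { .ops = &tcp_like };
                struct conn multi = { .ops = &mptcp_like };

                /* Shared code path, like tcp_shutdown() calling tp->ops->send_fin() */
                plain.ops->send_fin("plain");
                multi.ops->send_fin("multi");
                return 0;
        }
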
@@ -766,6 +790,15 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
        int ret;
 
        sock_rps_record_flow(sk);
+
+#ifdef CONFIG_MPTCP
+       if (mptcp(tcp_sk(sk))) {
+               struct sock *sk_it;
+
+               mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
+                       sock_rps_record_flow(sk_it);
+       }
+#endif
        /*
         * We can't seek on a socket input
         */
@@ -879,8 +912,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
        return NULL;
 }
 
-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
-                                      int large_allowed)
+unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        u32 new_size_goal, size_goal;
@@ -908,8 +940,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 {
        int mss_now;
 
-       mss_now = tcp_current_mss(sk);
-       *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+       if (mptcp(tcp_sk(sk))) {
+               mss_now = mptcp_current_mss(sk);
+               *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+       } else {
+               mss_now = tcp_current_mss(sk);
+               *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
+       }
 
        return mss_now;
 }
@@ -928,12 +965,33 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
         * is fully established.
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
-           !tcp_passive_fastopen(sk)) {
+           !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
+                                 tp->mpcb->master_sk : sk)) {
                err = sk_stream_wait_connect(sk, &timeo);
                if (err != 0)
                        goto out_err;
        }
 
+       if (mptcp(tp)) {
+               struct sock *sk_it = sk;
+
+               /* We must check this with the socket lock held, because we
+                * iterate over the subflows.
+                */
+               if (!mptcp_can_sendpage(sk)) {
+                       ssize_t ret;
+
+                       release_sock(sk);
+                       ret = sock_no_sendpage(sk->sk_socket, page, offset,
+                                              size, flags);
+                       lock_sock(sk);
+                       return ret;
+               }
+
+               mptcp_for_each_sk(tp->mpcb, sk_it)
+                       sock_rps_record_flow(sk_it);
+       }
+
        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
        mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1050,8 +1108,9 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
                        size_t size, int flags)
 {
-       if (!(sk->sk_route_caps & NETIF_F_SG) ||
-           !sk_check_csum_caps(sk))
+       /* If MPTCP is enabled, this check is deferred until after establishment */
+       if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
+           !sk_check_csum_caps(sk)))
                return sock_no_sendpage_locked(sk, page, offset, size, flags);
 
        tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
@@ -1083,14 +1142,14 @@ EXPORT_SYMBOL(tcp_sendpage);
  * This also speeds up tso_fragment(), since it won't fall back
  * to tcp_fragment().
  */
-static int linear_payload_sz(bool first_skb)
+int linear_payload_sz(bool first_skb)
 {
        if (first_skb)
                return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
        return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+int select_size(const struct sock *sk, bool sg, bool first_skb)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
        int tmp = tp->mss_cache;
@@ -1212,12 +1271,20 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
         * is fully established.
         */
        if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
-           !tcp_passive_fastopen(sk)) {
+           !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
+                                 tp->mpcb->master_sk : sk)) {
                err = sk_stream_wait_connect(sk, &timeo);
                if (err != 0)
                        goto do_error;
        }
 
+       if (mptcp(tp)) {
+               struct sock *sk_it = sk;
+
+               mptcp_for_each_sk(tp->mpcb, sk_it)
+                       sock_rps_record_flow(sk_it);
+       }
+
        if (unlikely(tp->repair)) {
                if (tp->repair_queue == TCP_RECV_QUEUE) {
                        copied = tcp_send_rcvq(sk, msg, size);
@@ -1253,7 +1320,10 @@ restart:
        if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
                goto do_error;
 
-       sg = !!(sk->sk_route_caps & NETIF_F_SG);
+       if (mptcp(tp))
+               sg = mptcp_can_sg(sk);
+       else
+               sg = !!(sk->sk_route_caps & NETIF_F_SG);
 
        while (msg_data_left(msg)) {
                int copy = 0;
@@ -1282,7 +1352,7 @@ new_segment:
                        }
                        first_skb = skb_queue_empty(&sk->sk_write_queue);
                        skb = sk_stream_alloc_skb(sk,
-                                                 select_size(sk, sg, first_skb),
+                                                 tp->ops->select_size(sk, sg, first_skb),
                                                  sk->sk_allocation,
                                                  first_skb);
                        if (!skb)
@@ -1291,8 +1361,15 @@ new_segment:
                        process_backlog = true;
                        /*
                         * Check whether we can use HW checksum.
+                        *
+                        * If dss-csum is enabled, we do not use hw-csum.
+                        * In the non-MPTCP case we check the device
+                        * capabilities.
+                        * In the MPTCP case, hw-csums are handled later,
+                        * in mptcp_write_xmit.
                         */
-                       if (sk_check_csum_caps(sk))
+                       if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
+                           (mptcp(tp) || sk_check_csum_caps(sk)))
                                skb->ip_summed = CHECKSUM_PARTIAL;
 
                        skb_entail(sk, skb);
@@ -1523,7 +1600,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        bool time_to_ack = false;
@@ -1566,7 +1643,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 
                /* Optimize, __tcp_select_window() is not cheap. */
                if (2*rcv_window_now <= tp->window_clamp) {
-                       __u32 new_window = __tcp_select_window(sk);
+                       __u32 new_window = tp->ops->__select_window(sk);
 
                        /* Send ACK now, if this read freed lots of space
                         * in our buffer. Certainly, new_window is new window.
@@ -1682,7 +1759,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
        /* Clean up data we have read: This will do ACK frames. */
        if (copied > 0) {
                tcp_recv_skb(sk, seq, &offset);
-               tcp_cleanup_rbuf(sk, copied);
+               tp->ops->cleanup_rbuf(sk, copied);
        }
        return copied;
 }
@@ -1782,6 +1859,15 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
        lock_sock(sk);
 
+#ifdef CONFIG_MPTCP
+       if (mptcp(tp)) {
+               struct sock *sk_it;
+
+               mptcp_for_each_sk(tp->mpcb, sk_it)
+                       sock_rps_record_flow(sk_it);
+       }
+#endif
+
        err = -ENOTCONN;
        if (sk->sk_state == TCP_LISTEN)
                goto out;
@@ -1902,7 +1988,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
                        }
                }
 
-               tcp_cleanup_rbuf(sk, copied);
+               tp->ops->cleanup_rbuf(sk, copied);
 
                if (copied >= target) {
                        /* Do not sleep, just process backlog. */
@@ -1995,7 +2081,7 @@ skip_copy:
                tcp_recv_timestamp(msg, sk, &tss);
 
        /* Clean up data we have read: This will do ACK frames. */
-       tcp_cleanup_rbuf(sk, copied);
+       tp->ops->cleanup_rbuf(sk, copied);
 
        release_sock(sk);
        return copied;
@@ -2073,7 +2159,7 @@ static const unsigned char new_state[16] = {
   [TCP_NEW_SYN_RECV]   = TCP_CLOSE,    /* should not happen ! */
 };
 
-static int tcp_close_state(struct sock *sk)
+int tcp_close_state(struct sock *sk)
 {
        int next = (int)new_state[sk->sk_state];
        int ns = next & TCP_STATE_MASK;
@@ -2103,7 +2189,7 @@ void tcp_shutdown(struct sock *sk, int how)
             TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
                /* Clear out any half completed packets.  FIN if needed. */
                if (tcp_close_state(sk))
-                       tcp_send_fin(sk);
+                       tcp_sk(sk)->ops->send_fin(sk);
        }
 }
 EXPORT_SYMBOL(tcp_shutdown);
@@ -2128,6 +2214,11 @@ void tcp_close(struct sock *sk, long timeout)
        int data_was_unread = 0;
        int state;
 
+       if (is_meta_sk(sk)) {
+               mptcp_close(sk, timeout);
+               return;
+       }
+
        lock_sock(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
 
@@ -2172,7 +2263,7 @@ void tcp_close(struct sock *sk, long timeout)
                /* Unread data was tossed, zap the connection. */
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
                tcp_set_state(sk, TCP_CLOSE);
-               tcp_send_active_reset(sk, sk->sk_allocation);
+               tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
        } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
@@ -2252,7 +2343,7 @@ adjudge_to_death:
                struct tcp_sock *tp = tcp_sk(sk);
                if (tp->linger2 < 0) {
                        tcp_set_state(sk, TCP_CLOSE);
-                       tcp_send_active_reset(sk, GFP_ATOMIC);
+                       tp->ops->send_active_reset(sk, GFP_ATOMIC);
                        __NET_INC_STATS(sock_net(sk),
                                        LINUX_MIB_TCPABORTONLINGER);
                } else {
@@ -2262,7 +2353,8 @@ adjudge_to_death:
                                inet_csk_reset_keepalive_timer(sk,
                                                tmo - TCP_TIMEWAIT_LEN);
                        } else {
-                               tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+                               tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
+                                                          tmo);
                                goto out;
                        }
                }
@@ -2271,7 +2363,7 @@ adjudge_to_death:
                sk_mem_reclaim(sk);
                if (tcp_check_oom(sk, 0)) {
                        tcp_set_state(sk, TCP_CLOSE);
-                       tcp_send_active_reset(sk, GFP_ATOMIC);
+                       tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
                        __NET_INC_STATS(sock_net(sk),
                                        LINUX_MIB_TCPABORTONMEMORY);
                } else if (!check_net(sock_net(sk))) {
@@ -2299,15 +2391,6 @@ out:
 }
 EXPORT_SYMBOL(tcp_close);
 
-/* These states need RST on ABORT according to RFC793 */
-
-static inline bool tcp_need_reset(int state)
-{
-       return (1 << state) &
-              (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
-               TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
-}
-
 int tcp_disconnect(struct sock *sk, int flags)
 {
        struct inet_sock *inet = inet_sk(sk);
@@ -2330,7 +2413,7 @@ int tcp_disconnect(struct sock *sk, int flags)
                /* The last check adjusts for discrepancy of Linux wrt. RFC
                 * states
                 */
-               tcp_send_active_reset(sk, gfp_any());
+               tp->ops->send_active_reset(sk, gfp_any());
                sk->sk_err = ECONNRESET;
        } else if (old_state == TCP_SYN_SENT)
                sk->sk_err = ECONNRESET;
@@ -2346,6 +2429,13 @@ int tcp_disconnect(struct sock *sk, int flags)
        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
                inet_reset_saddr(sk);
 
+       if (is_meta_sk(sk)) {
+               mptcp_disconnect(sk);
+       } else {
+               if (tp->inside_tk_table)
+                       mptcp_hash_remove_bh(tp);
+       }
+
        sk->sk_shutdown = 0;
        sock_reset_flag(sk, SOCK_DONE);
        tp->srtt_us = 0;
@@ -2528,6 +2618,61 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                release_sock(sk);
                return err;
        }
+#ifdef CONFIG_MPTCP
+       case MPTCP_SCHEDULER: {
+               char name[MPTCP_SCHED_NAME_MAX];
+
+               if (optlen < 1)
+                       return -EINVAL;
+
+               /* This cannot be used if MPTCP is disabled or if the socket
+                * has already left the TCP_CLOSE state.
+                */
+               if (mptcp_init_failed || !sysctl_mptcp_enabled ||
+                   sk->sk_state != TCP_CLOSE)
+                       return -EPERM;
+
+               val = strncpy_from_user(name, optval,
+                                       min_t(long, MPTCP_SCHED_NAME_MAX - 1,
+                                             optlen));
+
+               if (val < 0)
+                       return -EFAULT;
+               name[val] = 0;
+
+               lock_sock(sk);
+               err = mptcp_set_scheduler(sk, name);
+               release_sock(sk);
+               return err;
+       }
+
+       case MPTCP_PATH_MANAGER: {
+               char name[MPTCP_PM_NAME_MAX];
+
+               if (optlen < 1)
+                       return -EINVAL;
+
+               /* This cannot be used if MPTCP is disabled or if the socket
+                * has already left the TCP_CLOSE state.
+                */
+               if (mptcp_init_failed || !sysctl_mptcp_enabled ||
+                   sk->sk_state != TCP_CLOSE)
+                       return -EPERM;
+
+               val = strncpy_from_user(name, optval,
+                                       min_t(long, MPTCP_PM_NAME_MAX - 1,
+                                             optlen));
+
+               if (val < 0)
+                       return -EFAULT;
+               name[val] = 0;
+
+               lock_sock(sk);
+               err = mptcp_set_path_manager(sk, name);
+               release_sock(sk);
+               return err;
+       }
+#endif
        default:
                /* fallthru */
                break;
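
For reference, a hedged user-space sketch of driving the two string options added above; the MPTCP_SCHEDULER and MPTCP_PATH_MANAGER constants are the ones this patch exports through include/uapi/linux/tcp.h, and "roundrobin"/"fullmesh" are names the net/mptcp modules of this series register. Both calls must happen while the socket is still in TCP_CLOSE, as the checks above enforce:

        #include <stdio.h>
        #include <string.h>
        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <linux/tcp.h>  /* patched header: MPTCP_SCHEDULER, MPTCP_PATH_MANAGER */

        int main(void)
        {
                int fd = socket(AF_INET, SOCK_STREAM, 0);

                /* Only accepted before connect()/listen(), i.e. in TCP_CLOSE. */
                if (setsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER,
                               "roundrobin", strlen("roundrobin") + 1) < 0)
                        perror("MPTCP_SCHEDULER");
                if (setsockopt(fd, IPPROTO_TCP, MPTCP_PATH_MANAGER,
                               "fullmesh", strlen("fullmesh") + 1) < 0)
                        perror("MPTCP_PATH_MANAGER");
                return 0;
        }
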
@@ -2705,6 +2850,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                break;
 
        case TCP_DEFER_ACCEPT:
+               /* An established MPTCP connection (mptcp(tp) only returns
+                * true once the connection is established) must not use
+                * TCP_DEFER_ACCEPT on new subflows.
+                */
+               if (mptcp(tp))
+                       break;
                /* Translate value in seconds to number of retransmits */
                icsk->icsk_accept_queue.rskq_defer_accept =
                        secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
@@ -2732,7 +2883,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                            (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
                            inet_csk_ack_scheduled(sk)) {
                                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-                               tcp_cleanup_rbuf(sk, 1);
+                               tp->ops->cleanup_rbuf(sk, 1);
                                if (!(val & 1))
                                        icsk->icsk_ack.pingpong = 1;
                        }
@@ -2793,6 +2944,28 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                tp->notsent_lowat = val;
                sk->sk_write_space(sk);
                break;
+#ifdef CONFIG_MPTCP
+       case MPTCP_ENABLED:
+               if (mptcp_init_failed || !sysctl_mptcp_enabled ||
+                   sk->sk_state != TCP_CLOSE) {
+                       err = -EPERM;
+                       break;
+               }
+
+               if (val)
+                       mptcp_enable_sock(sk);
+               else
+                       mptcp_disable_sock(sk);
+               break;
+       case MPTCP_INFO:
+               if (mptcp_init_failed || !sysctl_mptcp_enabled) {
+                       err = -EPERM;
+                       break;
+               }
+
+               tp->record_master_info = !!(val & MPTCP_INFO_FLAG_SAVE_MASTER);
+               break;
+#endif
        default:
                err = -ENOPROTOOPT;
                break;
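
And the matching opt-in for the per-socket switch above: with the mptcp_enabled sysctl set to 2, only sockets that ask for it negotiate MPTCP. A hedged sketch, again assuming the MPTCP_ENABLED constant from the patched UAPI header:

        #include <stdio.h>
        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <linux/tcp.h>  /* patched header: MPTCP_ENABLED */

        int main(void)
        {
                int fd = socket(AF_INET, SOCK_STREAM, 0);
                int one = 1;

                /* Only permitted while the socket is in TCP_CLOSE (see above);
                 * the subsequent connect() then offers MP_CAPABLE in its SYN.
                 */
                if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
                        perror("MPTCP_ENABLED");
                return 0;
        }
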
@@ -3220,6 +3393,75 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                }
                return 0;
        }
+#ifdef CONFIG_MPTCP
+       case MPTCP_SCHEDULER:
+               if (get_user(len, optlen))
+                       return -EFAULT;
+               len = min_t(unsigned int, len, MPTCP_SCHED_NAME_MAX);
+               if (put_user(len, optlen))
+                       return -EFAULT;
+
+               if (mptcp(tcp_sk(sk))) {
+                       struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb;
+
+                       if (copy_to_user(optval, mpcb->sched_ops->name, len))
+                               return -EFAULT;
+               } else {
+                       if (copy_to_user(optval, tcp_sk(sk)->mptcp_sched_name,
+                                        len))
+                               return -EFAULT;
+               }
+               return 0;
+
+       case MPTCP_PATH_MANAGER:
+               if (get_user(len, optlen))
+                       return -EFAULT;
+               len = min_t(unsigned int, len, MPTCP_PM_NAME_MAX);
+               if (put_user(len, optlen))
+                       return -EFAULT;
+
+               if (mptcp(tcp_sk(sk))) {
+                       struct mptcp_cb *mpcb = tcp_sk(mptcp_meta_sk(sk))->mpcb;
+
+                       if (copy_to_user(optval, mpcb->pm_ops->name, len))
+                               return -EFAULT;
+               } else {
+                       if (copy_to_user(optval, tcp_sk(sk)->mptcp_pm_name,
+                                        len))
+                               return -EFAULT;
+               }
+               return 0;
+
+       case MPTCP_ENABLED:
+               if (sk->sk_state != TCP_SYN_SENT)
+                       val = mptcp(tp) ? 1 : 0;
+               else
+                       val = sock_flag(sk, SOCK_MPTCP) ? 1 : 0;
+               break;
+       case MPTCP_INFO:
+       {
+               int ret;
+
+               if (!mptcp(tp))
+                       return -EINVAL;
+
+               if (get_user(len, optlen))
+                       return -EFAULT;
+
+               len = min_t(unsigned int, len, sizeof(struct mptcp_info));
+
+               lock_sock(sk);
+               ret = mptcp_get_info(sk, optval, len);
+               release_sock(sk);
+
+               if (ret)
+                       return ret;
+
+               if (put_user(len, optlen))
+                       return -EFAULT;
+               return 0;
+       }
+#endif
        default:
                return -ENOPROTOOPT;
        }
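
The get side mirrors this; a sketch that checks whether MPTCP was actually negotiated and which scheduler the connection uses (the 64-byte buffer is an assumption that comfortably covers MPTCP_SCHED_NAME_MAX):

        #include <stdio.h>
        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <linux/tcp.h>  /* patched header: MPTCP_ENABLED, MPTCP_SCHEDULER */

        int main(void)
        {
                int fd = socket(AF_INET, SOCK_STREAM, 0);
                int on = 0;
                socklen_t onlen = sizeof(on);
                char sched[64]; /* assumed >= MPTCP_SCHED_NAME_MAX */
                socklen_t slen = sizeof(sched);

                if (getsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &on, &onlen) == 0)
                        printf("mptcp negotiated: %d\n", on);
                if (getsockopt(fd, IPPROTO_TCP, MPTCP_SCHEDULER, sched, &slen) == 0)
                        printf("scheduler: %.*s\n", (int)slen, sched);
                return 0;
        }
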
@@ -3394,7 +3636,9 @@ void tcp_done(struct sock *sk)
        if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
                TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
 
+       WARN_ON(sk->sk_state == TCP_CLOSE);
        tcp_set_state(sk, TCP_CLOSE);
+
        tcp_clear_xmit_timers(sk);
        if (req)
                reqsk_fastopen_remove(sk, req, false);
@@ -3410,6 +3654,8 @@ EXPORT_SYMBOL_GPL(tcp_done);
 
 int tcp_abort(struct sock *sk, int err)
 {
+       struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
+
        if (!sk_fullsock(sk)) {
                if (sk->sk_state == TCP_NEW_SYN_RECV) {
                        struct request_sock *req = inet_reqsk(sk);
@@ -3423,7 +3669,7 @@ int tcp_abort(struct sock *sk, int err)
        }
 
        /* Don't race with userspace socket closes such as tcp_close. */
-       lock_sock(sk);
+       lock_sock(meta_sk);
 
        if (sk->sk_state == TCP_LISTEN) {
                tcp_set_state(sk, TCP_CLOSE);
@@ -3432,7 +3678,7 @@ int tcp_abort(struct sock *sk, int err)
 
        /* Don't race with BH socket closes such as inet_csk_listen_stop. */
        local_bh_disable();
-       bh_lock_sock(sk);
+       bh_lock_sock(meta_sk);
 
        if (!sock_flag(sk, SOCK_DEAD)) {
                sk->sk_err = err;
@@ -3444,10 +3690,10 @@ int tcp_abort(struct sock *sk, int err)
                tcp_done(sk);
        }
 
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
        local_bh_enable();
        tcp_write_queue_purge(sk);
-       release_sock(sk);
+       release_sock(meta_sk);
        return 0;
 }
 EXPORT_SYMBOL_GPL(tcp_abort);
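
The lock switching above follows the MPTCP locking model: a subflow is serialized through its meta-socket, so tcp_abort() must take the meta's socket lock (and its BH lock) even when aborting a single subflow; taking only the subflow's own lock would presumably race against meta-level operations such as mptcp_close() running concurrently.
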
index fbbeda64777406d16beb0c2905bedb633c28d33a..7a3c8ce85132bd8b9dad525861857240f37aded1 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/rculist.h>
 #include <net/inetpeer.h>
 #include <net/tcp.h>
+#include <net/mptcp.h>
 
 int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
 
@@ -176,7 +177,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 {
        struct tcp_sock *tp;
        struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
-       struct sock *child;
+       struct sock *child, *meta_sk;
        bool own_req;
 
        req->num_retrans = 0;
@@ -216,20 +217,27 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 
        refcount_set(&req->rsk_refcnt, 2);
 
+       tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+
+       tcp_fastopen_add_skb(child, skb);
+
+       tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
+       tp->rcv_wup = tp->rcv_nxt;
+
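+       /* rcv_nxt and the SYN's payload must be in place before the MPTCP
+        * check below: it may create the meta-socket and redirect child to
+        * the master subflow, which then has to see consistent state.
+        */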
+       meta_sk = child;
+       if (!mptcp_check_req_fastopen(meta_sk, req)) {
+               child = tcp_sk(meta_sk)->mpcb->master_sk;
+               tp = tcp_sk(child);
+       }
+
        /* Now finish processing the fastopen child socket. */
        inet_csk(child)->icsk_af_ops->rebuild_header(child);
        tcp_init_congestion_control(child);
        tcp_mtup_init(child);
        tcp_init_metrics(child);
        tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
-       tcp_init_buffer_space(child);
-
-       tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+       tp->ops->init_buffer_space(child);
 
-       tcp_fastopen_add_skb(child, skb);
-
-       tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
-       tp->rcv_wup = tp->rcv_nxt;
        /* tcp_conn_request() is sending the SYNACK,
         * and queues the child into listener accept queue.
         */
index bdabd748f4bcf24a4c980f4351044c69691a5304..89ceca8e35791634f1bcaf2c69e4bb7ae0266fe6 100644 (file)
@@ -76,6 +76,9 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <linux/errqueue.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+#include <net/mptcp_v6.h>
 
 int sysctl_tcp_fack __read_mostly;
 int sysctl_tcp_max_reordering __read_mostly = 300;
@@ -96,28 +99,6 @@ int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
-#define FLAG_DATA              0x01 /* Incoming frame contained data.          */
-#define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
-#define FLAG_DATA_ACKED                0x04 /* This ACK acknowledged new data.         */
-#define FLAG_RETRANS_DATA_ACKED        0x08 /* "" "" some of which was retransmitted.  */
-#define FLAG_SYN_ACKED         0x10 /* This ACK acknowledged SYN.              */
-#define FLAG_DATA_SACKED       0x20 /* New SACK.                               */
-#define FLAG_ECE               0x40 /* ECE in this ACK                         */
-#define FLAG_LOST_RETRANS      0x80 /* This ACK marks some retransmission lost */
-#define FLAG_SLOWPATH          0x100 /* Do not skip RFC checks for window update.*/
-#define FLAG_ORIG_SACK_ACKED   0x200 /* Never retransmitted data are (s)acked  */
-#define FLAG_SND_UNA_ADVANCED  0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
-#define FLAG_DSACKING_ACK      0x800 /* SACK blocks contained D-SACK info */
-#define FLAG_SET_XMIT_TIMER    0x1000 /* Set TLP or RTO timer */
-#define FLAG_SACK_RENEGING     0x2000 /* snd_una advanced to a sacked seq */
-#define FLAG_UPDATE_TS_RECENT  0x4000 /* tcp_replace_ts_recent() */
-#define FLAG_NO_CHALLENGE_ACK  0x8000 /* do not call tcp_send_challenge_ack()  */
-
-#define FLAG_ACKED             (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
-#define FLAG_NOT_DUP           (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
-#define FLAG_CA_ALERT          (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
-#define FLAG_FORWARD_PROGRESS  (FLAG_ACKED|FLAG_DATA_SACKED)
-
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
 
@@ -329,8 +310,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
        per_mss = roundup_pow_of_two(per_mss) +
                  SKB_DATA_ALIGN(sizeof(struct sk_buff));
 
-       nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
-       nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+       if (mptcp(tp)) {
+               nr_segs = mptcp_check_snd_buf(tp);
+       } else {
+               nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+               nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
+       }
 
        /* Fast Recovery (RFC 5681 3.2) :
         * Cubic needs 1.7 factor, rounded to 2 to include
@@ -339,8 +324,18 @@ static void tcp_sndbuf_expand(struct sock *sk)
        sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
        sndmem *= nr_segs * per_mss;
 
-       if (sk->sk_sndbuf < sndmem)
+       /* MPTCP: from this point on, sndmem is the new contribution of the
+        * current subflow to the aggregated sndbuf.
+        */
+       if (sk->sk_sndbuf < sndmem) {
+               int old_sndbuf = sk->sk_sndbuf;
                sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+               /* MPTCP: the subflow sndbuf has grown, reflect this in the
+                * aggregated sndbuf.
+                */
+               if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
+                       mptcp_update_sndbuf(tp);
+       }
 }
 
 /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
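
A worked example of the computation above, with illustrative numbers: for a 1460-byte MSS, per_mss rounds up to 2048 and gains roughly 256 bytes of skb overhead, so per_mss is about 2304; with nr_segs = 10 (TCP_INIT_CWND) and the factor-2 fast-recovery headroom, sndmem = 2 * 10 * 2304, i.e. roughly 46 KB for a plain TCP socket. Under MPTCP, mptcp_check_snd_buf() supplies nr_segs from the aggregate of all subflows instead, which is why a grown subflow sndbuf must be propagated via mptcp_update_sndbuf().
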
@@ -389,10 +384,15 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
 static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+       if (is_meta_sk(sk))
+               return;
 
        /* Check #1 */
-       if (tp->rcv_ssthresh < tp->window_clamp &&
-           (int)tp->rcv_ssthresh < tcp_space(sk) &&
+       if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
+           (int)meta_tp->rcv_ssthresh < tcp_space(meta_sk) &&
            !tcp_under_memory_pressure(sk)) {
                int incr;
 
@@ -400,14 +400,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
                 * will fit to rcvbuf in future.
                 */
                if (tcp_win_from_space(skb->truesize) <= skb->len)
-                       incr = 2 * tp->advmss;
+                       incr = 2 * meta_tp->advmss;
                else
-                       incr = __tcp_grow_window(sk, skb);
+                       incr = __tcp_grow_window(meta_sk, skb);
 
                if (incr) {
                        incr = max_t(int, incr, 2 * skb->len);
-                       tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
-                                              tp->window_clamp);
+                       meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
+                                       meta_tp->window_clamp);
                        inet_csk(sk)->icsk_ack.quick |= 1;
                }
        }
@@ -601,7 +601,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
        tcp_mstamp_refresh(tp);
        time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
-       if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
+       if (mptcp(tp)) {
+               if (mptcp_check_rtt(tp, time))
+                       return;
+       } else if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
                return;
 
        /* Number of bytes copied to user in last RTT */
@@ -837,7 +840,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
  * routine referred to above.
  */
-static void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
        /* Old crap is replaced with new one. 8)
@@ -1411,7 +1414,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
        int len;
        int in_sack;
 
-       if (!sk_can_gso(sk))
+       /* For MPTCP we cannot shift skb-data and remove one skb from the
+        * send-queue, because this would make us lose the DSS-option (which
+        * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
+        */
+       if (!sk_can_gso(sk) || mptcp(tp))
                goto fallback;
 
        /* Normally R but no L won't result in plain S */
@@ -2963,7 +2970,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
         */
        tcp_update_rtt_min(sk, ca_rtt_us);
        tcp_rtt_estimator(sk, seq_rtt_us);
-       tcp_set_rto(sk);
+       tp->ops->set_rto(sk);
 
        /* RFC6298: only reset backoff on valid RTT measurement. */
        inet_csk(sk)->icsk_backoff = 0;
@@ -3031,7 +3038,7 @@ static void tcp_set_xmit_timer(struct sock *sk)
 }
 
 /* If we get here, the whole TSO packet has not been acked. */
-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
+u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        u32 packets_acked;
@@ -3154,6 +3161,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                 */
                if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
                        flag |= FLAG_DATA_ACKED;
+                       if (mptcp(tp) && mptcp_is_data_seq(skb))
+                               flag |= MPTCP_FLAG_DATA_ACKED;
                } else {
                        flag |= FLAG_SYN_ACKED;
                        tp->retrans_stamp = 0;
@@ -3266,7 +3275,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
        return flag;
 }
 
-static void tcp_ack_probe(struct sock *sk)
+void tcp_ack_probe(struct sock *sk)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3336,9 +3345,8 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
 /* Check that window update is acceptable.
  * The function assumes that snd_una<=ack<=snd_next.
  */
-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
-                                       const u32 ack, const u32 ack_seq,
-                                       const u32 nwin)
+bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
+                          const u32 ack_seq, const u32 nwin)
 {
        return  after(ack, tp->snd_una) ||
                after(ack_seq, tp->snd_wl1) ||
@@ -3558,7 +3566,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 }
 
 /* This routine deals with incoming acks, but not outgoing ones. */
-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
@@ -3665,6 +3673,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
        flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
                                    &sack_state);
 
+       if (mptcp(tp)) {
+               if (mptcp_fallback_infinite(sk, flag)) {
+                       pr_err("%s resetting flow\n", __func__);
+                       mptcp_send_reset(sk);
+                       goto invalid_ack;
+               }
+
+               mptcp_clean_rtx_infinite(skb, sk);
+       }
+
        if (tp->tlp_high_seq)
                tcp_process_tlp_ack(sk, ack, flag);
        /* If needed, reset TLP/RTO timer; RACK may later override this. */
@@ -3743,8 +3761,10 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
  */
 void tcp_parse_options(const struct net *net,
                       const struct sk_buff *skb,
-                      struct tcp_options_received *opt_rx, int estab,
-                      struct tcp_fastopen_cookie *foc)
+                      struct tcp_options_received *opt_rx,
+                      struct mptcp_options_received *mopt,
+                      int estab, struct tcp_fastopen_cookie *foc,
+                      struct tcp_sock *tp)
 {
        const unsigned char *ptr;
        const struct tcphdr *th = tcp_hdr(skb);
@@ -3828,6 +3848,10 @@ void tcp_parse_options(const struct net *net,
                                 */
                                break;
 #endif
+                       case TCPOPT_MPTCP:
+                               mptcp_parse_options(ptr - 2, opsize, mopt, skb, tp);
+                               break;
+
                        case TCPOPT_FASTOPEN:
                                tcp_parse_fastopen_option(
                                        opsize - TCPOLEN_FASTOPEN_BASE,
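
One detail worth spelling out in the hunk above: by the time the TCPOPT_MPTCP case runs, ptr has already been advanced past the kind and length bytes, so ptr - 2 hands mptcp_parse_options() a pointer to the start of the whole option. A self-contained sketch of the standard TCP option walk illustrates the arithmetic (kind 30 is the IANA-assigned MPTCP option; this is a generic sketch, not the kernel's exact loop):

        #include <stdint.h>
        #include <stdio.h>

        /* Walk TCP options: each non-NOP option is kind(1) len(1) data(len - 2). */
        static void walk_tcp_options(const uint8_t *opt, int length)
        {
                const uint8_t *ptr = opt;

                while (length > 0) {
                        uint8_t kind = *ptr++;
                        uint8_t opsize;

                        if (kind == 0)          /* TCPOPT_EOL */
                                return;
                        if (kind == 1) {        /* TCPOPT_NOP */
                                length--;
                                continue;
                        }
                        opsize = *ptr++;
                        if (opsize < 2 || opsize > length)
                                return;         /* malformed option */
                        /* ptr now points at the payload; ptr - 2 is the kind
                         * byte, i.e. what mptcp_parse_options() receives.
                         */
                        printf("kind %u, size %u, option offset %td\n",
                               kind, opsize, (ptr - 2) - opt);
                        ptr += opsize - 2;
                        length -= opsize;
                }
        }

        int main(void)
        {
                /* NOP, NOP, then an MPTCP-style option: kind 30, length 4. */
                uint8_t opts[] = { 1, 1, 30, 4, 0x00, 0x01 };

                walk_tcp_options(opts, (int)sizeof(opts));
                return 0;
        }
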
@@ -3892,7 +3916,9 @@ static bool tcp_fast_parse_options(const struct net *net,
                        return true;
        }
 
-       tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
+       tcp_parse_options(net, skb, &tp->rx_opt,
+                         mptcp(tp) ? &tp->mptcp->rx_opt : NULL, 1, NULL, tp);
+
        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
                tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
@@ -4049,6 +4075,11 @@ void tcp_fin(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
+       if (is_meta_sk(sk)) {
+               mptcp_fin(sk);
+               return;
+       }
+
        inet_csk_schedule_ack(sk);
 
        sk->sk_shutdown |= RCV_SHUTDOWN;
@@ -4059,6 +4090,10 @@ void tcp_fin(struct sock *sk)
        case TCP_ESTABLISHED:
                /* Move to CLOSE_WAIT */
                tcp_set_state(sk, TCP_CLOSE_WAIT);
+
+               if (mptcp(tp))
+                       mptcp_sub_close_passive(sk);
+
                inet_csk(sk)->icsk_ack.pingpong = 1;
                break;
 
@@ -4081,9 +4116,16 @@ void tcp_fin(struct sock *sk)
                tcp_set_state(sk, TCP_CLOSING);
                break;
        case TCP_FIN_WAIT2:
+               if (mptcp(tp)) {
+                       /* The socket will get closed by mptcp_data_ready.
+                        * We first have to process all data-sequences.
+                        */
+                       tp->close_it = 1;
+                       break;
+               }
                /* Received a FIN -- send ACK and enter TIME_WAIT. */
                tcp_send_ack(sk);
-               tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+               tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
                break;
        default:
                /* Only TCP_LISTEN and TCP_CLOSE are left, in these
@@ -4105,6 +4147,10 @@ void tcp_fin(struct sock *sk)
        if (!sock_flag(sk, SOCK_DEAD)) {
                sk->sk_state_change(sk);
 
+               /* Don't wake up MPTCP-subflows */
+               if (mptcp(tp))
+                       return;
+
                /* Do not send POLL_HUP for half duplex close. */
                if (sk->sk_shutdown == SHUTDOWN_MASK ||
                    sk->sk_state == TCP_CLOSE)
@@ -4311,6 +4357,9 @@ static bool tcp_try_coalesce(struct sock *sk,
 
        *fragstolen = false;
 
+       if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
+               return false;
+
        /* Its possible this segment overlaps with prior segment in queue */
        if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
                return false;
@@ -4362,7 +4411,7 @@ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 /* This one checks to see if we can put data from the
  * out_of_order queue into the receive_queue.
  */
-static void tcp_ofo_queue(struct sock *sk)
+void tcp_ofo_queue(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        __u32 dsack_high = tp->rcv_nxt;
@@ -4388,7 +4437,14 @@ static void tcp_ofo_queue(struct sock *sk)
                if (TCP_SKB_CB(skb)->has_rxtstamp)
                        skb->tstamp = TCP_SKB_CB(skb)->swtstamp;
 
-               if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
+               /* With MPTCP, the segment may be empty if it is a
+                * non-data DATA_FIN (see the beginning of tcp_data_queue).
+                *
+                * This only holds true for subflows, not for the
+                * meta-socket.
+                */
+               if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
+                            (is_meta_sk(sk) || !mptcp(tp) || TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq))) {
                        SOCK_DEBUG(sk, "ofo packet was already received\n");
                        tcp_drop(sk, skb);
                        continue;
@@ -4423,6 +4479,9 @@ static int tcp_prune_queue(struct sock *sk);
 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
                                 unsigned int size)
 {
+       if (mptcp(tcp_sk(sk)))
+               sk = mptcp_meta_sk(sk);
+
        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
            !sk_rmem_schedule(sk, skb, size)) {
 
@@ -4437,7 +4496,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
        return 0;
 }
 
-static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct rb_node **p, *q, *parent;
@@ -4509,7 +4568,8 @@ coalesce_done:
                        continue;
                }
                if (before(seq, TCP_SKB_CB(skb1)->end_seq)) {
-                       if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+                       if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
+                           (is_meta_sk(sk) || !mptcp(tp) || end_seq != seq)) {
                                /* All the bits are present. Drop. */
                                NET_INC_STATS(sock_net(sk),
                                              LINUX_MIB_TCPOFOMERGE);
@@ -4558,6 +4618,11 @@ merge_right:
                                         end_seq);
                        break;
                }
+               /* MPTCP allows a non-data DATA_FIN to sit in the ofo-queue */
+               if (mptcp(tp) && !is_meta_sk(sk) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq) {
+                       skb = skb1;
+                       continue;
+               }
                rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
                tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
                                 TCP_SKB_CB(skb1)->end_seq);
@@ -4569,7 +4634,7 @@ merge_right:
                tp->ooo_last_skb = skb;
 
 add_sack:
-       if (tcp_is_sack(tp))
+       if (tcp_is_sack(tp) && seq != end_seq)
                tcp_sack_new_ofo_skb(sk, seq, end_seq);
 end:
        if (skb) {
@@ -4579,8 +4644,8 @@ end:
        }
 }
 
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
-                 bool *fragstolen)
+int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
+                              bool *fragstolen)
 {
        int eaten;
        struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
@@ -4653,10 +4718,14 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
        bool fragstolen;
        int eaten;
 
-       if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+       /* If no data is present, but a data_fin is in the options, we still
+        * have to call mptcp_queue_skb later on.
+        */
+       if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
+           !(mptcp(tp) && mptcp_is_data_fin(skb))) {
                __kfree_skb(skb);
                return;
        }
+
        skb_dst_drop(skb);
        __skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
@@ -4681,7 +4750,7 @@ queue_and_out:
 
                eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
                tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
-               if (skb->len)
+               if (skb->len || mptcp_is_data_fin(skb))
                        tcp_event_data_recv(sk, skb);
                if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                        tcp_fin(sk);
@@ -4703,7 +4772,11 @@ queue_and_out:
 
                if (eaten > 0)
                        kfree_skb_partial(skb, fragstolen);
-               if (!sock_flag(sk, SOCK_DEAD))
+               if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
+                       /* MPTCP: we always have to call data_ready, because
+                        * we may be about to receive a data-fin, which still
+                        * must get queued.
+                        */
                        sk->sk_data_ready(sk);
                return;
        }
@@ -5049,7 +5122,7 @@ static int tcp_prune_queue(struct sock *sk)
        return -1;
 }
 
-static bool tcp_should_expand_sndbuf(const struct sock *sk)
+bool tcp_should_expand_sndbuf(const struct sock *sk)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
 
@@ -5084,7 +5157,7 @@ static void tcp_new_space(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
-       if (tcp_should_expand_sndbuf(sk)) {
+       if (tp->ops->should_expand_sndbuf(sk)) {
                tcp_sndbuf_expand(sk);
                tp->snd_cwnd_stamp = tcp_jiffies32;
        }
@@ -5098,10 +5171,11 @@ static void tcp_check_space(struct sock *sk)
                sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
                /* pairs with tcp_poll() */
                smp_mb();
-               if (sk->sk_socket &&
-                   test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+               if (mptcp(tcp_sk(sk)) ||
+                   (sk->sk_socket &&
+                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))) {
                        tcp_new_space(sk);
-                       if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+                       if (sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
                                tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
                }
        }
@@ -5125,7 +5199,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
             /* ... and right edge of window advances far enough.
              * (tcp_recvmsg() will send ACK otherwise). Or...
              */
-            __tcp_select_window(sk) >= tp->rcv_wnd) ||
+            tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
            /* We ACK each frame or... */
            tcp_in_quickack_mode(sk) ||
            /* We have out of order data. */
@@ -5227,6 +5301,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
+       /* MPTCP urgent data is not yet supported */
+       if (mptcp(tp))
+               return;
+
        /* Check if we get a new urgent pointer - normally not. */
        if (th->urg)
                tcp_check_urg(sk, th);
@@ -5369,9 +5447,15 @@ syn_challenge:
                goto discard;
        }
 
+       /* If valid: post-process the received MPTCP options. */
+       if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
+               goto discard;
+
        return true;
 
 discard:
+       if (mptcp(tp))
+               mptcp_reset_mopt(tp);
        tcp_drop(sk, skb);
        return false;
 }
@@ -5425,6 +5509,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
        tp->rx_opt.saw_tstamp = 0;
 
+       /* MPTCP: force slowpath. */
+       if (mptcp(tp))
+               goto slow_path;
+
        /*      pred_flags is 0xS?10 << 16 + snd_wnd
         *      if header_prediction is to be made
         *      'S' will always be tp->tcp_header_len >> 2
@@ -5594,7 +5682,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
         */
        tp->lsndtime = tcp_jiffies32;
 
-       tcp_init_buffer_space(sk);
+       tp->ops->init_buffer_space(sk);
 
        if (sock_flag(sk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
@@ -5609,7 +5697,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
                                    struct tcp_fastopen_cookie *cookie)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+       struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
+       struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(meta_sk) : NULL;
        u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
        bool syn_drop = false;
 
@@ -5619,7 +5708,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
                /* Get original SYNACK MSS value if user MSS sets mss_clamp */
                tcp_clear_options(&opt);
                opt.user_mss = opt.mss_clamp = 0;
-               tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
+               tcp_parse_options(sock_net(sk), synack, &opt, NULL, 0, NULL, NULL);
                mss = opt.mss_clamp;
        }
 
@@ -5643,7 +5732,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 
        tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
-       if (data) { /* Retransmit unacked data in SYN */
+       /* In the MPTCP case, we do not rely on "retransmit" but on
+        * "transmit": if the fastopen data is not acked, the retransmission
+        * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
+        */
+       if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
                tcp_for_write_queue_from(data, sk) {
                        if (data == tcp_send_head(sk) ||
                            __tcp_retransmit_skb(sk, data, 1))
@@ -5671,9 +5764,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_fastopen_cookie foc = { .len = -1 };
        int saved_clamp = tp->rx_opt.mss_clamp;
+       struct mptcp_options_received mopt;
        bool fastopen_fail;
 
-       tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
+       mptcp_init_mp_opt(&mopt);
+
+       tcp_parse_options(sock_net(sk), skb, &tp->rx_opt,
+                         mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc, tp);
        if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
                tp->rx_opt.rcv_tsecr -= tp->tsoffset;
 
@@ -5733,6 +5830,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
                tcp_ack(sk, skb, FLAG_SLOWPATH);
 
+               if (tp->request_mptcp || mptcp(tp)) {
+                       int ret;
+
+                       ret = mptcp_rcv_synsent_state_process(sk, &sk,
+                                                             skb, &mopt);
+
+                       /* May have changed if we support MPTCP */
+                       tp = tcp_sk(sk);
+                       icsk = inet_csk(sk);
+
+                       if (ret == 1)
+                               goto reset_and_undo;
+                       if (ret == 2)
+                               goto discard;
+               }
+
+               if (mptcp(tp) && !is_master_tp(tp)) {
+                       /* Timer for repeating the ACK until an answer
+                        * arrives. Used only when establishing an additional
+                        * subflow inside of an MPTCP connection.
+                        * subflow within an MPTCP connection.
+                       sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
+                                      jiffies + icsk->icsk_rto);
+               }
+
                /* Ok.. it's good. Set up sequence numbers and
                 * move to established.
                 */
@@ -5759,6 +5881,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                        tp->tcp_header_len = sizeof(struct tcphdr);
                }
 
+               if (mptcp(tp)) {
+                       tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
+                       tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
+               }
+
                if (tcp_is_sack(tp) && sysctl_tcp_fack)
                        tcp_enable_fack(tp);
 
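
Both this branch and the simultaneous-open branch further down shrink advmss and grow tcp_header_len by MPTCP_SUB_LEN_DSM_ALIGN, the aligned size of the DSS (Data Sequence Signal) option that every MPTCP data segment carries; in v0.94 that should be 20 bytes, so a subflow that would otherwise carry 1448 bytes of payload per segment (1500-byte MTU with timestamps) ends up with roughly 1428. The exact figures are illustrative.
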
@@ -5784,9 +5911,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                }
                if (fastopen_fail)
                        return -1;
-               if (sk->sk_write_pending ||
+               /* With MPTCP we cannot send data on the third ack due to the
+                * lack of option-space to combine with an MP_CAPABLE.
+                */
+               if (!mptcp(tp) && (sk->sk_write_pending ||
                    icsk->icsk_accept_queue.rskq_defer_accept ||
-                   icsk->icsk_ack.pingpong) {
+                   icsk->icsk_ack.pingpong)) {
                        /* Save one ACK. Data will be ready after
                         * several ticks, if write_pending is set.
                         *
@@ -5825,6 +5955,7 @@ discard:
            tcp_paws_reject(&tp->rx_opt, 0))
                goto discard_and_undo;
 
+       /* TODO - check this here for MPTCP */
        if (th->syn) {
                /* We see SYN without ACK. It is attempt of
                 * simultaneous connect with crossed SYNs.
@@ -5841,6 +5972,11 @@ discard:
                        tp->tcp_header_len = sizeof(struct tcphdr);
                }
 
+               if (mptcp(tp)) {
+                       tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
+                       tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
+               }
+
                tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
                tp->copied_seq = tp->rcv_nxt;
                tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
@@ -5899,6 +6035,7 @@ reset_and_undo:
  */
 
 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
+       __releases(&sk->sk_lock.slock)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
@@ -5939,6 +6076,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                tp->rx_opt.saw_tstamp = 0;
                tcp_mstamp_refresh(tp);
                queued = tcp_rcv_synsent_state_process(sk, skb, th);
+               if (is_meta_sk(sk)) {
+                       sk = tcp_sk(sk)->mpcb->master_sk;
+                       tp = tcp_sk(sk);
+
+                       /* We need to call this here: it announces new
+                        * addresses, which can only be done after the third
+                        * ACK of the 3-way handshake.
+                        */
+                       mptcp_update_metasocket(tp->meta_sk);
+               }
                if (queued >= 0)
                        return queued;
 
@@ -5946,6 +6093,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                tcp_urg(sk, skb, th);
                __kfree_skb(skb);
                tcp_data_snd_check(sk);
+               if (mptcp(tp) && is_master_tp(tp))
+                       bh_unlock_sock(sk);
                return 0;
        }
 
@@ -5996,7 +6145,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
                        tcp_mtup_init(sk);
                        tp->copied_seq = tp->rcv_nxt;
-                       tcp_init_buffer_space(sk);
+                       tp->ops->init_buffer_space(sk);
                }
                smp_mb();
                tcp_set_state(sk, TCP_ESTABLISHED);
@@ -6015,6 +6164,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
                if (tp->rx_opt.tstamp_ok)
                        tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+               if (mptcp(tp))
+                       tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
 
                if (req) {
                        /* Re-arm the timer because data may have been sent out.
@@ -6037,6 +6188,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
                tcp_initialize_rcv_mss(sk);
                tcp_fast_path_on(tp);
+
+               /* Send an ACK when establishing a new MPTCP subflow (MP_JOIN);
+                * for the master subflow, update the meta-socket instead.
+                */
+               if (mptcp(tp)) {
+                       if (is_master_tp(tp))
+                               mptcp_update_metasocket(mptcp_meta_sk(sk));
+                       else
+                               tcp_send_ack(sk);
+               }
                break;
 
        case TCP_FIN_WAIT1: {
@@ -6084,7 +6245,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                tmo = tcp_fin_time(sk);
                if (tmo > TCP_TIMEWAIT_LEN) {
                        inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
-               } else if (th->fin || sock_owned_by_user(sk)) {
+               } else if (th->fin || mptcp_is_data_fin(skb) ||
+                          sock_owned_by_user(sk)) {
                        /* Bad case. We could lose such FIN otherwise.
                         * It is not a big problem, but it looks confusing
                         * and not so rare event. We still can lose it now,
@@ -6093,7 +6255,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                         */
                        inet_csk_reset_keepalive_timer(sk, tmo);
                } else {
-                       tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+                       tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
                        goto discard;
                }
                break;
@@ -6101,7 +6263,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
        case TCP_CLOSING:
                if (tp->snd_una == tp->write_seq) {
-                       tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+                       tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
                        goto discard;
                }
                break;
@@ -6113,6 +6275,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                        goto discard;
                }
                break;
+       case TCP_CLOSE:
+               if (tp->mp_killed)
+                       goto discard;
        }
 
        /* step 6: check the URG bit */
@@ -6133,7 +6298,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
                 */
                if (sk->sk_shutdown & RCV_SHUTDOWN) {
                        if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
-                           after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+                           after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
+                           !mptcp(tp)) {
                                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
                                tcp_reset(sk);
                                return 1;
@@ -6230,6 +6396,8 @@ static void tcp_openreq_init(struct request_sock *req,
        ireq->wscale_ok = rx_opt->wscale_ok;
        ireq->acked = 0;
        ireq->ecn_ok = 0;
+       ireq->mptcp_rqsk = 0;
+       ireq->saw_mpc = 0;
        ireq->ir_rmt_port = tcp_hdr(skb)->source;
        ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
        ireq->ir_mark = inet_request_mark(sk, skb);
@@ -6324,12 +6492,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
+        *
+        * MPTCP: new subflows cannot be established in a stateless manner.
         */
-       if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
+       if (((!is_meta_sk(sk) && net->ipv4.sysctl_tcp_syncookies == 2) ||
             inet_csk_reqsk_queue_is_full(sk)) && !isn) {
                want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
                if (!want_cookie)
                        goto drop;
+
+               if (is_meta_sk(sk))
+                       goto drop;
        }
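The extra meta-socket drop encodes the rule from the comment: an MP_JOIN SYN needs per-request state on the existing session (token match, nonces for the HMAC handshake), so a meta socket under SYN pressure can only drop, never answer statelessly. The implied decision table:

	/* queue full, regular listener -> try a syncookie, else drop
	 * queue full, meta socket      -> always drop (no stateless MP_JOIN)
	 */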
 
        if (sk_acceptq_is_full(sk)) {
@@ -6347,8 +6520,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        tcp_clear_options(&tmp_opt);
        tmp_opt.mss_clamp = af_ops->mss_clamp;
        tmp_opt.user_mss  = tp->rx_opt.user_mss;
-       tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
-                         want_cookie ? NULL : &foc);
+       tcp_parse_options(sock_net(sk), skb, &tmp_opt, NULL, 0,
+                         want_cookie ? NULL : &foc, NULL);
 
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);
@@ -6360,7 +6533,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        /* Note: tcp_v6_init_req() might override ir_iif for link locals */
        inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
 
-       af_ops->init_req(req, sk, skb);
+       if (af_ops->init_req(req, sk, skb, want_cookie))
+               goto drop_and_free;
 
        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;
@@ -6396,7 +6570,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        tcp_ecn_create_request(req, skb, sk, dst);
 
        if (want_cookie) {
-               isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+               isn = cookie_init_sequence(af_ops, req, sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
                if (!tmp_opt.tstamp_ok)
                        inet_rsk(req)->ecn_ok = 0;
@@ -6410,12 +6584,18 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
                fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
        }
        if (fastopen_sk) {
+               struct sock *meta_sk = fastopen_sk;
+
+               if (mptcp(tcp_sk(fastopen_sk)))
+                       meta_sk = mptcp_meta_sk(fastopen_sk);
                af_ops->send_synack(fastopen_sk, dst, &fl, req,
                                    &foc, TCP_SYNACK_FASTOPEN);
                /* Add the child socket directly into the accept queue */
-               inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+               inet_csk_reqsk_queue_add(sk, req, meta_sk);
                sk->sk_data_ready(sk);
                bh_unlock_sock(fastopen_sk);
+               if (meta_sk != fastopen_sk)
+                       bh_unlock_sock(meta_sk);
                sock_put(fastopen_sk);
        } else {
                tcp_rsk(req)->tfo_listener = false;
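One subtlety in the fast-open branch above: when the child negotiated MPTCP, the socket queued for accept() is the meta socket rather than the subflow, so applications always receive the meta. Sketch:

	/* plain TCP:  listener accept queue <- fastopen_sk
	 * MPTCP:      listener accept queue <- mptcp_meta_sk(fastopen_sk)
	 * (the meta lock, held alongside the child's, is dropped separately)
	 */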
index a95ccdceb79799b22339201d047bb4925d7d106d..9fcc796762d0e28d38d19b165c378c36cc28c06a 100644 (file)
@@ -67,6 +67,8 @@
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
 #include <net/tcp.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
 #include <net/transp_v6.h>
 #include <net/ipv6.h>
 #include <net/inet_common.h>
@@ -372,7 +374,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        struct inet_sock *inet;
        const int type = icmp_hdr(icmp_skb)->type;
        const int code = icmp_hdr(icmp_skb)->code;
-       struct sock *sk;
+       struct sock *sk, *meta_sk;
        struct sk_buff *skb;
        struct request_sock *fastopen;
        u32 seq, snd_una;
@@ -401,13 +403,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                                   (code == ICMP_NET_UNREACH ||
                                    code == ICMP_HOST_UNREACH)));
 
-       bh_lock_sock(sk);
+       tp = tcp_sk(sk);
+       if (mptcp(tp))
+               meta_sk = mptcp_meta_sk(sk);
+       else
+               meta_sk = sk;
+
+       bh_lock_sock(meta_sk);
        /* If too many ICMPs get dropped on busy
         * servers this needs to be solved differently.
         * We do take care of PMTU discovery (RFC1191) special case :
         * we can receive locally generated ICMP messages while socket is held.
         */
-       if (sock_owned_by_user(sk)) {
+       if (sock_owned_by_user(meta_sk)) {
                if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
                        __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
        }
@@ -420,7 +428,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        }
 
        icsk = inet_csk(sk);
-       tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@@ -454,11 +461,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                                goto out;
 
                        tp->mtu_info = info;
-                       if (!sock_owned_by_user(sk)) {
+                       if (!sock_owned_by_user(meta_sk)) {
                                tcp_v4_mtu_reduced(sk);
                        } else {
                                if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
                                        sock_hold(sk);
+                               if (mptcp(tp))
+                                       mptcp_tsq_flags(sk);
                        }
                        goto out;
                }
@@ -472,7 +481,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                    !icsk->icsk_backoff || fastopen)
                        break;
 
-               if (sock_owned_by_user(sk))
+               if (sock_owned_by_user(meta_sk))
                        break;
 
                icsk->icsk_backoff--;
@@ -494,7 +503,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                } else {
                        /* RTO revert clocked out retransmission.
                         * Will retransmit now */
-                       tcp_retransmit_timer(sk);
+                       tcp_sk(sk)->ops->retransmit_timer(sk);
                }
 
                break;
@@ -514,7 +523,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
                if (fastopen && !fastopen->sk)
                        break;
 
-               if (!sock_owned_by_user(sk)) {
+               if (!sock_owned_by_user(meta_sk)) {
                        sk->sk_err = err;
 
                        sk->sk_error_report(sk);
@@ -543,7 +552,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
         */
 
        inet = inet_sk(sk);
-       if (!sock_owned_by_user(sk) && inet->recverr) {
+       if (!sock_owned_by_user(meta_sk) && inet->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else  { /* Only an error on timeout */
@@ -551,7 +560,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        }
 
 out:
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
        sock_put(sk);
 }
 
@@ -593,7 +602,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
  *     Exception: precedence violation. We do not implement it in any case.
  */
 
-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
+void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
@@ -730,10 +739,10 @@ out:
  */
 
 static void tcp_v4_send_ack(const struct sock *sk,
-                           struct sk_buff *skb, u32 seq, u32 ack,
+                           struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
                            u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key,
-                           int reply_flags, u8 tos)
+                           int reply_flags, u8 tos, int mptcp)
 {
        const struct tcphdr *th = tcp_hdr(skb);
        struct {
@@ -741,6 +750,10 @@ static void tcp_v4_send_ack(const struct sock *sk,
                __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 #ifdef CONFIG_TCP_MD5SIG
                           + (TCPOLEN_MD5SIG_ALIGNED >> 2)
+#endif
+#ifdef CONFIG_MPTCP
+                          + ((MPTCP_SUB_LEN_DSS >> 2) +
+                             (MPTCP_SUB_LEN_ACK >> 2))
 #endif
                        ];
        } rep;
@@ -786,6 +799,21 @@ static void tcp_v4_send_ack(const struct sock *sk,
                                    ip_hdr(skb)->daddr, &rep.th);
        }
 #endif
+#ifdef CONFIG_MPTCP
+       if (mptcp) {
+               int offset = (tsecr) ? 3 : 0;
+               /* Construction of 32-bit data_ack */
+               rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
+                                         ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
+                                         (0x20 << 8) |
+                                         (0x01));
+               rep.opt[offset] = htonl(data_ack);
+
+               arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
+               rep.th.doff = arg.iov[0].iov_len / 4;
+       }
+#endif /* CONFIG_MPTCP */
+
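For reference, the two 32-bit words appended here form a minimal DSS option carrying only a DATA_ACK. An illustrative helper (not part of the patch) showing the layout:

static inline void dss_write_data_ack(__be32 *opt, u32 data_ack)
{
	/* byte 0: kind 30 (TCPOPT_MPTCP)
	 * byte 1: length = MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK
	 * byte 2: 0x20, subtype DSS (2) in the high nibble
	 * byte 3: 0x01, flag 'A': a 4-byte DATA_ACK is present
	 */
	*opt++ = htonl((TCPOPT_MPTCP << 24) |
		       ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
		       (0x20 << 8) | 0x01);
	*opt = htonl(data_ack);
}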
        arg.flags = reply_flags;
        arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
                                      ip_hdr(skb)->saddr, /* XXX */
@@ -809,28 +837,36 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+       u32 data_ack = 0;
+       int mptcp = 0;
+
+       if (tcptw->mptcp_tw) {
+               data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
+               mptcp = 1;
+       }
 
        tcp_v4_send_ack(sk, skb,
-                       tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+                       tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, data_ack,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent,
                        tw->tw_bound_dev_if,
                        tcp_twsk_md5_key(tcptw),
                        tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
-                       tw->tw_tos
+                       tw->tw_tos, mptcp
                        );
 
        inet_twsk_put(tw);
 }
 
-static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
-                                 struct request_sock *req)
+void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+                          struct request_sock *req)
 {
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
         */
-       u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+       u32 seq = (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
+                                            tcp_rsk(req)->snt_isn + 1 :
                                             tcp_sk(sk)->snd_nxt;
 
        /* RFC 7323 2.3
@@ -839,7 +875,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
         * Rcv.Wind.Shift bits:
         */
        tcp_v4_send_ack(sk, skb, seq,
-                       tcp_rsk(req)->rcv_nxt,
+                       tcp_rsk(req)->rcv_nxt, 0,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent,
@@ -847,7 +883,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
                        tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
                                          AF_INET),
                        inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
-                       ip_hdr(skb)->tos);
+                       ip_hdr(skb)->tos, 0);
 }
 
 /*
@@ -855,11 +891,11 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
  *     This still operates on a request_sock only, not on a big
  *     socket.
  */
-static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
-                             struct flowi *fl,
-                             struct request_sock *req,
-                             struct tcp_fastopen_cookie *foc,
-                             enum tcp_synack_type synack_type)
+int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
+                      struct flowi *fl,
+                      struct request_sock *req,
+                      struct tcp_fastopen_cookie *foc,
+                      enum tcp_synack_type synack_type)
 {
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
@@ -887,7 +923,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 /*
  *     IPv4 request_sock destructor.
  */
-static void tcp_v4_reqsk_destructor(struct request_sock *req)
+void tcp_v4_reqsk_destructor(struct request_sock *req)
 {
        kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
 }
@@ -1260,9 +1296,10 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
        return false;
 }
 
-static void tcp_v4_init_req(struct request_sock *req,
-                           const struct sock *sk_listener,
-                           struct sk_buff *skb)
+static int tcp_v4_init_req(struct request_sock *req,
+                          const struct sock *sk_listener,
+                          struct sk_buff *skb,
+                          bool want_cookie)
 {
        struct inet_request_sock *ireq = inet_rsk(req);
        struct net *net = sock_net(sk_listener);
@@ -1270,6 +1307,8 @@ static void tcp_v4_init_req(struct request_sock *req,
        sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
        sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
        RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
+
+       return 0;
 }
 
 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
@@ -1289,7 +1328,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .syn_ack_timeout =      tcp_syn_ack_timeout,
 };
 
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
        .mss_clamp      =       TCP_MSS_DEFAULT,
 #ifdef CONFIG_TCP_MD5SIG
        .req_md5_lookup =       tcp_v4_md5_lookup,
@@ -1426,7 +1465,7 @@ put_and_exit:
 }
 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
 
-static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
+struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
 {
 #ifdef CONFIG_SYN_COOKIES
        const struct tcphdr *th = tcp_hdr(skb);
@@ -1449,6 +1488,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
        struct sock *rsk;
 
+       if (is_meta_sk(sk))
+               return mptcp_v4_do_rcv(sk, skb);
+
        if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                struct dst_entry *dst = sk->sk_rx_dst;
 
@@ -1607,6 +1649,10 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff * 4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+#ifdef CONFIG_MPTCP
+       TCP_SKB_CB(skb)->mptcp_flags = 0;
+       TCP_SKB_CB(skb)->dss_off = 0;
+#endif
        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
        TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
@@ -1625,8 +1671,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
        int sdif = inet_sdif(skb);
        const struct iphdr *iph;
        const struct tcphdr *th;
+       struct sock *sk, *meta_sk = NULL;
        bool refcounted;
-       struct sock *sk;
        int ret;
 
        if (skb->pkt_type != PACKET_HOST)
@@ -1679,7 +1725,7 @@ process:
                        reqsk_put(req);
                        goto csum_error;
                }
-               if (unlikely(sk->sk_state != TCP_LISTEN)) {
+               if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
@@ -1688,6 +1734,37 @@ process:
                 */
                sock_hold(sk);
                refcounted = true;
+
+               if (is_meta_sk(sk)) {
+                       bh_lock_sock(sk);
+
+                       if (!mptcp_can_new_subflow(sk)) {
+                               inet_csk_reqsk_queue_drop_and_put(sk, req);
+                               bh_unlock_sock(sk);
+                               sock_put(sk);
+
+                               return 0;
+                       }
+
+                       if (sock_owned_by_user(sk)) {
+                               skb->sk = sk;
+                               if (unlikely(sk_add_backlog(sk, skb,
+                                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
+                                       reqsk_put(req);
+
+                                       bh_unlock_sock(sk);
+                                       __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+                                       goto discard_and_relse;
+                               }
+
+                               reqsk_put(req);
+                               bh_unlock_sock(sk);
+                               sock_put(sk);
+
+                               return 0;
+                       }
+               }
+
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
                        th = (const struct tcphdr *)skb->data;
@@ -1697,10 +1774,14 @@ process:
                }
                if (!nsk) {
                        reqsk_put(req);
+                       if (is_meta_sk(sk))
+                               bh_unlock_sock(sk);
                        goto discard_and_relse;
                }
                if (nsk == sk) {
                        reqsk_put(req);
+                       if (is_meta_sk(sk))
+                               bh_unlock_sock(sk);
                        tcp_v4_restore_cb(skb);
                } else if (tcp_child_process(sk, nsk, skb)) {
                        tcp_v4_send_reset(nsk, skb);
@@ -1738,15 +1819,23 @@ process:
 
        sk_incoming_cpu_update(sk);
 
-       bh_lock_sock_nested(sk);
+       if (mptcp(tcp_sk(sk))) {
+               meta_sk = mptcp_meta_sk(sk);
+
+               bh_lock_sock_nested(meta_sk);
+               if (sock_owned_by_user(meta_sk))
+                       skb->sk = sk;
+       } else {
+               meta_sk = sk;
+               bh_lock_sock_nested(sk);
+       }
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;
-       if (!sock_owned_by_user(sk)) {
+       if (!sock_owned_by_user(meta_sk))
                ret = tcp_v4_do_rcv(sk, skb);
-       } else if (tcp_add_backlog(sk, skb)) {
+       else if (tcp_add_backlog(meta_sk, skb))
                goto discard_and_relse;
-       }
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
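The receive path now serializes on the meta socket: whichever subflow a segment arrives on, it is processed under the meta lock, and segments that cannot be handled immediately are queued on the meta backlog tagged with their subflow. Condensed into an illustrative wrapper (not in the patch):

static int mptcp_aware_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
	int ret = 0;

	bh_lock_sock_nested(meta_sk);
	if (!sock_owned_by_user(meta_sk)) {
		ret = tcp_v4_do_rcv(sk, skb);	/* runs on the subflow */
	} else {
		skb->sk = sk;			/* remember the subflow */
		if (tcp_add_backlog(meta_sk, skb))
			ret = -1;		/* backlog full: caller drops */
	}
	bh_unlock_sock(meta_sk);
	return ret;
}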
 
 put_and_return:
        if (refcounted)
@@ -1760,6 +1849,19 @@ no_tcp_socket:
 
        tcp_v4_fill_cb(skb, iph, th);
 
+#ifdef CONFIG_MPTCP
+       if (!sk && th->syn && !th->ack) {
+               int ret = mptcp_lookup_join(skb, NULL);
+
+               if (ret < 0) {
+                       tcp_v4_send_reset(NULL, skb);
+                       goto discard_it;
+               } else if (ret > 0) {
+                       return 0;
+               }
+       }
+#endif
+
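mptcp_lookup_join() gives MP_JOIN SYNs that matched no listener a second chance by resolving the token in the option to an existing meta socket. Its return convention, as inferred from the two call sites in this file:

	/* ret = mptcp_lookup_join(skb, tw);
	 *   ret < 0  -> no valid session for the token: answer with a RST
	 *   ret > 0  -> skb taken over by the meta socket: done, return 0
	 *   ret == 0 -> not an MP_JOIN for us: continue the normal path
	 */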
        if (tcp_checksum_complete(skb)) {
 csum_error:
                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
@@ -1808,6 +1910,18 @@ do_time_wait:
                        refcounted = false;
                        goto process;
                }
+#ifdef CONFIG_MPTCP
+               if (th->syn && !th->ack) {
+                       int ret = mptcp_lookup_join(skb, inet_twsk(sk));
+
+                       if (ret < 0) {
+                               tcp_v4_send_reset(NULL, skb);
+                               goto discard_it;
+                       } else if (ret > 0) {
+                               return 0;
+                       }
+               }
+#endif
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
@@ -1876,7 +1990,12 @@ static int tcp_v4_init_sock(struct sock *sk)
 
        tcp_init_sock(sk);
 
-       icsk->icsk_af_ops = &ipv4_specific;
+#ifdef CONFIG_MPTCP
+       if (sock_flag(sk, SOCK_MPTCP))
+               icsk->icsk_af_ops = &mptcp_v4_specific;
+       else
+#endif
+               icsk->icsk_af_ops = &ipv4_specific;
 
 #ifdef CONFIG_TCP_MD5SIG
        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
@@ -1893,6 +2012,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
        tcp_cleanup_congestion_control(sk);
 
+       if (mptcp(tp))
+               mptcp_destroy_sock(sk);
+       if (tp->inside_tk_table)
+               mptcp_hash_remove_bh(tp);
+
        tcp_cleanup_ulp(sk);
 
        /* Cleanup up the write buffer. */
@@ -2438,8 +2562,16 @@ struct proto tcp_prot = {
 #ifdef CONFIG_COMPAT
        .compat_setsockopt      = compat_tcp_setsockopt,
        .compat_getsockopt      = compat_tcp_getsockopt,
+#endif
+#ifdef CONFIG_MEMCG_KMEM
+       .init_cgroup            = tcp_init_cgroup,
+       .destroy_cgroup         = tcp_destroy_cgroup,
+       .proto_cgroup           = tcp_proto_cgroup,
 #endif
        .diag_destroy           = tcp_abort,
+#ifdef CONFIG_MPTCP
+       .clear_sk               = mptcp_clear_sk,
+#endif
 };
 EXPORT_SYMBOL(tcp_prot);
 
index 420fecbb98fe7edc3a215455d603997c95b7ced3..c8a295b0f966758361165bcf1139985e6a556eb6 100644 (file)
  *             Jorge Cwik, <jorge@laser.satlink.net>
  */
 
+#include <linux/kconfig.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/workqueue.h>
+#include <net/mptcp.h>
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
@@ -95,10 +97,14 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
        struct tcp_options_received tmp_opt;
        struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
        bool paws_reject = false;
+       struct mptcp_options_received mopt;
 
        tmp_opt.saw_tstamp = 0;
-       if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
-               tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
+       if (th->doff > (sizeof(*th) >> 2) &&
+           (tcptw->tw_ts_recent_stamp || tcptw->mptcp_tw)) {
+               mptcp_init_mp_opt(&mopt);
+
+               tcp_parse_options(twsk_net(tw), skb, &tmp_opt, &mopt, 0, NULL, NULL);
 
                if (tmp_opt.saw_tstamp) {
                        if (tmp_opt.rcv_tsecr)
@@ -107,6 +113,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                        tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                        paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
                }
+
+               if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
+                       if (mopt.mptcp_sender_key == tcptw->mptcp_tw->loc_key)
+                               return TCP_TW_RST;
+               }
        }
 
        if (tw->tw_substate == TCP_FIN_WAIT2) {
@@ -130,6 +141,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                if (!th->ack ||
                    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+                       /* If mptcp_is_data_fin() returns true, we are sure that
+                        * mopt has been initialized - otherwise it would not
+                        * be a DATA_FIN.
+                        */
+                       if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
+                           mptcp_is_data_fin(skb) &&
+                           TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+                           mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
+                               return TCP_TW_ACK;
+
                        inet_twsk_put(tw);
                        return TCP_TW_SUCCESS;
                }
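Two MPTCP cases are grafted onto time-wait processing here: an MP_FASTCLOSE whose sender key matches the stored local key authenticates an immediate reset, and a DATA_FIN at exactly the next expected meta-level sequence is still acknowledged even though the subflow-level FIN was already handled. In short:

	/* MP_FASTCLOSE: mopt.mptcp_sender_key == mptcp_tw->loc_key
	 *     -> TCP_TW_RST  (the key proves the peer may kill the session)
	 * DATA_FIN:     seq == tw_rcv_nxt &&
	 *               mopt.data_seq + 1 == (u32)mptcp_tw->rcv_nxt
	 *     -> TCP_TW_ACK  (ack the meta-level FIN once more)
	 */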
@@ -273,6 +294,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
                tcptw->tw_ts_offset     = tp->tsoffset;
                tcptw->tw_last_oow_ack_time = 0;
 
+               if (mptcp(tp)) {
+                       if (mptcp_init_tw_sock(sk, tcptw)) {
+                               inet_twsk_free(tw);
+                               goto exit;
+                       }
+               } else {
+                       tcptw->mptcp_tw = NULL;
+               }
+
 #if IS_ENABLED(CONFIG_IPV6)
                if (tw->tw_family == PF_INET6) {
                        struct ipv6_pinfo *np = inet6_sk(sk);
@@ -330,15 +360,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
        }
 
+exit:
        tcp_update_metrics(sk);
        tcp_done(sk);
 }
 
 void tcp_twsk_destructor(struct sock *sk)
 {
-#ifdef CONFIG_TCP_MD5SIG
        struct tcp_timewait_sock *twsk = tcp_twsk(sk);
 
+       if (twsk->mptcp_tw)
+               mptcp_twsk_destructor(twsk);
+#ifdef CONFIG_TCP_MD5SIG
        if (twsk->tw_md5_key)
                kfree_rcu(twsk->tw_md5_key, rcu);
 #endif
@@ -377,13 +410,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
                full_space = rcv_wnd * mss;
 
        /* tcp_full_space because it is guaranteed to be the first packet */
-       tcp_select_initial_window(full_space,
-               mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+       tp->ops->select_initial_window(tcp_full_space(sk_listener),
+               mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
+               (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
                &req->rsk_rcv_wnd,
                &req->rsk_window_clamp,
                ireq->wscale_ok,
                &rcv_wscale,
-               rcv_wnd);
+               rcv_wnd, sk_listener);
        ireq->rcv_wscale = rcv_wscale;
 }
 EXPORT_SYMBOL(tcp_openreq_init_rwin);
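Window sizing for passive opens now charges per-segment option overhead against the MSS, including the DSS mapping when the request saw MP_CAPABLE. A rough example, assuming MPTCP_SUB_LEN_DSM_ALIGN is 20 bytes (a DSS option with 32-bit DATA_ACK plus mapping, as in this MPTCP version):

	/* mss = 1460, timestamps on, MP_CAPABLE seen:
	 *   1460 - TCPOLEN_TSTAMP_ALIGNED (12) - MPTCP_SUB_LEN_DSM_ALIGN (20)
	 *   = 1428 bytes of payload per segment used to size the window
	 */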
@@ -525,6 +559,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
                        newtp->rx_opt.ts_recent_stamp = 0;
                        newtp->tcp_header_len = sizeof(struct tcphdr);
                }
+               if (ireq->saw_mpc)
+                       newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
                newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG
                newtp->md5sig_info = NULL;      /*XXX*/
@@ -563,6 +599,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           bool fastopen)
 {
        struct tcp_options_received tmp_opt;
+       struct mptcp_options_received mopt;
        struct sock *child;
        const struct tcphdr *th = tcp_hdr(skb);
        __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
@@ -570,8 +607,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
        bool own_req;
 
        tmp_opt.saw_tstamp = 0;
+
+       mptcp_init_mp_opt(&mopt);
+
        if (th->doff > (sizeof(struct tcphdr)>>2)) {
-               tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
+               tcp_parse_options(sock_net(sk), skb, &tmp_opt, &mopt, 0, NULL, NULL);
 
                if (tmp_opt.saw_tstamp) {
                        tmp_opt.ts_recent = req->ts_recent;
@@ -612,7 +652,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                 *
                 * Reset timer after retransmitting SYNACK, similar to
                 * the idea of fast retransmit in recovery.
+                *
+                * Fall back to regular TCP if the retransmitted SYN
+                * no longer carries the MP_CAPABLE option.
                 */
+
+               if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
+                       inet_rsk(req)->saw_mpc = false;
+
                if (!tcp_oow_rate_limited(sock_net(sk), skb,
                                          LINUX_MIB_TCPACKSKIPPEDSYNRECV,
                                          &tcp_rsk(req)->last_oow_ack_time) &&
@@ -765,6 +812,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
        if (!child)
                goto listen_overflow;
 
+       if (own_req && !is_meta_sk(sk)) {
+               int ret = mptcp_check_req_master(sk, child, req, skb, 1);
+
+               if (ret < 0)
+                       goto listen_overflow;
+
+               /* MPTCP was negotiated: return the meta's master subflow */
+               if (!ret)
+                       return tcp_sk(child)->mpcb->master_sk;
+       } else if (own_req) {
+               return mptcp_check_req_child(sk, child, req, skb, &mopt);
+       }
+
        sock_rps_save_rxhash(child, skb);
        tcp_synack_rtt_meas(child, req);
        return inet_csk_complete_hashdance(sk, child, req, own_req);
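The conventions of the two helpers, as inferred from this call site:

	/* own_req on a regular listener:
	 *   mptcp_check_req_master() < 0  -> treat like a listen overflow
	 *                            == 0 -> MPTCP negotiated: return the
	 *                                    master subflow of the new meta
	 *                            > 0  -> plain TCP: fall through to the
	 *                                    usual hashdance
	 * own_req on a meta socket (an MP_JOIN request):
	 *   mptcp_check_req_child() finalizes the new subflow itself
	 */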
@@ -812,12 +871,13 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 {
        int ret = 0;
        int state = child->sk_state;
+       struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
 
        /* record NAPI ID of child */
        sk_mark_napi_id(child, skb);
 
        tcp_segs_in(tcp_sk(child), skb);
-       if (!sock_owned_by_user(child)) {
+       if (!sock_owned_by_user(meta_sk)) {
                ret = tcp_rcv_state_process(child, skb);
                /* Wakeup parent, send SIGIO */
                if (state == TCP_SYN_RECV && child->sk_state != state)
@@ -827,10 +887,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
                 * in main socket hash table and lock on listening
                 * socket does not protect us more.
                 */
-               __sk_add_backlog(child, skb);
+               if (mptcp(tcp_sk(child)))
+                       skb->sk = child;
+               __sk_add_backlog(meta_sk, skb);
        }
 
-       bh_unlock_sock(child);
+       if (mptcp(tcp_sk(child)))
+               bh_unlock_sock(child);
+       bh_unlock_sock(meta_sk);
        sock_put(child);
        return ret;
 }
index b2ead31afcbab34e7ff5a95a156b173e48b1bc73..6801d2a66a64d40b0bd59a7527fd5a5b3fb1545a 100644 (file)
 
 #define pr_fmt(fmt) "TCP: " fmt
 
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/mptcp_v6.h>
+#endif
+#include <net/ipv6.h>
 #include <net/tcp.h>
 
 #include <linux/compiler.h>
@@ -62,11 +68,8 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
-                          int push_one, gfp_t gfp);
-
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
@@ -211,7 +214,7 @@ u32 tcp_default_init_rwnd(u32 mss)
 void tcp_select_initial_window(int __space, __u32 mss,
                               __u32 *rcv_wnd, __u32 *window_clamp,
                               int wscale_ok, __u8 *rcv_wscale,
-                              __u32 init_rcv_wnd)
+                              __u32 init_rcv_wnd, const struct sock *sk)
 {
        unsigned int space = (__space < 0 ? 0 : __space);
 
@@ -265,12 +268,16 @@ EXPORT_SYMBOL(tcp_select_initial_window);
  * value can be stuffed directly into th->window for an outgoing
  * frame.
  */
-static u16 tcp_select_window(struct sock *sk)
+u16 tcp_select_window(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        u32 old_win = tp->rcv_wnd;
-       u32 cur_win = tcp_receive_window(tp);
-       u32 new_win = __tcp_select_window(sk);
+       /* The window must never shrink at the meta level. At the subflow
+        * level we have to allow shrinking; otherwise we might announce
+        * a window too large for the current meta-level sk_rcvbuf.
+        */
+       u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
+       u32 new_win = tp->ops->__select_window(sk);
 
        /* Never shrink the offered window */
        if (new_win < cur_win) {
@@ -286,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk)
                                      LINUX_MIB_TCPWANTZEROWINDOWADV);
                new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
        }
+
        tp->rcv_wnd = new_win;
        tp->rcv_wup = tp->rcv_nxt;
 
@@ -397,7 +405,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 /* Constructs common control bits of non-data skb. If SYN/FIN is present,
  * auto increment end seqno.
  */
-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum = 0;
@@ -413,7 +421,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
        TCP_SKB_CB(skb)->end_seq = seq;
 }
 
-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
+bool tcp_urg_mode(const struct tcp_sock *tp)
 {
        return tp->snd_una != tp->snd_up;
 }
@@ -423,17 +431,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5             (1 << 2)
 #define OPTION_WSCALE          (1 << 3)
 #define OPTION_FAST_OPEN_COOKIE        (1 << 8)
-
-struct tcp_out_options {
-       u16 options;            /* bit field of OPTION_* */
-       u16 mss;                /* 0 to disable */
-       u8 ws;                  /* window scale, 0 to disable */
-       u8 num_sack_blocks;     /* number of SACK blocks to include */
-       u8 hash_size;           /* bytes in hash_location */
-       __u8 *hash_location;    /* temporary pointer, overloaded */
-       __u32 tsval, tsecr;     /* need to include OPTION_TS */
-       struct tcp_fastopen_cookie *fastopen_cookie;    /* Fast open cookie */
-};
+/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
 
 /* Write previously computed TCP options to the packet.
  *
@@ -449,7 +447,7 @@ struct tcp_out_options {
  * (but it may well be that other scenarios fail similarly).
  */
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
-                             struct tcp_out_options *opts)
+                             struct tcp_out_options *opts, struct sk_buff *skb)
 {
        u16 options = opts->options;    /* mungable copy */
 
@@ -541,6 +539,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
                }
                ptr += (len + 3) >> 2;
        }
+
+       if (unlikely(OPTION_MPTCP & opts->options))
+               mptcp_options_write(ptr, tp, opts, skb);
 }
 
 /* Compute TCP options for SYN packets. This is not the final
@@ -592,6 +593,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
                if (unlikely(!(OPTION_TS & opts->options)))
                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
        }
+       if (tp->request_mptcp || mptcp(tp))
+               mptcp_syn_options(sk, opts, &remaining);
 
        if (fastopen && fastopen->cookie.len >= 0) {
                u32 need = fastopen->cookie.len;
@@ -668,6 +671,9 @@ static unsigned int tcp_synack_options(struct request_sock *req,
                }
        }
 
+       if (ireq->saw_mpc)
+               mptcp_synack_options(req, opts, &remaining);
+
        return MAX_TCP_OPTION_SPACE - remaining;
 }
 
@@ -700,16 +706,23 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
                opts->tsecr = tp->rx_opt.ts_recent;
                size += TCPOLEN_TSTAMP_ALIGNED;
        }
+       if (mptcp(tp))
+               mptcp_established_options(sk, skb, opts, &size);
 
        eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
        if (unlikely(eff_sacks)) {
                const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
-               opts->num_sack_blocks =
-                       min_t(unsigned int, eff_sacks,
-                             (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
-                             TCPOLEN_SACK_PERBLOCK);
-               size += TCPOLEN_SACK_BASE_ALIGNED +
-                       opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+
+               if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
+                       opts->num_sack_blocks = 0;
+               else
+                       opts->num_sack_blocks =
+                           min_t(unsigned int, eff_sacks,
+                                 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+                                 TCPOLEN_SACK_PERBLOCK);
+               if (opts->num_sack_blocks)
+                       size += TCPOLEN_SACK_BASE_ALIGNED +
+                           opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
        }
 
        return size;
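The reworked SACK sizing closes an unsigned underflow that MPTCP makes reachable: once mptcp_established_options() has consumed most of the 40-byte option space, remaining can drop below TCPOLEN_SACK_BASE_ALIGNED (4) and the old expression wrapped around. Worked example:

	/* eff_sacks = 3, size = 38 after timestamps + MPTCP options:
	 *   remaining = MAX_TCP_OPTION_SPACE - size = 40 - 38 = 2
	 *   old code: (2u - 4u) / TCPOLEN_SACK_PERBLOCK wraps to ~536 million,
	 *             so num_sack_blocks = eff_sacks and the header overflows
	 *   new code: remaining < TCPOLEN_SACK_BASE_ALIGNED -> 0 blocks, and
	 *             the 4-byte SACK base is added only when a block fits
	 */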
@@ -749,8 +762,8 @@ static void tcp_tsq_handler(struct sock *sk)
                        tcp_xmit_retransmit_queue(sk);
                }
 
-               tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
-                              0, GFP_ATOMIC);
+               tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
+                                           tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
        }
 }
 /*
@@ -766,7 +779,7 @@ static void tcp_tasklet_func(unsigned long data)
        unsigned long flags;
        struct list_head *q, *n;
        struct tcp_sock *tp;
-       struct sock *sk;
+       struct sock *sk, *meta_sk;
 
        local_irq_save(flags);
        list_splice_init(&tsq->head, &list);
@@ -780,14 +793,28 @@ static void tcp_tasklet_func(unsigned long data)
                smp_mb__before_atomic();
                clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
 
-               if (!sk->sk_lock.owned &&
+               meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
+
+               if (!meta_sk->sk_lock.owned &&
                    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
-                       bh_lock_sock(sk);
-                       if (!sock_owned_by_user(sk)) {
+                       bh_lock_sock(meta_sk);
+                       if (!sock_owned_by_user(meta_sk)) {
                                clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
                                tcp_tsq_handler(sk);
+                               if (mptcp(tp))
+                                       tcp_tsq_handler(meta_sk);
+                       } else if (mptcp(tp)) {
+                               if (sk->sk_state != TCP_CLOSE)
+                                       mptcp_tsq_flags(sk);
+                       }
+                       bh_unlock_sock(meta_sk);
+               } else {
+                       if (mptcp(tp)) {
+                               bh_lock_sock(meta_sk);
+                               if (sk->sk_state != TCP_CLOSE)
+                                       mptcp_tsq_flags(sk);
+                               bh_unlock_sock(meta_sk);
                        }
-                       bh_unlock_sock(sk);
                }
 
                sk_free(sk);
@@ -797,7 +824,9 @@ static void tcp_tasklet_func(unsigned long data)
 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |          \
                          TCPF_WRITE_TIMER_DEFERRED |   \
                          TCPF_DELACK_TIMER_DEFERRED |  \
-                         TCPF_MTU_REDUCED_DEFERRED)
+                         TCPF_MTU_REDUCED_DEFERRED | \
+                         TCPF_PATH_MANAGER_DEFERRED |\
+                         TCPF_SUB_DEFERRED)
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -817,8 +846,11 @@ void tcp_release_cb(struct sock *sk)
                nflags = flags & ~TCP_DEFERRED_ALL;
        } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
-       if (flags & TCPF_TSQ_DEFERRED)
+       if (flags & TCPF_TSQ_DEFERRED) {
                tcp_tsq_handler(sk);
+               if (mptcp(tcp_sk(sk)))
+                       tcp_tsq_handler(mptcp_meta_sk(sk));
+       }
 
        /* Here begins the tricky part :
         * We are called from release_sock() with :
@@ -843,6 +875,13 @@ void tcp_release_cb(struct sock *sk)
                inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
                __sock_put(sk);
        }
+       if (flags & TCPF_PATH_MANAGER_DEFERRED) {
+               if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
+                       tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
+               __sock_put(sk);
+       }
+       if (flags & TCPF_SUB_DEFERRED)
+               mptcp_tsq_sub_deferred(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
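The two new bits in TCP_DEFERRED_ALL follow the established pattern: work discovered in bh context while the (meta) socket is owned by user space is recorded in sk_tsq_flags with a reference held, then replayed from tcp_release_cb(). Producer side, sketched (the bit name is assumed to mirror the TCPF_PATH_MANAGER_DEFERRED mask; it is defined elsewhere in the patch):

	if (!test_and_set_bit(TCP_PATH_MANAGER_DEFERRED, &sk->sk_tsq_flags))
		sock_hold(sk);	/* dropped by __sock_put() in tcp_release_cb() */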
 
@@ -1080,10 +1119,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
                }
        }
 
-       tcp_options_write((__be32 *)(th + 1), tp, &opts);
+       tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
        skb_shinfo(skb)->gso_type = sk->sk_gso_type;
        if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
-               th->window      = htons(tcp_select_window(sk));
+               th->window      = htons(tp->ops->select_window(sk));
                tcp_ecn_send(sk, skb, th, tcp_header_size);
        } else {
                /* RFC1323: The window in SYN & SYN/ACK segments
@@ -1140,7 +1179,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
        return err;
 }
 
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                            gfp_t gfp_mask)
 {
        return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
@@ -1152,7 +1191,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
  * otherwise socket can stall.
  */
-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
@@ -1165,7 +1204,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 }
 
 /* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
        if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
                /* Avoid the costly divide in the normal
@@ -1197,7 +1236,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
 /* Pcount in the middle of the write queue got changed, we need to do various
  * tweaks to fix counters
  */
-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
+void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
@@ -1349,7 +1388,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 /* This is similar to __pskb_pull_tail(). The difference is that pulled
  * data is not copied, but immediately discarded.
  */
-static int __pskb_trim_head(struct sk_buff *skb, int len)
+int __pskb_trim_head(struct sk_buff *skb, int len)
 {
        struct skb_shared_info *shinfo;
        int i, k, eat;
@@ -1572,6 +1611,7 @@ unsigned int tcp_current_mss(struct sock *sk)
 
        return mss_now;
 }
+EXPORT_SYMBOL(tcp_current_mss);
 
 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
  * As additional protections, we do not touch cwnd in retransmission phases,
@@ -1595,7 +1635,7 @@ static void tcp_cwnd_application_limited(struct sock *sk)
        tp->snd_cwnd_stamp = tcp_jiffies32;
 }
 
-static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
+void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 {
        const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
        struct tcp_sock *tp = tcp_sk(sk);
@@ -1653,8 +1693,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
  * But we can avoid doing the divide again given we already have
  *  skb_pcount = skb->len / mss_now
  */
-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
-                               const struct sk_buff *skb)
+void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
+                        const struct sk_buff *skb)
 {
        if (skb->len < tcp_skb_pcount(skb) * mss_now)
                tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
@@ -1712,11 +1752,11 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 }
 
 /* Returns the portion of skb which can be sent right away */
-static unsigned int tcp_mss_split_point(const struct sock *sk,
-                                       const struct sk_buff *skb,
-                                       unsigned int mss_now,
-                                       unsigned int max_segs,
-                                       int nonagle)
+unsigned int tcp_mss_split_point(const struct sock *sk,
+                                const struct sk_buff *skb,
+                                unsigned int mss_now,
+                                unsigned int max_segs,
+                                int nonagle)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
        u32 partial, needed, window, max_len;
@@ -1746,13 +1786,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
 /* Can at least one segment of SKB be sent right now, according to the
  * congestion window rules?  If so, return how many segments are allowed.
  */
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
-                                        const struct sk_buff *skb)
+unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
+                          const struct sk_buff *skb)
 {
        u32 in_flight, cwnd, halfcwnd;
 
        /* Don't be strict about the congestion window for the final FIN.  */
-       if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+       if (skb &&
+           (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
            tcp_skb_pcount(skb) == 1)
                return 1;
 
@@ -1772,7 +1813,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  * This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
        int tso_segs = tcp_skb_pcount(skb);
 
@@ -1787,8 +1828,8 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 /* Return true if the Nagle test allows this packet to be
  * sent now.
  */
-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
-                                 unsigned int cur_mss, int nonagle)
+bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+                   unsigned int cur_mss, int nonagle)
 {
        /* Nagle rule does not apply to frames, which sit in the middle of the
         * write_queue (they have no chances to get new data).
@@ -1800,7 +1841,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
                return true;
 
        /* Don't use the nagle rule for urgent data (or for the final FIN). */
-       if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+       if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+           mptcp_is_data_fin(skb))
                return true;
 
        if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
@@ -1810,9 +1852,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
 }
 
 /* Does at least the first segment of SKB fit into the send window? */
-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
-                            const struct sk_buff *skb,
-                            unsigned int cur_mss)
+bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
+                     unsigned int cur_mss)
 {
        u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
@@ -1893,7 +1934,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
        struct sk_buff *head;
        int win_divisor;
 
-       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+       if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
                goto send_now;
 
        if (icsk->icsk_ca_state >= TCP_CA_Recovery)
@@ -2266,7 +2307,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
  * Returns true, if no segments are in flight and we have queued segments,
  * but cannot send anything now because of SWS or another problem.
  */
-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                           int push_one, gfp_t gfp)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -2280,7 +2321,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
        sent_pkts = 0;
 
        tcp_mstamp_refresh(tp);
-       if (!push_one) {
+
+       /* PMTU probing is not yet supported with MPTCP. It should be
+        * possible by exiting the loop inside tcp_mtu_probe() early,
+        * making sure that only a single DSS mapping gets probed.
+        */
+       if (!push_one && !mptcp(tp)) {
                /* Do MTU probing. */
                result = tcp_mtu_probe(sk);
                if (!result) {
@@ -2378,7 +2424,8 @@ repair:
                if (push_one != 2)
                        tcp_schedule_loss_probe(sk, false);
                is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
-               tcp_cwnd_validate(sk, is_cwnd_limited);
+               if (tp->ops->cwnd_validate)
+                       tp->ops->cwnd_validate(sk, is_cwnd_limited);
                return false;
        }
        return !tp->packets_out && tcp_send_head(sk);
@@ -2463,7 +2510,7 @@ void tcp_send_loss_probe(struct sock *sk)
        if (skb) {
                if (tcp_snd_wnd_test(tp, skb, mss)) {
                        pcount = tp->packets_out;
-                       tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+                       tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
                        if (tp->packets_out > pcount)
                                goto probe_sent;
                        goto rearm_timer;
@@ -2526,8 +2573,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
        if (unlikely(sk->sk_state == TCP_CLOSE))
                return;
 
-       if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
-                          sk_gfp_mask(sk, GFP_ATOMIC)))
+       if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
+                                       sk_gfp_mask(sk, GFP_ATOMIC)))
                tcp_check_probe_timer(sk);
 }
 
@@ -2540,7 +2587,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 
        BUG_ON(!skb || skb->len < mss_now);
 
-       tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
+       tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
+                                   sk->sk_allocation);
 }
 
 /* This function returns the amount that we can raise the
@@ -2773,6 +2821,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                return;
 
+       /* Currently not supported for MPTCP - but it should be possible */
+       if (mptcp(tp))
+               return;
+
        tcp_for_write_queue_from_safe(skb, tmp, sk) {
                if (!tcp_can_collapse(sk, skb))
                        break;
@@ -3230,7 +3282,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(min(req->rsk_rcv_wnd, 65535U));
-       tcp_options_write((__be32 *)(th + 1), NULL, &opts);
+       tcp_options_write((__be32 *)(th + 1), NULL, &opts, skb);
        th->doff = (tcp_header_size >> 2);
        __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
 
@@ -3311,13 +3363,13 @@ static void tcp_connect_init(struct sock *sk)
        if (rcv_wnd == 0)
                rcv_wnd = dst_metric(dst, RTAX_INITRWND);
 
-       tcp_select_initial_window(tcp_full_space(sk),
-                                 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
-                                 &tp->rcv_wnd,
-                                 &tp->window_clamp,
-                                 sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
-                                 &rcv_wscale,
-                                 rcv_wnd);
+       tp->ops->select_initial_window(tcp_full_space(sk),
+                                      tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+                                      &tp->rcv_wnd,
+                                      &tp->window_clamp,
+                                      sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
+                                      &rcv_wscale,
+                                      rcv_wnd, sk);
 
        tp->rx_opt.rcv_wscale = rcv_wscale;
        tp->rcv_ssthresh = tp->rcv_wnd;
@@ -3342,6 +3394,36 @@ static void tcp_connect_init(struct sock *sk)
        inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
        inet_csk(sk)->icsk_retransmits = 0;
        tcp_clear_retrans(tp);
+
+#ifdef CONFIG_MPTCP
+       if (sock_flag(sk, SOCK_MPTCP) && mptcp_doit(sk)) {
+               if (is_master_tp(tp)) {
+                       tp->request_mptcp = 1;
+                       mptcp_connect_init(sk);
+               } else if (tp->mptcp) {
+                       struct inet_sock *inet  = inet_sk(sk);
+
+                       tp->mptcp->snt_isn      = tp->write_seq;
+                       tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
+
+                       /* Set nonce for new subflows */
+                       if (sk->sk_family == AF_INET)
+                               tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
+                                                       inet->inet_saddr,
+                                                       inet->inet_daddr,
+                                                       inet->inet_sport,
+                                                       inet->inet_dport);
+#if IS_ENABLED(CONFIG_IPV6)
+                       else
+                               tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
+                                               inet6_sk(sk)->saddr.s6_addr32,
+                                               sk->sk_v6_daddr.s6_addr32,
+                                               inet->inet_sport,
+                                               inet->inet_dport);
+#endif
+               }
+       }
+#endif
 }
 
 static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
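For context: the SOCK_MPTCP flag tested in the hunk above is set per socket from userspace. A minimal sketch of that opt-in, assuming the MPTCP_ENABLED socket option this series adds to the uapi headers (the fallback value below matches MPTCP v0.94; the patched <linux/tcp.h> should be preferred):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 42	/* assumed value, taken from MPTCP v0.94's uapi headers */
#endif

/* Open a TCP socket and request MP_CAPABLE on the upcoming connect().
 * With mptcp_enabled=1 this is already the default; with mptcp_enabled=2
 * it is the only way to get an MPTCP connection.
 */
int open_mptcp_socket(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	if (fd < 0)
		return -1;

	if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)) < 0)
		perror("setsockopt(MPTCP_ENABLED)");	/* connection falls back to plain TCP */

	return fd;
}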
@@ -3614,7 +3696,7 @@ void tcp_send_ack(struct sock *sk)
  * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
  * out-of-date with SND.UNA-1 to probe window.
  */
-static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
+int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
@@ -3700,7 +3782,7 @@ void tcp_send_probe0(struct sock *sk)
        unsigned long probe_max;
        int err;
 
-       err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
+       err = tp->ops->write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
        if (tp->packets_out || !tcp_send_head(sk)) {
                /* Cancel probe timer, if it is not required. */
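The tp->ops indirection used in the hunks above is the central mechanism of this series: plain TCP keeps the stock helpers, while MPTCP subflows install variants that account for the meta-level connection. A condensed sketch of the table, with member signatures inferred from the call sites in this file (the authoritative definition is the one this series adds to include/net/tcp.h; the struct name here is illustrative):

struct tcp_sock_ops_sketch {
	void (*select_initial_window)(int __space, __u32 mss,
				      __u32 *rcv_wnd, __u32 *window_clamp,
				      int wscale_ok, __u8 *rcv_wscale,
				      __u32 init_rcv_wnd, const struct sock *sk);
	int  (*write_wakeup)(struct sock *sk, int mib);
	void (*send_active_reset)(struct sock *sk, gfp_t priority);
	void (*retransmit_timer)(struct sock *sk);
	void (*time_wait)(struct sock *sk, int state, int timeo);
};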
index a845b7692c1b0d0cce8f68345db8d4f3fa812c68..4b660295f597215a39539f4cb38fa27b5296c85e 100644 (file)
@@ -20,6 +20,7 @@
 
 #include <linux/module.h>
 #include <linux/gfp.h>
+#include <net/mptcp.h>
 #include <net/tcp.h>
 
 int sysctl_tcp_thin_linear_timeouts __read_mostly;
@@ -31,7 +32,7 @@ int sysctl_tcp_thin_linear_timeouts __read_mostly;
  *  Returns: Nothing (void)
  */
 
-static void tcp_write_err(struct sock *sk)
+void tcp_write_err(struct sock *sk)
 {
        sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
        sk->sk_error_report(sk);
@@ -87,7 +88,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
                    (!tp->snd_wnd && !tp->packets_out))
                        do_reset = true;
                if (do_reset)
-                       tcp_send_active_reset(sk, GFP_ATOMIC);
+                       tp->ops->send_active_reset(sk, GFP_ATOMIC);
                tcp_done(sk);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
                return 1;
@@ -161,9 +162,9 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
  * after "boundary" unsuccessful, exponentially backed-off
  * retransmissions with an initial RTO of TCP_RTO_MIN.
  */
-static bool retransmits_timed_out(struct sock *sk,
-                                 unsigned int boundary,
-                                 unsigned int timeout)
+bool retransmits_timed_out(struct sock *sk,
+                          unsigned int boundary,
+                          unsigned int timeout)
 {
        const unsigned int rto_base = TCP_RTO_MIN;
        unsigned int linear_backoff_thresh, start_ts;
@@ -188,7 +189,7 @@ static bool retransmits_timed_out(struct sock *sk,
 }
 
 /* A write timeout has occurred. Process the after effects. */
-static int tcp_write_timeout(struct sock *sk)
+int tcp_write_timeout(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
@@ -208,6 +209,17 @@ static int tcp_write_timeout(struct sock *sk)
                        sk_rethink_txhash(sk);
                }
                retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
+
+#ifdef CONFIG_MPTCP
+               /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
+               if (tcp_sk(sk)->request_mptcp &&
+                   icsk->icsk_retransmits >= sysctl_mptcp_syn_retries) {
+                       tcp_sk(sk)->request_mptcp = 0;
+
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLERETRANSFALLBACK);
+               }
+#endif /* CONFIG_MPTCP */
+
                expired = icsk->icsk_retransmits >= retry_until;
        } else {
                if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
@@ -303,18 +315,22 @@ out:
 static void tcp_delack_timer(unsigned long data)
 {
        struct sock *sk = (struct sock *)data;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
 
-       bh_lock_sock(sk);
-       if (!sock_owned_by_user(sk)) {
+       bh_lock_sock(meta_sk);
+       if (!sock_owned_by_user(meta_sk)) {
                tcp_delack_timer_handler(sk);
        } else {
                inet_csk(sk)->icsk_ack.blocked = 1;
-               __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+               __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
                /* delegate our work to tcp_release_cb() */
                if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
                        sock_hold(sk);
+               if (mptcp(tp))
+                       mptcp_tsq_flags(sk);
        }
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
        sock_put(sk);
 }
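tcp_delack_timer() above, and tcp_write_timer() and tcp_keepalive_timer() below, all repeat the same pattern: serialize on the meta-socket, because the owner of an MPTCP connection always holds the meta-socket, never the subflow. A condensed sketch of the pattern, not part of the patch (mptcp(), mptcp_meta_sk() and mptcp_tsq_flags() are provided by this series):

static void mptcp_aware_timer(struct sock *sk, int deferred_bit,
			      void (*handler)(struct sock *sk))
{
	struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;

	bh_lock_sock(meta_sk);
	if (!sock_owned_by_user(meta_sk)) {
		handler(sk);
	} else {
		/* Owner is active: defer the work to tcp_release_cb() and
		 * flag the meta-socket so the subflow's work is not lost.
		 */
		if (!test_and_set_bit(deferred_bit, &sk->sk_tsq_flags))
			sock_hold(sk);
		if (mptcp(tcp_sk(sk)))
			mptcp_tsq_flags(sk);
	}
	bh_unlock_sock(meta_sk);
	sock_put(sk);
}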
 
@@ -575,7 +591,7 @@ void tcp_write_timer_handler(struct sock *sk)
                break;
        case ICSK_TIME_RETRANS:
                icsk->icsk_pending = 0;
-               tcp_retransmit_timer(sk);
+               tcp_sk(sk)->ops->retransmit_timer(sk);
                break;
        case ICSK_TIME_PROBE0:
                icsk->icsk_pending = 0;
@@ -590,16 +606,19 @@ out:
 static void tcp_write_timer(unsigned long data)
 {
        struct sock *sk = (struct sock *)data;
+       struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
 
-       bh_lock_sock(sk);
-       if (!sock_owned_by_user(sk)) {
+       bh_lock_sock(meta_sk);
+       if (!sock_owned_by_user(meta_sk)) {
                tcp_write_timer_handler(sk);
        } else {
                /* delegate our work to tcp_release_cb() */
                if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
                        sock_hold(sk);
+               if (mptcp(tcp_sk(sk)))
+                       mptcp_tsq_flags(sk);
        }
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
        sock_put(sk);
 }
 
@@ -629,11 +648,12 @@ static void tcp_keepalive_timer (unsigned long data)
        struct sock *sk = (struct sock *) data;
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
        u32 elapsed;
 
        /* Only process if socket is not in use. */
-       bh_lock_sock(sk);
-       if (sock_owned_by_user(sk)) {
+       bh_lock_sock(meta_sk);
+       if (sock_owned_by_user(meta_sk)) {
                /* Try again later. */
                inet_csk_reset_keepalive_timer (sk, HZ/20);
                goto out;
@@ -644,17 +664,31 @@ static void tcp_keepalive_timer (unsigned long data)
                goto out;
        }
 
+       if (tp->send_mp_fclose) {
+               if (icsk->icsk_retransmits >= MPTCP_FASTCLOSE_RETRIES) {
+                       tcp_write_err(sk);
+                       goto out;
+               }
+
+               tcp_send_ack(sk);
+               icsk->icsk_retransmits++;
+
+               icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+               elapsed = icsk->icsk_rto;
+               goto resched;
+       }
+
        tcp_mstamp_refresh(tp);
        if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
                if (tp->linger2 >= 0) {
                        const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
 
                        if (tmo > 0) {
-                               tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+                               tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
                                goto out;
                        }
                }
-               tcp_send_active_reset(sk, GFP_ATOMIC);
+               tp->ops->send_active_reset(sk, GFP_ATOMIC);
                goto death;
        }
 
@@ -679,11 +713,11 @@ static void tcp_keepalive_timer (unsigned long data)
                    icsk->icsk_probes_out > 0) ||
                    (icsk->icsk_user_timeout == 0 &&
                    icsk->icsk_probes_out >= keepalive_probes(tp))) {
-                       tcp_send_active_reset(sk, GFP_ATOMIC);
+                       tp->ops->send_active_reset(sk, GFP_ATOMIC);
                        tcp_write_err(sk);
                        goto out;
                }
-               if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
+               if (tp->ops->write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
                        icsk->icsk_probes_out++;
                        elapsed = keepalive_intvl_when(tp);
                } else {
@@ -707,7 +741,7 @@ death:
        tcp_done(sk);
 
 out:
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
        sock_put(sk);
 }
 
index 6a76e41e6d51695553d281900f73b28e9af4f814..b9ad831ff079fee7d94174ce6024c262400a9621 100644 (file)
@@ -928,6 +928,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
 
        kfree_rcu(ifp, rcu);
 }
+EXPORT_SYMBOL(inet6_ifa_finish_destroy);
 
 static void
 ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
index 9ccbf74deb99c2af94a954bef605548f0b5866cc..f5d6839337fd04bb4b6f98a3470d76ff6b730ffa 100644 (file)
@@ -107,8 +107,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
        return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
 }
 
-static int inet6_create(struct net *net, struct socket *sock, int protocol,
-                       int kern)
+int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
 {
        struct inet_sock *inet;
        struct ipv6_pinfo *np;
index 5c91b05c8d8feb3207741a4d9b821df7e5a24cc6..2134e89805e9ee7e0f44a2602ae58e51a89ffef5 100644 (file)
@@ -48,6 +48,8 @@
 #include <net/addrconf.h>
 #include <net/inet_common.h>
 #include <net/tcp.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
 #include <net/udp.h>
 #include <net/udplite.h>
 #include <net/xfrm.h>
@@ -216,7 +218,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
                                sock_prot_inuse_add(net, &tcp_prot, 1);
                                local_bh_enable();
                                sk->sk_prot = &tcp_prot;
-                               icsk->icsk_af_ops = &ipv4_specific;
+#ifdef CONFIG_MPTCP
+                               if (sock_flag(sk, SOCK_MPTCP))
+                                       icsk->icsk_af_ops = &mptcp_v4_specific;
+                               else
+#endif
+                                       icsk->icsk_af_ops = &ipv4_specific;
                                sk->sk_socket->ops = &inet_stream_ops;
                                sk->sk_family = PF_INET;
                                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
index 4e7817abc0b934fbff21ba481c3f6773475c7a63..063542fb69a0d8a36c8f7594a4a41bf307c0d488 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/kernel.h>
 #include <net/secure_seq.h>
 #include <net/ipv6.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v6.h>
 #include <net/tcp.h>
 
 #define COOKIEBITS 24  /* Upper bits store count */
@@ -111,7 +113,8 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
 }
 EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence);
 
-__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v6_init_sequence(struct request_sock *req, const struct sock *sk,
+                             const struct sk_buff *skb, __u16 *mssp)
 {
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        const struct tcphdr *th = tcp_hdr(skb);
@@ -133,6 +136,7 @@ EXPORT_SYMBOL_GPL(__cookie_v6_check);
 struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_options_received tcp_opt;
+       struct mptcp_options_received mopt;
        struct inet_request_sock *ireq;
        struct tcp_request_sock *treq;
        struct ipv6_pinfo *np = inet6_sk(sk);
@@ -162,7 +166,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
        /* check for timestamp cookie support */
        memset(&tcp_opt, 0, sizeof(tcp_opt));
-       tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+       mptcp_init_mp_opt(&mopt);
+       tcp_parse_options(sock_net(sk), skb, &tcp_opt, &mopt, 0, NULL, NULL);
 
        if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
                tsoff = secure_tcpv6_ts_off(sock_net(sk),
@@ -175,14 +180,27 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
                goto out;
 
        ret = NULL;
-       req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
+#ifdef CONFIG_MPTCP
+       if (mopt.saw_mpc)
+               req = inet_reqsk_alloc(&mptcp6_request_sock_ops, sk, false);
+       else
+#endif
+               req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
        if (!req)
                goto out;
 
        ireq = inet_rsk(req);
+       ireq->mptcp_rqsk = 0;
+       ireq->saw_mpc = 0;
        treq = tcp_rsk(req);
        treq->tfo_listener = false;
 
+       /* Must be done before anything else, as it initializes
+        * hash_entry of the MPTCP request-sock.
+        */
+       if (mopt.saw_mpc)
+               mptcp_cookies_reqsk_init(req, &mopt, skb);
+
        if (security_inet_conn_request(sk, skb, req))
                goto out_free;
 
@@ -244,10 +262,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
        }
 
        req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
-       tcp_select_initial_window(tcp_full_space(sk), req->mss,
-                                 &req->rsk_rcv_wnd, &req->rsk_window_clamp,
-                                 ireq->wscale_ok, &rcv_wscale,
-                                 dst_metric(dst, RTAX_INITRWND));
+       tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
+                                      &req->rsk_rcv_wnd, &req->rsk_window_clamp,
+                                      ireq->wscale_ok, &rcv_wscale,
+                                      dst_metric(dst, RTAX_INITRWND), sk);
 
        ireq->rcv_wscale = rcv_wscale;
        ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
index ba8586aadffabe122a288bfc65ab6d4a7fff62de..3caea2947b4564a95d66693e48a1537105055d18 100644 (file)
@@ -61,6 +61,8 @@
 #include <net/timewait_sock.h>
 #include <net/inet_common.h>
 #include <net/secure_seq.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v6.h>
 #include <net/busy_poll.h>
 
 #include <linux/proc_fs.h>
 #include <crypto/hash.h>
 #include <linux/scatterlist.h>
 
-static void    tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
-static void    tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
-                                     struct request_sock *req);
-
-static int     tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-
-static const struct inet_connection_sock_af_ops ipv6_mapped;
-static const struct inet_connection_sock_af_ops ipv6_specific;
 #ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
 static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
@@ -88,7 +82,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
 }
 #endif
 
-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
+void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 {
        struct dst_entry *dst = skb_dst(skb);
 
@@ -115,7 +109,7 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
                                   ipv6_hdr(skb)->saddr.s6_addr32);
 }
 
-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                          int addr_len)
 {
        struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
@@ -213,7 +207,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
                sin.sin_port = usin->sin6_port;
                sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
 
-               icsk->icsk_af_ops = &ipv6_mapped;
+#ifdef CONFIG_MPTCP
+               if (sock_flag(sk, SOCK_MPTCP))
+                       icsk->icsk_af_ops = &mptcp_v6_mapped;
+               else
+#endif
+                       icsk->icsk_af_ops = &ipv6_mapped;
                sk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
                tp->af_specific = &tcp_sock_ipv6_mapped_specific;
@@ -223,7 +222,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
                if (err) {
                        icsk->icsk_ext_hdr_len = exthdrlen;
-                       icsk->icsk_af_ops = &ipv6_specific;
+#ifdef CONFIG_MPTCP
+                       if (sock_flag(sk, SOCK_MPTCP))
+                               icsk->icsk_af_ops = &mptcp_v6_specific;
+                       else
+#endif
+                               icsk->icsk_af_ops = &ipv6_specific;
                        sk->sk_backlog_rcv = tcp_v6_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
                        tp->af_specific = &tcp_sock_ipv6_specific;
@@ -316,7 +320,7 @@ failure:
        return err;
 }
 
-static void tcp_v6_mtu_reduced(struct sock *sk)
+void tcp_v6_mtu_reduced(struct sock *sk)
 {
        struct dst_entry *dst;
 
@@ -343,7 +347,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        struct ipv6_pinfo *np;
        struct tcp_sock *tp;
        __u32 seq, snd_una;
-       struct sock *sk;
+       struct sock *sk, *meta_sk;
        bool fatal;
        int err;
 
@@ -367,8 +371,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                return tcp_req_err(sk, seq, fatal);
 
-       bh_lock_sock(sk);
-       if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
+       tp = tcp_sk(sk);
+       if (mptcp(tp))
+               meta_sk = mptcp_meta_sk(sk);
+       else
+               meta_sk = sk;
+
+       bh_lock_sock(meta_sk);
+       if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
                __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
        if (sk->sk_state == TCP_CLOSE)
@@ -379,7 +389,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                goto out;
        }
 
-       tp = tcp_sk(sk);
        /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
        fastopen = tp->fastopen_rsk;
        snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
@@ -413,11 +422,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                        goto out;
 
                tp->mtu_info = ntohl(info);
-               if (!sock_owned_by_user(sk))
+               if (!sock_owned_by_user(meta_sk)) {
                        tcp_v6_mtu_reduced(sk);
-               else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
-                                          &sk->sk_tsq_flags))
-                       sock_hold(sk);
+               } else {
+                       if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
+                                             &sk->sk_tsq_flags))
+                               sock_hold(sk);
+                       if (mptcp(tp))
+                               mptcp_tsq_flags(sk);
+               }
                goto out;
        }
 
@@ -432,7 +445,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                if (fastopen && !fastopen->sk)
                        break;
 
-               if (!sock_owned_by_user(sk)) {
+               if (!sock_owned_by_user(meta_sk)) {
                        sk->sk_err = err;
                        sk->sk_error_report(sk);                /* Wake people up to see the error (see connect in sock.c) */
 
@@ -442,14 +455,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                goto out;
        }
 
-       if (!sock_owned_by_user(sk) && np->recverr) {
+       if (!sock_owned_by_user(meta_sk) && np->recverr) {
                sk->sk_err = err;
                sk->sk_error_report(sk);
        } else
                sk->sk_err_soft = err;
 
 out:
-       bh_unlock_sock(sk);
+       bh_unlock_sock(meta_sk);
        sock_put(sk);
 }
 
@@ -495,8 +508,7 @@ done:
        return err;
 }
 
-
-static void tcp_v6_reqsk_destructor(struct request_sock *req)
+void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
        kfree(inet_rsk(req)->ipv6_opt);
        kfree_skb(inet_rsk(req)->pktopts);
@@ -714,9 +726,10 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
        return false;
 }
 
-static void tcp_v6_init_req(struct request_sock *req,
-                           const struct sock *sk_listener,
-                           struct sk_buff *skb)
+static int tcp_v6_init_req(struct request_sock *req,
+                          const struct sock *sk_listener,
+                          struct sk_buff *skb,
+                          bool want_cookie)
 {
        struct inet_request_sock *ireq = inet_rsk(req);
        const struct ipv6_pinfo *np = inet6_sk(sk_listener);
@@ -737,6 +750,8 @@ static void tcp_v6_init_req(struct request_sock *req,
                refcount_inc(&skb->users);
                ireq->pktopts = skb;
        }
+
+       return 0;
 }
 
 static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
@@ -756,7 +771,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
        .syn_ack_timeout =      tcp_syn_ack_timeout,
 };
 
-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
        .mss_clamp      =       IPV6_MIN_MTU - sizeof(struct tcphdr) -
                                sizeof(struct ipv6hdr),
 #ifdef CONFIG_TCP_MD5SIG
@@ -774,9 +789,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 };
 
 static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
-                                u32 ack, u32 win, u32 tsval, u32 tsecr,
+                                u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr,
                                 int oif, struct tcp_md5sig_key *key, int rst,
-                                u8 tclass, __be32 label)
+                                u8 tclass, __be32 label, int mptcp)
 {
        const struct tcphdr *th = tcp_hdr(skb);
        struct tcphdr *t1;
@@ -794,7 +809,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
        if (key)
                tot_len += TCPOLEN_MD5SIG_ALIGNED;
 #endif
-
+#ifdef CONFIG_MPTCP
+       if (mptcp)
+               tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
+#endif
        buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
                         GFP_ATOMIC);
        if (!buff)
@@ -832,6 +850,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
                tcp_v6_md5_hash_hdr((__u8 *)topt, key,
                                    &ipv6_hdr(skb)->saddr,
                                    &ipv6_hdr(skb)->daddr, t1);
+               topt += 4;
+       }
+#endif
+#ifdef CONFIG_MPTCP
+       if (mptcp) {
+               /* Construction of 32-bit data_ack */
+               *topt++ = htonl((TCPOPT_MPTCP << 24) |
+                               ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
+                               (0x20 << 8) |
+                               (0x01));
+               *topt++ = htonl(data_ack);
        }
 #endif
 
@@ -878,7 +907,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
        kfree_skb(buff);
 }
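The two 32-bit words appended for MPTCP in tcp_v6_send_response() above form a minimal DSS option carrying only a data-level ACK. A hypothetical decoder, documenting the wire format (RFC 6824: option kind 30 is TCPOPT_MPTCP, subtype 0x2 is DSS, flag 0x01 announces a 4-byte Data ACK):

static inline void mptcp_dump_dss_ack(__be32 w0, __be32 w1)
{
	u32 h = ntohl(w0);

	pr_debug("kind=%u len=%u subtype=%u flags=0x%02x data_ack=%u\n",
		 h >> 24,			/* 30 == TCPOPT_MPTCP */
		 (h >> 16) & 0xff,		/* MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK */
		 ((h >> 8) & 0xff) >> 4,	/* 2 == DSS subtype */
		 h & 0xff,			/* 0x01: Data ACK present, 4 bytes */
		 ntohl(w1));			/* the data-level ACK itself */
}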
 
-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
+void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 {
        const struct tcphdr *th = tcp_hdr(skb);
        u32 seq = 0, ack_seq = 0;
@@ -941,7 +970,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
                          (th->doff << 2);
 
        oif = sk ? sk->sk_bound_dev_if : 0;
-       tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+       tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
 
 #ifdef CONFIG_TCP_MD5SIG
 out:
@@ -950,30 +979,37 @@ out:
 }
 
 static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
-                           u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
+                           u32 ack, u32 data_ack, u32 win, u32 tsval, u32 tsecr, int oif,
                            struct tcp_md5sig_key *key, u8 tclass,
-                           __be32 label)
+                           __be32 label, int mptcp)
 {
-       tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
-                            tclass, label);
+       tcp_v6_send_response(sk, skb, seq, ack, data_ack, win, tsval, tsecr, oif,
+                            key, 0, tclass, label, mptcp);
 }
 
 static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 {
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+       u32 data_ack = 0;
+       int mptcp = 0;
 
+       if (tcptw->mptcp_tw) {
+               data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
+               mptcp = 1;
+       }
        tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+                       data_ack,
                        tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
                        tcp_time_stamp_raw() + tcptw->tw_ts_offset,
                        tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
-                       tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
+                       tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), mptcp);
 
        inet_twsk_put(tw);
 }
 
-static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
-                                 struct request_sock *req)
+void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
+                          struct request_sock *req)
 {
        /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
         * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
@@ -983,18 +1019,18 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
         * exception of <SYN> segments, MUST be right-shifted by
         * Rcv.Wind.Shift bits:
         */
-       tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
+       tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN || is_meta_sk(sk)) ?
                        tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
-                       tcp_rsk(req)->rcv_nxt,
+                       tcp_rsk(req)->rcv_nxt, 0,
                        req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
                        tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
                        req->ts_recent, sk->sk_bound_dev_if,
                        tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
-                       0, 0);
+                       0, 0, 0);
 }
 
 
-static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
+struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
 {
 #ifdef CONFIG_SYN_COOKIES
        const struct tcphdr *th = tcp_hdr(skb);
@@ -1005,7 +1041,7 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
        return sk;
 }
 
-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
        if (skb->protocol == htons(ETH_P_IP))
                return tcp_v4_conn_request(sk, skb);
@@ -1031,11 +1067,11 @@ static void tcp_v6_restore_cb(struct sk_buff *skb)
                sizeof(struct inet6_skb_parm));
 }
 
-static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
-                                        struct request_sock *req,
-                                        struct dst_entry *dst,
-                                        struct request_sock *req_unhash,
-                                        bool *own_req)
+struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
+                                 struct request_sock *req,
+                                 struct dst_entry *dst,
+                                 struct request_sock *req_unhash,
+                                 bool *own_req)
 {
        struct inet_request_sock *ireq;
        struct ipv6_pinfo *newnp;
@@ -1072,7 +1108,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 
                newnp->saddr = newsk->sk_v6_rcv_saddr;
 
-               inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+#ifdef CONFIG_MPTCP
+               /* We must check on the request-socket because the listener
+                * socket's flag may have been changed halfway through.
+                */
+               if (inet_rsk(req)->saw_mpc)
+                       inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
+               else
+#endif
+                       inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
                newsk->sk_backlog_rcv = tcp_v4_do_rcv;
 #ifdef CONFIG_TCP_MD5SIG
                newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
@@ -1119,6 +1163,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
        if (!newsk)
                goto out_nonewsk;
 
+#ifdef CONFIG_MPTCP
+       /* If the meta_sk is v6-mapped we can end up here with the wrong af_ops.
+        * Just make sure that this subflow is v6.
+        */
+       if (is_meta_sk(sk))
+               inet_csk(newsk)->icsk_af_ops = &mptcp_v6_specific;
+#endif
+
        /*
         * No need to charge this sock to the relevant IPv6 refcnt debug socks
         * count here, tcp_create_openreq_child now does this for us, see the
@@ -1247,7 +1299,7 @@ out:
  * This is because we cannot sleep with the original spinlock
  * held.
  */
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct tcp_sock *tp;
@@ -1264,6 +1316,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
        if (skb->protocol == htons(ETH_P_IP))
                return tcp_v4_do_rcv(sk, skb);
 
+       if (is_meta_sk(sk))
+               return mptcp_v6_do_rcv(sk, skb);
+
        /*
         *      socket locking is here for SMP purposes as backlog rcv
         *      is currently called with bh processing disabled.
@@ -1391,6 +1446,10 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
        TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                                    skb->len - th->doff*4);
        TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+#ifdef CONFIG_MPTCP
+       TCP_SKB_CB(skb)->mptcp_flags = 0;
+       TCP_SKB_CB(skb)->dss_off = 0;
+#endif
        TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
        TCP_SKB_CB(skb)->tcp_tw_isn = 0;
        TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
@@ -1404,8 +1463,8 @@ static int tcp_v6_rcv(struct sk_buff *skb)
        int sdif = inet6_sdif(skb);
        const struct tcphdr *th;
        const struct ipv6hdr *hdr;
+       struct sock *sk, *meta_sk = NULL;
        bool refcounted;
-       struct sock *sk;
        int ret;
        struct net *net = dev_net(skb->dev);
 
@@ -1458,12 +1517,34 @@ process:
                        reqsk_put(req);
                        goto csum_error;
                }
-               if (unlikely(sk->sk_state != TCP_LISTEN)) {
+               if (unlikely(sk->sk_state != TCP_LISTEN && !is_meta_sk(sk))) {
                        inet_csk_reqsk_queue_drop_and_put(sk, req);
                        goto lookup;
                }
                sock_hold(sk);
                refcounted = true;
+
+               if (is_meta_sk(sk)) {
+                       bh_lock_sock(sk);
+
+                       if (sock_owned_by_user(sk)) {
+                               skb->sk = sk;
+                               if (unlikely(sk_add_backlog(sk, skb,
+                                                           sk->sk_rcvbuf + sk->sk_sndbuf))) {
+                                       reqsk_put(req);
+
+                                       bh_unlock_sock(sk);
+                                       __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+                                       goto discard_and_relse;
+                               }
+
+                               reqsk_put(req);
+                               bh_unlock_sock(sk);
+                               sock_put(sk);
+
+                               return 0;
+                       }
+               }
                nsk = NULL;
                if (!tcp_filter(sk, skb)) {
                        th = (const struct tcphdr *)skb->data;
@@ -1473,10 +1554,15 @@ process:
                }
                if (!nsk) {
                        reqsk_put(req);
+
+                       if (is_meta_sk(sk))
+                               bh_unlock_sock(sk);
                        goto discard_and_relse;
                }
                if (nsk == sk) {
                        reqsk_put(req);
+                       if (is_meta_sk(sk))
+                               bh_unlock_sock(sk);
                        tcp_v6_restore_cb(skb);
                } else if (tcp_child_process(sk, nsk, skb)) {
                        tcp_v6_send_reset(nsk, skb);
@@ -1486,6 +1572,7 @@ process:
                        return 0;
                }
        }
+
        if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
                __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
                goto discard_and_relse;
@@ -1512,15 +1599,24 @@ process:
 
        sk_incoming_cpu_update(sk);
 
-       bh_lock_sock_nested(sk);
+       if (mptcp(tcp_sk(sk))) {
+               meta_sk = mptcp_meta_sk(sk);
+
+               bh_lock_sock_nested(meta_sk);
+               if (sock_owned_by_user(meta_sk))
+                       skb->sk = sk;
+       } else {
+               meta_sk = sk;
+               bh_lock_sock_nested(sk);
+       }
        tcp_segs_in(tcp_sk(sk), skb);
        ret = 0;
-       if (!sock_owned_by_user(sk)) {
+       if (!sock_owned_by_user(meta_sk))
                ret = tcp_v6_do_rcv(sk, skb);
-       } else if (tcp_add_backlog(sk, skb)) {
+       else if (tcp_add_backlog(meta_sk, skb))
                goto discard_and_relse;
-       }
-       bh_unlock_sock(sk);
+
+       bh_unlock_sock(meta_sk);
 
 put_and_return:
        if (refcounted)
@@ -1533,6 +1629,19 @@ no_tcp_socket:
 
        tcp_v6_fill_cb(skb, hdr, th);
 
+#ifdef CONFIG_MPTCP
+       if (!sk && th->syn && !th->ack) {
+               int ret = mptcp_lookup_join(skb, NULL);
+
+               if (ret < 0) {
+                       tcp_v6_send_reset(NULL, skb);
+                       goto discard_it;
+               } else if (ret > 0) {
+                       return 0;
+               }
+       }
+#endif
+
        if (tcp_checksum_complete(skb)) {
 csum_error:
                __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
@@ -1585,6 +1694,18 @@ do_time_wait:
                        refcounted = false;
                        goto process;
                }
+#ifdef CONFIG_MPTCP
+               if (th->syn && !th->ack) {
+                       int ret = mptcp_lookup_join(skb, inet_twsk(sk));
+
+                       if (ret < 0) {
+                               tcp_v6_send_reset(NULL, skb);
+                               goto discard_it;
+                       } else if (ret > 0) {
+                               return 0;
+                       }
+               }
+#endif
                /* Fall through to ACK */
        }
        case TCP_TW_ACK:
@@ -1638,13 +1759,13 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
        }
 }
 
-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
+struct timewait_sock_ops tcp6_timewait_sock_ops = {
        .twsk_obj_size  = sizeof(struct tcp6_timewait_sock),
        .twsk_unique    = tcp_twsk_unique,
        .twsk_destructor = tcp_twsk_destructor,
 };
 
-static const struct inet_connection_sock_af_ops ipv6_specific = {
+const struct inet_connection_sock_af_ops ipv6_specific = {
        .queue_xmit        = inet6_csk_xmit,
        .send_check        = tcp_v6_send_check,
        .rebuild_header    = inet6_sk_rebuild_header,
@@ -1675,7 +1796,7 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
 /*
  *     TCP over IPv4 via INET6 API
  */
-static const struct inet_connection_sock_af_ops ipv6_mapped = {
+const struct inet_connection_sock_af_ops ipv6_mapped = {
        .queue_xmit        = ip_queue_xmit,
        .send_check        = tcp_v4_send_check,
        .rebuild_header    = inet_sk_rebuild_header,
@@ -1711,7 +1832,12 @@ static int tcp_v6_init_sock(struct sock *sk)
 
        tcp_init_sock(sk);
 
-       icsk->icsk_af_ops = &ipv6_specific;
+#ifdef CONFIG_MPTCP
+       if (sock_flag(sk, SOCK_MPTCP))
+               icsk->icsk_af_ops = &mptcp_v6_specific;
+       else
+#endif
+               icsk->icsk_af_ops = &ipv6_specific;
 
 #ifdef CONFIG_TCP_MD5SIG
        tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
@@ -1720,7 +1846,7 @@ static int tcp_v6_init_sock(struct sock *sk)
        return 0;
 }
 
-static void tcp_v6_destroy_sock(struct sock *sk)
+void tcp_v6_destroy_sock(struct sock *sk)
 {
        tcp_v4_destroy_sock(sk);
        inet6_destroy_sock(sk);
@@ -1954,6 +2080,9 @@ struct proto tcpv6_prot = {
        .compat_getsockopt      = compat_tcp_getsockopt,
 #endif
        .diag_destroy           = tcp_abort,
+#ifdef CONFIG_MPTCP
+       .clear_sk               = mptcp_clear_sk,
+#endif
 };
 
 /* thinking of making this const? Don't.
diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
new file mode 100644 (file)
index 0000000..13cf4d5
--- /dev/null
@@ -0,0 +1,129 @@
+#
+# MPTCP configuration
+#
+config MPTCP
+       bool "MPTCP protocol"
+       depends on (IPV6=y || IPV6=n)
+       ---help---
+         This replaces the normal TCP stack with a Multipath TCP stack,
+         able to use several paths at once.
+
+menuconfig MPTCP_PM_ADVANCED
+       bool "MPTCP: advanced path-manager control"
+       depends on MPTCP=y
+       ---help---
+         Support for selection of different path-managers. You should say 'Y'
+         here, because otherwise the kernel will not actively create new
+         MPTCP subflows.
+
+if MPTCP_PM_ADVANCED
+
+config MPTCP_FULLMESH
+       tristate "MPTCP Full-Mesh Path-Manager"
+       depends on MPTCP=y
+       ---help---
+         This path-management module will create a full-mesh among all IP-addresses.
+
+config MPTCP_NDIFFPORTS
+       tristate "MPTCP ndiff-ports"
+       depends on MPTCP=y
+       ---help---
+         This path-management module will create multiple subflows between the same
+         pair of IP-addresses, modifying the source-port. You can set the number
+         of subflows via the mptcp_ndiffports-sysctl.
+
+config MPTCP_BINDER
+       tristate "MPTCP Binder"
+       depends on (MPTCP=y)
+       ---help---
+         This path-management module works like ndiffports, and adds a sysctl
+         option to set the gateway (and thus the path) for each additional
+         subflow via Loose Source Routing (IPv4 only).
+
+choice
+       prompt "Default MPTCP Path-Manager"
+       default DEFAULT_DUMMY
+       help
+         Select the Path-Manager of your choice
+
+       config DEFAULT_FULLMESH
+               bool "Full mesh" if MPTCP_FULLMESH=y
+
+       config DEFAULT_NDIFFPORTS
+               bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
+
+       config DEFAULT_BINDER
+               bool "binder" if MPTCP_BINDER=y
+
+       config DEFAULT_DUMMY
+               bool "Default"
+
+endchoice
+
+endif
+
+config DEFAULT_MPTCP_PM
+       string
+       default "default" if DEFAULT_DUMMY
+       default "fullmesh" if DEFAULT_FULLMESH
+       default "ndiffports" if DEFAULT_NDIFFPORTS
+       default "binder" if DEFAULT_BINDER
+       default "default"
+
+menuconfig MPTCP_SCHED_ADVANCED
+       bool "MPTCP: advanced scheduler control"
+       depends on MPTCP=y
+       ---help---
+         Support for selection of different schedulers. You should say 'Y'
+         here if you want to use a scheduler other than the default one.
+
+if MPTCP_SCHED_ADVANCED
+
+config MPTCP_ROUNDROBIN
+       tristate "MPTCP Round-Robin"
+       depends on (MPTCP=y)
+       ---help---
+         This is a very simple round-robin scheduler. It probably performs
+         poorly, but it might be interesting for researchers.
+
+config MPTCP_REDUNDANT
+       tristate "MPTCP Redundant"
+       depends on (MPTCP=y)
+       ---help---
+         This scheduler sends all packets redundantly over all subflows, to
+         decrease latency and jitter at the cost of lower throughput.
+
+choice
+       prompt "Default MPTCP Scheduler"
+       default DEFAULT_SCHEDULER
+       help
+         Select the Scheduler of your choice
+
+       config DEFAULT_SCHEDULER
+               bool "Default"
+               ---help---
+                 This is the default scheduler, sending first on the subflow
+                 with the lowest RTT.
+
+       config DEFAULT_ROUNDROBIN
+               bool "Round-Robin" if MPTCP_ROUNDROBIN=y
+               ---help---
+                 This is the round-robin scheduler, sending packets in a
+                 round-robin fashion.
+
+       config DEFAULT_REDUNDANT
+               bool "Redundant" if MPTCP_REDUNDANT=y
+               ---help---
+                 This is the redundant scheduler, sending packets redundantly over
+                 all the subflows.
+
+endchoice
+endif
+
+config DEFAULT_MPTCP_SCHED
+       string
+       depends on (MPTCP=y)
+       default "default" if DEFAULT_SCHEDULER
+       default "roundrobin" if DEFAULT_ROUNDROBIN
+       default "redundant" if DEFAULT_REDUNDANT
+       default "default"
+
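The choices above only seed the boot-time defaults; with MPTCP_SCHED_ADVANCED (and the path-manager counterpart) the active module can be switched at run time. A userspace sketch, assuming the sysctl paths used by MPTCP v0.94 (verify them against the running kernel):

#include <stdio.h>

/* Select the active MPTCP scheduler, e.g. "default", "roundrobin" or
 * "redundant".  The /proc path below is an assumption based on MPTCP
 * v0.94's sysctl registration under net.mptcp.
 */
static int set_mptcp_scheduler(const char *name)
{
	FILE *f = fopen("/proc/sys/net/mptcp/mptcp_scheduler", "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", name);
	return fclose(f);
}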
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
new file mode 100644 (file)
index 0000000..a38e437
--- /dev/null
@@ -0,0 +1,22 @@
+#
+# Makefile for MultiPath TCP support code.
+#
+
+obj-$(CONFIG_MPTCP) += mptcp.o
+
+mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_pm.o \
+          mptcp_output.o mptcp_input.o mptcp_sched.o
+
+obj-$(CONFIG_TCP_CONG_LIA) += mptcp_coupled.o
+obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
+obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
+obj-$(CONFIG_TCP_CONG_BALIA) += mptcp_balia.o
+obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
+obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
+obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
+obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
+obj-$(CONFIG_MPTCP_REDUNDANT) += mptcp_redundant.o
+
+mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
+
diff --git a/net/mptcp/mptcp_balia.c b/net/mptcp/mptcp_balia.c
new file mode 100644 (file)
index 0000000..73f365b
--- /dev/null
@@ -0,0 +1,267 @@
+/*
+ *     MPTCP implementation - Balia Congestion Control
+ *     (Balanced Linked Adaptation Algorithm)
+ *
+ *     Analysis, Design and Implementation:
+ *     Qiuyu Peng <qpeng@caltech.edu>
+ *     Anwar Walid <anwar@research.bell-labs.com>
+ *     Jaehyun Hwang <jhyun.hwang@samsung.com>
+ *     Steven H. Low <slow@caltech.edu>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+
+#include <net/tcp.h>
+#include <net/mptcp.h>
+
+#include <linux/module.h>
+
+/* The variable 'rate' (i.e., x_r) will be scaled down,
+ * e.g., from B/s to KB/s, MB/s, or GB/s,
+ * if max_rate > 2^rate_scale_limit.
+ */
+
+static int rate_scale_limit = 25;
+static int alpha_scale = 10;
+static int scale_num = 5;
+
+struct mptcp_balia {
+       u64     ai;
+       u64     md;
+       bool    forced_update;
+};
+
+static inline int mptcp_balia_sk_can_send(const struct sock *sk)
+{
+       return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
+}
+
+static inline u64 mptcp_get_ai(const struct sock *meta_sk)
+{
+       return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai;
+}
+
+static inline void mptcp_set_ai(const struct sock *meta_sk, u64 ai)
+{
+       ((struct mptcp_balia *)inet_csk_ca(meta_sk))->ai = ai;
+}
+
+static inline u64 mptcp_get_md(const struct sock *meta_sk)
+{
+       return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md;
+}
+
+static inline void mptcp_set_md(const struct sock *meta_sk, u64 md)
+{
+       ((struct mptcp_balia *)inet_csk_ca(meta_sk))->md = md;
+}
+
+static inline u64 mptcp_balia_scale(u64 val, int scale)
+{
+       return (u64) val << scale;
+}
+
+static inline bool mptcp_get_forced(const struct sock *meta_sk)
+{
+       return ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update;
+}
+
+static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
+{
+       ((struct mptcp_balia *)inet_csk_ca(meta_sk))->forced_update = force;
+}
+
+static void mptcp_balia_recalc_ai(const struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       const struct mptcp_cb *mpcb = tp->mpcb;
+       const struct sock *sub_sk;
+       u64 max_rate = 0, rate = 0, sum_rate = 0;
+       u64 alpha, ai = tp->snd_cwnd, md = (tp->snd_cwnd >> 1);
+       int num_scale_down = 0;
+
+       if (!mpcb)
+               return;
+
+       /* Only one subflow left - fall back to normal reno-behavior */
+       if (mpcb->cnt_established <= 1)
+               goto exit;
+
+       /* Find max_rate first */
+       mptcp_for_each_sk(mpcb, sub_sk) {
+               struct tcp_sock *sub_tp = tcp_sk(sub_sk);
+               u64 tmp;
+
+               if (!mptcp_balia_sk_can_send(sub_sk))
+                       continue;
+
+               tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd
+                               * (USEC_PER_SEC << 3), sub_tp->srtt_us);
+               sum_rate += tmp;
+
+               if (tp == sub_tp)
+                       rate = tmp;
+
+               if (tmp >= max_rate)
+                       max_rate = tmp;
+       }
+
+       /* At least, the current subflow should be able to send */
+       if (unlikely(!rate))
+               goto exit;
+
+       alpha = div64_u64(max_rate, rate);
+
+       /* Scale down max_rate if it is too high (e.g., >2^25) */
+       while (max_rate > mptcp_balia_scale(1, rate_scale_limit)) {
+               max_rate >>= scale_num;
+               num_scale_down++;
+       }
+
+       if (num_scale_down) {
+               sum_rate = 0;
+               mptcp_for_each_sk(mpcb, sub_sk) {
+                       struct tcp_sock *sub_tp = tcp_sk(sub_sk);
+                       u64 tmp;
+
+                       if (!mptcp_balia_sk_can_send(sub_sk))
+                               continue;
+
+                       tmp = div_u64((u64)tp->mss_cache * sub_tp->snd_cwnd
+                               * (USEC_PER_SEC << 3), sub_tp->srtt_us);
+                       tmp >>= (scale_num * num_scale_down);
+
+                       sum_rate += tmp;
+               }
+               rate >>= (scale_num * num_scale_down);
+       }
+
+       /*      (sum_rate)^2 * 10 * w_r
+        * ai = ------------------------------------
+        *      (x_r + max_rate) * (4x_r + max_rate)
+        */
+       sum_rate *= sum_rate;
+
+       ai = div64_u64(sum_rate * 10, rate + max_rate);
+       ai = div64_u64(ai * tp->snd_cwnd, (rate << 2) + max_rate);
+
+       if (unlikely(!ai))
+               ai = tp->snd_cwnd;
+
+       md = ((tp->snd_cwnd >> 1) * min(mptcp_balia_scale(alpha, alpha_scale),
+                                       mptcp_balia_scale(3, alpha_scale) >> 1))
+                                       >> alpha_scale;
+
+exit:
+       mptcp_set_ai(sk, ai);
+       mptcp_set_md(sk, md);
+}
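+
+/* Worked instance of the formula above, for two symmetric subflows with
+ * rate x and cwnd w each: sum_rate = 2x, max_rate = x, alpha = 1, so
+ *   ai = (2x)^2 * 10 * w / ((x + x) * (4x + x)) = 40x^2*w / 10x^2 = 4w
+ * and md = w/2.  mptcp_balia_cong_avoid() below only raises snd_cwnd once
+ * snd_cwnd_cnt reaches ai, so each subflow grows four times more slowly
+ * than a lone Reno flow would -- the coupling Balia is designed to give.
+ */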
+
+static void mptcp_balia_init(struct sock *sk)
+{
+       if (mptcp(tcp_sk(sk))) {
+               mptcp_set_forced(sk, 0);
+               mptcp_set_ai(sk, 0);
+               mptcp_set_md(sk, 0);
+       }
+}
+
+static void mptcp_balia_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+       if (event == CA_EVENT_COMPLETE_CWR || event == CA_EVENT_LOSS)
+               mptcp_balia_recalc_ai(sk);
+}
+
+static void mptcp_balia_set_state(struct sock *sk, u8 ca_state)
+{
+       if (!mptcp(tcp_sk(sk)))
+               return;
+
+       mptcp_set_forced(sk, 1);
+}
+
+static void mptcp_balia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       const struct mptcp_cb *mpcb = tp->mpcb;
+       int snd_cwnd;
+
+       if (!mptcp(tp)) {
+               tcp_reno_cong_avoid(sk, ack, acked);
+               return;
+       }
+
+       if (!tcp_is_cwnd_limited(sk))
+               return;
+
+       if (tcp_in_slow_start(tp)) {
+               /* In "safe" area, increase. */
+               tcp_slow_start(tp, acked);
+               mptcp_balia_recalc_ai(sk);
+               return;
+       }
+
+       if (mptcp_get_forced(mptcp_meta_sk(sk))) {
+               mptcp_balia_recalc_ai(sk);
+               mptcp_set_forced(sk, 0);
+       }
+
+       if (mpcb->cnt_established > 1)
+               snd_cwnd = (int) mptcp_get_ai(sk);
+       else
+               snd_cwnd = tp->snd_cwnd;
+
+       if (tp->snd_cwnd_cnt >= snd_cwnd) {
+               if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+                       tp->snd_cwnd++;
+                       mptcp_balia_recalc_ai(sk);
+               }
+
+               tp->snd_cwnd_cnt = 0;
+       } else {
+               tp->snd_cwnd_cnt++;
+       }
+}
+
+static u32 mptcp_balia_ssthresh(struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       const struct mptcp_cb *mpcb = tp->mpcb;
+
+       if (unlikely(!mptcp(tp) || mpcb->cnt_established <= 1))
+               return tcp_reno_ssthresh(sk);
+       else
+               return max((u32)(tp->snd_cwnd - mptcp_get_md(sk)), 1U);
+}
+
+static struct tcp_congestion_ops mptcp_balia = {
+       .init           = mptcp_balia_init,
+       .ssthresh       = mptcp_balia_ssthresh,
+       .cong_avoid     = mptcp_balia_cong_avoid,
+       .cwnd_event     = mptcp_balia_cwnd_event,
+       .set_state      = mptcp_balia_set_state,
+       .owner          = THIS_MODULE,
+       .name           = "balia",
+};
+
+static int __init mptcp_balia_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct mptcp_balia) > ICSK_CA_PRIV_SIZE);
+       return tcp_register_congestion_control(&mptcp_balia);
+}
+
+static void __exit mptcp_balia_unregister(void)
+{
+       tcp_unregister_congestion_control(&mptcp_balia);
+}
+
+module_init(mptcp_balia_register);
+module_exit(mptcp_balia_unregister);
+
+MODULE_AUTHOR("Jaehyun Hwang, Anwar Walid, Qiuyu Peng, Steven H. Low");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP BALIA CONGESTION CONTROL ALGORITHM");
+MODULE_VERSION("0.1");
diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
new file mode 100644 (file)
index 0000000..17b1eb3
--- /dev/null
@@ -0,0 +1,484 @@
+#include <linux/module.h>
+
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+
+#include <linux/route.h>
+#include <linux/inet.h>
+#include <linux/mroute.h>
+#include <linux/spinlock_types.h>
+#include <net/inet_ecn.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#include <net/compat.h>
+#include <linux/slab.h>
+
+#define MPTCP_GW_MAX_LISTS     10
+#define MPTCP_GW_LIST_MAX_LEN  6
+#define MPTCP_GW_SYSCTL_MAX_LEN        (15 * MPTCP_GW_LIST_MAX_LEN *   \
+                                                       MPTCP_GW_MAX_LISTS)
+
+struct mptcp_gw_list {
+       struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
+       u8 len[MPTCP_GW_MAX_LISTS];
+};
+
+struct binder_priv {
+       /* Worker struct for subflow establishment */
+       struct work_struct subflow_work;
+
+       struct mptcp_cb *mpcb;
+
+       /* Prevent multiple sub-sockets concurrently iterating over sockets */
+       spinlock_t *flow_lock;
+};
+
+static struct mptcp_gw_list *mptcp_gws;
+static rwlock_t mptcp_gws_lock;
+
+static int mptcp_binder_ndiffports __read_mostly = 1;
+
+static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
+
+static int mptcp_get_avail_list_ipv4(struct sock *sk)
+{
+       int i, j, list_taken, opt_ret, opt_len;
+       unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
+
+       for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
+               if (mptcp_gws->len[i] == 0)
+                       goto error;
+
+               mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
+               list_taken = 0;
+
+               /* Loop through all sub-sockets in this connection */
+               mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) {
+                       mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
+
+                       /* Reset length and options buffer, then retrieve
+                        * from socket
+                        */
+                       opt_len = MAX_IPOPTLEN;
+                       memset(opt, 0, MAX_IPOPTLEN);
+                       opt_ret = ip_getsockopt(sk, IPPROTO_IP,
+                               IP_OPTIONS, (char __user *)opt, (int __user *)&opt_len);
+                       if (opt_ret < 0) {
+                               mptcp_debug("%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
+                                           __func__, opt_ret);
+                               goto error;
+                       }
+
+                       /* If socket has no options, it has no stake in this list */
+                       if (opt_len <= 0)
+                               continue;
+
+                       /* Iterate options buffer */
+                       for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
+                               if (*opt_ptr == IPOPT_LSRR) {
+                                       mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
+                                       goto sock_lsrr;
+                               }
+                       }
+                       continue;
+
+sock_lsrr:
+                       /* Pointer to the 2nd to last address */
+                       opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
+
+                       /* Addresses start 3 bytes after type offset */
+                       opt_ptr += 3;
+                       j = 0;
+
+                       /* Different length lists cannot be the same */
+                       if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
+                               continue;
+
+                       /* Iterate if we are still inside options list
+                        * and sysctl list
+                        */
+                       while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
+                               /* If there is a different address, this list must
+                                * not be set on this socket
+                                */
+                               if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
+                                       break;
+
+                               /* Jump 4 bytes to next address */
+                               opt_ptr += 4;
+                               j++;
+                       }
+
+                       /* Reached the end without a differing address, lists
+                        * are therefore identical.
+                        */
+                       if (j == mptcp_gws->len[i]) {
+                               mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
+                               list_taken = 1;
+                               break;
+                       }
+               }
+
+               /* Free list found if not taken by a socket */
+               if (!list_taken) {
+                       mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
+                       break;
+               }
+       }
+
+       if (i >= MPTCP_GW_MAX_LISTS)
+               goto error;
+
+       return i;
+error:
+       return -1;
+}
+
+/* The list of addresses is parsed each time a new connection is opened,
+ *  to make sure it's up to date. In case of error, all the lists are
+ *  marked as unavailable and the subflow's fingerprint is set to 0.
+ */
+static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
+{
+       int i, j, ret;
+       unsigned char opt[MAX_IPOPTLEN] = {0};
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
+
+       /* Read lock: multiple sockets can read LSRR addresses at the same
+        * time, but writes are done in mutual exclusion.
+        * Spin lock: must search for free list for one socket at a time, or
+        * multiple sockets could take the same list.
+        */
+       read_lock(&mptcp_gws_lock);
+       spin_lock(fmp->flow_lock);
+
+       i = mptcp_get_avail_list_ipv4(sk);
+
+       /* A free list was found only if i is non-negative. */
+       if (i >= 0) {
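+               /* Build the LSRR option: one NOP for alignment, then the
+                * option type, the option length (3 header bytes plus 4
+                * bytes per address, final destination included), and the
+                * pointer to the first address (IPOPT_MINOFF).
+                */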
+               opt[0] = IPOPT_NOP;
+               opt[1] = IPOPT_LSRR;
+               opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
+                               (mptcp_gws->len[i] + 1) + 3;
+               opt[3] = IPOPT_MINOFF;
+               for (j = 0; j < mptcp_gws->len[i]; ++j)
+                       memcpy(opt + 4 +
+                               (j * sizeof(mptcp_gws->list[i][0].s_addr)),
+                               &mptcp_gws->list[i][j].s_addr,
+                               sizeof(mptcp_gws->list[i][0].s_addr));
+               /* Final destination must be part of IP_OPTIONS parameter. */
+               memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
+                      sizeof(addr.s_addr));
+
+               /* setsockopt must be inside the lock, otherwise another
+                * subflow could fail to see that we have taken a list.
+                */
+               ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, (char __user *)opt,
+                                   4 + sizeof(mptcp_gws->list[i][0].s_addr) * (mptcp_gws->len[i] + 1));
+
+               if (ret < 0) {
+                       mptcp_debug("%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
+                                   __func__, ret);
+               }
+       }
+
+       spin_unlock(fmp->flow_lock);
+       read_unlock(&mptcp_gws_lock);
+}
+
+/* Parses the gateways string for a list of paths to different
+ * gateways, and stores them for use with the Loose Source Routing (LSRR)
+ * socket option. Each list must contain ","-separated addresses, and the
+ * lists themselves must be separated by "-". Returns -1 if one or more of
+ * the addresses is not a valid IPv4 address.
+ */
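+/* A hypothetical example (addresses made up for illustration): writing
+ *   "10.0.0.1,10.0.0.2-10.0.1.1"
+ * to /proc/sys/net/mptcp/mptcp_binder_gateways defines two gateway lists,
+ * {10.0.0.1, 10.0.0.2} and {10.0.1.1}.
+ */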
+static int mptcp_parse_gateway_ipv4(char *gateways)
+{
+       int i, j, k, ret;
+       char *tmp_string = NULL;
+       struct in_addr tmp_addr;
+
+       tmp_string = kzalloc(16, GFP_KERNEL);
+       if (tmp_string == NULL)
+               return -ENOMEM;
+
+       write_lock(&mptcp_gws_lock);
+
+       memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
+
+       /* A temporary string is used since in4_pton needs a NUL-terminated
+        * string, but we do not want to modify the sysctl for obvious reasons.
+        * i will iterate over the SYSCTL string, j will iterate over the
+        * temporary string where each IP is copied into, and k will iterate
+        * over the IPs in each list.
+        */
+       for (i = j = k = 0;
+                       i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
+                       ++i) {
+               if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
+                       /* If the temp IP is empty and the current list is
+                        * empty, we are done.
+                        */
+                       if (j == 0 && mptcp_gws->len[k] == 0)
+                               break;
+
+                       /* Terminate the temp IP string, then if it is
+                        * non-empty parse the IP and copy it.
+                        */
+                       tmp_string[j] = '\0';
+                       if (j > 0) {
+                               mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
+
+                               ret = in4_pton(tmp_string, strlen(tmp_string),
+                                               (u8 *)&tmp_addr.s_addr, '\0',
+                                               NULL);
+
+                               if (ret) {
+                                       mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
+                                                   ret,
+                                                   &tmp_addr.s_addr);
+                                       memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
+                                              &tmp_addr.s_addr,
+                                              sizeof(tmp_addr.s_addr));
+                                       mptcp_gws->len[k]++;
+                                       j = 0;
+                                       tmp_string[j] = '\0';
+                                       /* Since we can't impose a limit on
+                                        * what the user can input, make sure
+                                        * there are not too many IPs in the
+                                        * SYSCTL string.
+                                        */
+                                       if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
+                                               mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
+                                                           k,
+                                                           MPTCP_GW_LIST_MAX_LEN);
+                                               goto error;
+                                       }
+                               } else {
+                                       goto error;
+                               }
+                       }
+
+                       if (gateways[i] == '-' || gateways[i] == '\0')
+                               ++k;
+               } else {
+                       tmp_string[j] = gateways[i];
+                       ++j;
+               }
+       }
+
+       /* Number of flows is number of gateway lists plus master flow */
+       mptcp_binder_ndiffports = k+1;
+
+       write_unlock(&mptcp_gws_lock);
+       kfree(tmp_string);
+
+       return 0;
+
+error:
+       memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
+       memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
+       write_unlock(&mptcp_gws_lock);
+       kfree(tmp_string);
+       return -1;
+}
+
+/**
+ * Create all new subflows, by calling mptcp_initX_subsockets.
+ *
+ * This function uses "goto next_subflow" to release the lock between
+ * creating new subflows, giving other processes a chance to do some work
+ * on the socket and potentially finish the communication.
+ **/
+static void create_subflow_worker(struct work_struct *work)
+{
+       const struct binder_priv *pm_priv = container_of(work,
+                                                    struct binder_priv,
+                                                    subflow_work);
+       struct mptcp_cb *mpcb = pm_priv->mpcb;
+       struct sock *meta_sk = mpcb->meta_sk;
+       int iter = 0;
+
+next_subflow:
+       if (iter) {
+               release_sock(meta_sk);
+               mutex_unlock(&mpcb->mpcb_mutex);
+
+               cond_resched();
+       }
+       mutex_lock(&mpcb->mpcb_mutex);
+       lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
+
+       iter++;
+
+       if (sock_flag(meta_sk, SOCK_DEAD))
+               goto exit;
+
+       if (mpcb->master_sk &&
+           !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
+               goto exit;
+
+       if (mptcp_binder_ndiffports > iter &&
+           mptcp_binder_ndiffports > mpcb->cnt_subflows) {
+               struct mptcp_loc4 loc;
+               struct mptcp_rem4 rem;
+
+               loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
+               loc.loc4_id = 0;
+               loc.low_prio = 0;
+
+               rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
+               rem.port = inet_sk(meta_sk)->inet_dport;
+               rem.rem4_id = 0; /* Default 0 */
+
+               mptcp_init4_subsockets(meta_sk, &loc, &rem);
+
+               goto next_subflow;
+       }
+
+exit:
+       release_sock(meta_sk);
+       mutex_unlock(&mpcb->mpcb_mutex);
+       sock_put(meta_sk);
+}
+
+static void binder_new_session(const struct sock *meta_sk)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
+       static DEFINE_SPINLOCK(flow_lock);
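+       /* Note: flow_lock is function-static, and thus shared by every
+        * binder session; free-list selection is serialized globally.
+        */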
+
+#if IS_ENABLED(CONFIG_IPV6)
+       if (meta_sk->sk_family == AF_INET6 &&
+           !mptcp_v6_is_v4_mapped(meta_sk)) {
+               mptcp_fallback_default(mpcb);
+               return;
+       }
+#endif
+
+       /* Initialize workqueue-struct */
+       INIT_WORK(&fmp->subflow_work, create_subflow_worker);
+       fmp->mpcb = mpcb;
+
+       fmp->flow_lock = &flow_lock;
+}
+
+static void binder_create_subflows(struct sock *meta_sk)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
+
+       if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
+           mpcb->send_infinite_mapping ||
+           mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
+               return;
+
+       if (!work_pending(&pm_priv->subflow_work)) {
+               sock_hold(meta_sk);
+               queue_work(mptcp_wq, &pm_priv->subflow_work);
+       }
+}
+
+static int binder_get_local_id(sa_family_t family, union inet_addr *addr,
+                                 struct net *net, bool *low_prio)
+{
+       return 0;
+}
+
+/* Callback function, executed when the sysctl net.mptcp.mptcp_binder_gateways
+ * is updated. Inspired by proc_tcp_congestion_control().
+ */
+static int proc_mptcp_gateways(struct ctl_table *ctl, int write,
+                              void __user *buffer, size_t *lenp,
+                              loff_t *ppos)
+{
+       int ret;
+       struct ctl_table tbl = {
+               .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
+       };
+
+       if (write) {
+               tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
+               if (tbl.data == NULL)
+                       return -ENOMEM;
+               ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+               if (ret == 0) {
+                       ret = mptcp_parse_gateway_ipv4(tbl.data);
+                       memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
+               }
+               kfree(tbl.data);
+       } else {
+               ret = proc_dostring(ctl, write, buffer, lenp, ppos);
+       }
+
+       return ret;
+}
+
+static struct mptcp_pm_ops binder __read_mostly = {
+       .new_session = binder_new_session,
+       .fully_established = binder_create_subflows,
+       .get_local_id = binder_get_local_id,
+       .init_subsocket_v4 = mptcp_v4_add_lsrr,
+       .name = "binder",
+       .owner = THIS_MODULE,
+};
+
+static struct ctl_table binder_table[] = {
+       {
+               .procname = "mptcp_binder_gateways",
+               .data = &sysctl_mptcp_binder_gateways,
+               .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
+               .mode = 0644,
+               .proc_handler = &proc_mptcp_gateways
+       },
+       { }
+};
+
+static struct ctl_table_header *mptcp_sysctl_binder;
+
+/* General initialization of MPTCP_PM */
+static int __init binder_register(void)
+{
+       int ret;
+
+       mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
+       if (!mptcp_gws)
+               return -ENOMEM;
+
+       rwlock_init(&mptcp_gws_lock);
+
+       BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
+
+       mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
+                       binder_table);
+       if (!mptcp_sysctl_binder) {
+               ret = -ENOMEM;
+               goto sysctl_fail;
+       }
+
+       ret = mptcp_register_path_manager(&binder);
+       if (ret)
+               goto pm_failed;
+
+       return 0;
+
+pm_failed:
+       unregister_net_sysctl_table(mptcp_sysctl_binder);
+sysctl_fail:
+       kfree(mptcp_gws);
+
+       return ret;
+}
+
+static void binder_unregister(void)
+{
+       mptcp_unregister_path_manager(&binder);
+       unregister_net_sysctl_table(mptcp_sysctl_binder);
+       kfree(mptcp_gws);
+}
+
+module_init(binder_register);
+module_exit(binder_unregister);
+
+MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BINDER MPTCP");
+MODULE_VERSION("0.1");
diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
new file mode 100644 (file)
index 0000000..2aacf74
--- /dev/null
@@ -0,0 +1,271 @@
+/*
+ *     MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <net/tcp.h>
+#include <net/mptcp.h>
+
+#include <linux/module.h>
+
+/* Scaling is done in the numerator with alpha_scale_num and in the denominator
+ * with alpha_scale_den. These are exponents of two: the numerator is scaled
+ * by 2^alpha_scale_num, the denominator by (2^alpha_scale_den)^2.
+ *
+ * To downscale, we just need to shift right by alpha_scale, since
+ * alpha_scale = alpha_scale_num - 2 * alpha_scale_den (32 - 2 * 10 = 12).
+ */
+static int alpha_scale_den = 10;
+static int alpha_scale_num = 32;
+static int alpha_scale = 12;
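+
+/* For reference: the value computed in mptcp_ccc_recalc_alpha() below is the
+ * RFC 6356 (LIA) coupling factor divided by the total window, i.e.
+ *
+ *   alpha = max_i(cwnd_i / rtt_i^2) / (sum_i(cwnd_i / rtt_i))^2
+ *
+ * kept as a fixed-point value scaled by 2^alpha_scale.
+ */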
+
+struct mptcp_ccc {
+       u64     alpha;
+       bool    forced_update;
+};
+
+static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
+{
+       return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
+}
+
+static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
+{
+       return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
+}
+
+static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
+{
+       ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
+}
+
+static inline u64 mptcp_ccc_scale(u32 val, int scale)
+{
+       return (u64) val << scale;
+}
+
+static inline bool mptcp_get_forced(const struct sock *meta_sk)
+{
+       return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
+}
+
+static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
+{
+       ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
+}
+
+static void mptcp_ccc_recalc_alpha(const struct sock *sk)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+       const struct sock *sub_sk;
+       int best_cwnd = 0, best_rtt = 0, can_send = 0;
+       u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
+
+       if (!mpcb)
+               return;
+
+       /* Only one subflow left - fall back to normal reno-behavior
+        * (set alpha to 1)
+        */
+       if (mpcb->cnt_established <= 1)
+               goto exit;
+
+       /* Do regular alpha-calculation for multiple subflows */
+
+       /* Find the max numerator of the alpha-calculation */
+       mptcp_for_each_sk(mpcb, sub_sk) {
+               struct tcp_sock *sub_tp = tcp_sk(sub_sk);
+               u64 tmp;
+
+               if (!mptcp_ccc_sk_can_send(sub_sk))
+                       continue;
+
+               can_send++;
+
+               /* We need to look for the path that provides the max value.
+                * Integer overflow is not possible here, because
+                * tmp is a u64.
+                */
+               tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
+                               alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
+
+               if (tmp >= max_numerator) {
+                       max_numerator = tmp;
+                       best_cwnd = sub_tp->snd_cwnd;
+                       best_rtt = sub_tp->srtt_us;
+               }
+       }
+
+       /* No subflow is able to send - we don't care anymore */
+       if (unlikely(!can_send))
+               goto exit;
+
+       /* Calculate the denominator */
+       mptcp_for_each_sk(mpcb, sub_sk) {
+               struct tcp_sock *sub_tp = tcp_sk(sub_sk);
+
+               if (!mptcp_ccc_sk_can_send(sub_sk))
+                       continue;
+
+               sum_denominator += div_u64(
+                               mptcp_ccc_scale(sub_tp->snd_cwnd,
+                                               alpha_scale_den) * best_rtt,
+                                               sub_tp->srtt_us);
+       }
+       sum_denominator *= sum_denominator;
+       if (unlikely(!sum_denominator)) {
+               pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
+                      __func__, mpcb->cnt_established);
+               mptcp_for_each_sk(mpcb, sub_sk) {
+                       struct tcp_sock *sub_tp = tcp_sk(sub_sk);
+
+                       pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
+                              __func__, sub_tp->mptcp->path_index,
+                              sub_sk->sk_state, sub_tp->srtt_us,
+                              sub_tp->snd_cwnd);
+               }
+       }
+
+       alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
+
+       if (unlikely(!alpha))
+               alpha = 1;
+
+exit:
+       mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
+}
+
+static void mptcp_ccc_init(struct sock *sk)
+{
+       if (mptcp(tcp_sk(sk))) {
+               mptcp_set_forced(mptcp_meta_sk(sk), 0);
+               mptcp_set_alpha(mptcp_meta_sk(sk), 1);
+       }
+       /* If we are not doing MPTCP, behave like Reno: return */
+}
+
+static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+       if (event == CA_EVENT_LOSS)
+               mptcp_ccc_recalc_alpha(sk);
+}
+
+static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
+{
+       if (!mptcp(tcp_sk(sk)))
+               return;
+
+       mptcp_set_forced(mptcp_meta_sk(sk), 1);
+}
+
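+/* Congestion avoidance for coupled subflows: slow start behaves like Reno;
+ * above ssthresh the window grows by one segment once snd_cwnd_cnt reaches
+ * max(2^alpha_scale / alpha, snd_cwnd), so that the aggregate of all
+ * subflows is no more aggressive than a single TCP flow on the best path.
+ */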
+static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       const struct mptcp_cb *mpcb = tp->mpcb;
+       int snd_cwnd;
+
+       if (!mptcp(tp)) {
+               tcp_reno_cong_avoid(sk, ack, acked);
+               return;
+       }
+
+       if (!tcp_is_cwnd_limited(sk))
+               return;
+
+       if (tcp_in_slow_start(tp)) {
+               /* In "safe" area, increase. */
+               tcp_slow_start(tp, acked);
+               mptcp_ccc_recalc_alpha(sk);
+               return;
+       }
+
+       if (mptcp_get_forced(mptcp_meta_sk(sk))) {
+               mptcp_ccc_recalc_alpha(sk);
+               mptcp_set_forced(mptcp_meta_sk(sk), 0);
+       }
+
+       if (mpcb->cnt_established > 1) {
+               u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
+
+               /* This may happen if, at initialization time, the mpcb
+                * was not yet attached to the sock, and thus
+                * initializing alpha failed.
+                */
+               if (unlikely(!alpha))
+                       alpha = 1;
+
+               snd_cwnd = (int)div_u64((u64)mptcp_ccc_scale(1, alpha_scale),
+                                       alpha);
+
+               /* snd_cwnd_cnt >= max(scale * tot_cwnd / alpha, cwnd)
+                * Thus, we select the max value here.
+                */
+               if (snd_cwnd < tp->snd_cwnd)
+                       snd_cwnd = tp->snd_cwnd;
+       } else {
+               snd_cwnd = tp->snd_cwnd;
+       }
+
+       if (tp->snd_cwnd_cnt >= snd_cwnd) {
+               if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+                       tp->snd_cwnd++;
+                       mptcp_ccc_recalc_alpha(sk);
+               }
+
+               tp->snd_cwnd_cnt = 0;
+       } else {
+               tp->snd_cwnd_cnt++;
+       }
+}
+
+static struct tcp_congestion_ops mptcp_ccc = {
+       .init           = mptcp_ccc_init,
+       .ssthresh       = tcp_reno_ssthresh,
+       .cong_avoid     = mptcp_ccc_cong_avoid,
+       .cwnd_event     = mptcp_ccc_cwnd_event,
+       .set_state      = mptcp_ccc_set_state,
+       .owner          = THIS_MODULE,
+       .name           = "lia",
+};
+
+static int __init mptcp_ccc_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
+       return tcp_register_congestion_control(&mptcp_ccc);
+}
+
+static void __exit mptcp_ccc_unregister(void)
+{
+       tcp_unregister_congestion_control(&mptcp_ccc);
+}
+
+module_init(mptcp_ccc_register);
+module_exit(mptcp_ccc_unregister);
+
+MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
+MODULE_VERSION("0.1");
diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
new file mode 100644 (file)
index 0000000..e19313a
--- /dev/null
@@ -0,0 +1,2893 @@
+/*
+ *     MPTCP implementation - MPTCP-control
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <net/inet_common.h>
+#include <net/inet6_hashtables.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_route.h>
+#include <net/mptcp_v6.h>
+#endif
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/tcp_states.h>
+#include <net/transp_v6.h>
+#include <net/xfrm.h>
+
+#include <linux/cryptohash.h>
+#include <linux/kconfig.h>
+#include <linux/module.h>
+#include <linux/netpoll.h>
+#include <linux/proc_fs.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/tcp.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/random.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <linux/atomic.h>
+#include <linux/sysctl.h>
+
+static struct kmem_cache *mptcp_sock_cache __read_mostly;
+static struct kmem_cache *mptcp_cb_cache __read_mostly;
+static struct kmem_cache *mptcp_tw_cache __read_mostly;
+
+int sysctl_mptcp_enabled __read_mostly = 1;
+int sysctl_mptcp_version __read_mostly = 0;
+static int min_mptcp_version;
+static int max_mptcp_version = 1;
+int sysctl_mptcp_checksum __read_mostly = 1;
+int sysctl_mptcp_debug __read_mostly;
+EXPORT_SYMBOL(sysctl_mptcp_debug);
+int sysctl_mptcp_syn_retries __read_mostly = 3;
+
+bool mptcp_init_failed __read_mostly;
+
+struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(mptcp_static_key);
+
+static int proc_mptcp_path_manager(struct ctl_table *ctl, int write,
+                                  void __user *buffer, size_t *lenp,
+                                  loff_t *ppos)
+{
+       char val[MPTCP_PM_NAME_MAX];
+       struct ctl_table tbl = {
+               .data = val,
+               .maxlen = MPTCP_PM_NAME_MAX,
+       };
+       int ret;
+
+       mptcp_get_default_path_manager(val);
+
+       ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+       if (write && ret == 0)
+               ret = mptcp_set_default_path_manager(val);
+       return ret;
+}
+
+static int proc_mptcp_scheduler(struct ctl_table *ctl, int write,
+                               void __user *buffer, size_t *lenp,
+                               loff_t *ppos)
+{
+       char val[MPTCP_SCHED_NAME_MAX];
+       struct ctl_table tbl = {
+               .data = val,
+               .maxlen = MPTCP_SCHED_NAME_MAX,
+       };
+       int ret;
+
+       mptcp_get_default_scheduler(val);
+
+       ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+       if (write && ret == 0)
+               ret = mptcp_set_default_scheduler(val);
+       return ret;
+}
+
+static struct ctl_table mptcp_table[] = {
+       {
+               .procname = "mptcp_enabled",
+               .data = &sysctl_mptcp_enabled,
+               .maxlen = sizeof(int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .procname = "mptcp_version",
+               .data = &sysctl_mptcp_version,
+               .mode = 0644,
+               .maxlen = sizeof(int),
+               .proc_handler = &proc_dointvec_minmax,
+               .extra1 = &min_mptcp_version,
+               .extra2 = &max_mptcp_version,
+       },
+       {
+               .procname = "mptcp_checksum",
+               .data = &sysctl_mptcp_checksum,
+               .maxlen = sizeof(int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .procname = "mptcp_debug",
+               .data = &sysctl_mptcp_debug,
+               .maxlen = sizeof(int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .procname = "mptcp_syn_retries",
+               .data = &sysctl_mptcp_syn_retries,
+               .maxlen = sizeof(int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec
+       },
+       {
+               .procname       = "mptcp_path_manager",
+               .mode           = 0644,
+               .maxlen         = MPTCP_PM_NAME_MAX,
+               .proc_handler   = proc_mptcp_path_manager,
+       },
+       {
+               .procname       = "mptcp_scheduler",
+               .mode           = 0644,
+               .maxlen         = MPTCP_SCHED_NAME_MAX,
+               .proc_handler   = proc_mptcp_scheduler,
+       },
+       { }
+};
+
+static inline u32 mptcp_hash_tk(u32 token)
+{
+       return token % MPTCP_HASH_SIZE;
+}
+
+struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
+EXPORT_SYMBOL(tk_hashtable);
+
+/* The following hash table is used to avoid token collisions */
+static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
+
+/* Lock protecting the two hash tables that hold the tokens, namely
+ * mptcp_reqsk_tk_htb and tk_hashtable.
+ */
+static spinlock_t mptcp_tk_hashlock;
+
+static bool mptcp_reqsk_find_tk(const u32 token)
+{
+       const u32 hash = mptcp_hash_tk(token);
+       const struct mptcp_request_sock *mtreqsk;
+       const struct hlist_nulls_node *node;
+
+begin:
+       hlist_nulls_for_each_entry_rcu(mtreqsk, node,
+                                      &mptcp_reqsk_tk_htb[hash], hash_entry) {
+               if (token == mtreqsk->mptcp_loc_token)
+                       return true;
+       }
+       /* A request-socket is destroyed by RCU, so it might have been
+        * recycled and put into another hash-table list. After the lookup
+        * we may therefore end up in a different list and need to restart.
+        *
+        * See also the comment in __inet_lookup_established.
+        */
+       if (get_nulls_value(node) != hash)
+               goto begin;
+       return false;
+}
+
+static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
+{
+       u32 hash = mptcp_hash_tk(token);
+
+       hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
+                                &mptcp_reqsk_tk_htb[hash]);
+}
+
+static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
+{
+       rcu_read_lock_bh();
+       spin_lock(&mptcp_tk_hashlock);
+       hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
+       spin_unlock(&mptcp_tk_hashlock);
+       rcu_read_unlock_bh();
+}
+
+void mptcp_reqsk_destructor(struct request_sock *req)
+{
+       if (!mptcp_rsk(req)->is_sub)
+               mptcp_reqsk_remove_tk(req);
+}
+
+static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
+{
+       u32 hash = mptcp_hash_tk(token);
+
+       hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
+       meta_tp->inside_tk_table = 1;
+}
+
+static bool mptcp_find_token(u32 token)
+{
+       const u32 hash = mptcp_hash_tk(token);
+       const struct tcp_sock *meta_tp;
+       const struct hlist_nulls_node *node;
+
+begin:
+       hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
+               if (token == meta_tp->mptcp_loc_token)
+                       return true;
+       }
+       /* A TCP-socket is destroyed by RCU, so it might have been recycled
+        * and put into another hash-table list. After the lookup we may
+        * therefore end up in a different list and need to restart.
+        *
+        * See also the comment in __inet_lookup_established.
+        */
+       if (get_nulls_value(node) != hash)
+               goto begin;
+       return false;
+}
+
+static void mptcp_set_key_reqsk(struct request_sock *req,
+                               const struct sk_buff *skb,
+                               u32 seed)
+{
+       const struct inet_request_sock *ireq = inet_rsk(req);
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
+                                                       ip_hdr(skb)->daddr,
+                                                       htons(ireq->ir_num),
+                                                       ireq->ir_rmt_port,
+                                                       seed);
+#if IS_ENABLED(CONFIG_IPV6)
+       } else {
+               mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
+                                                       ipv6_hdr(skb)->daddr.s6_addr32,
+                                                       htons(ireq->ir_num),
+                                                       ireq->ir_rmt_port,
+                                                       seed);
+#endif
+       }
+
+       mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
+}
+
+/* New MPTCP-connection request, prepare a new token for the meta-socket that
+ * will be created in mptcp_check_req_master(), and store the received token.
+ */
+static void mptcp_reqsk_new_mptcp(struct request_sock *req,
+                                 const struct sock *sk,
+                                 const struct mptcp_options_received *mopt,
+                                 const struct sk_buff *skb)
+{
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       inet_rsk(req)->saw_mpc = 1;
+       /* MPTCP version agreement: use the lowest version supported by both */
+       if (mopt->mptcp_ver >= tp->mptcp_ver)
+               mtreq->mptcp_ver = tp->mptcp_ver;
+       else
+               mtreq->mptcp_ver = mopt->mptcp_ver;
+
+       rcu_read_lock_bh();
+       spin_lock(&mptcp_tk_hashlock);
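+       /* Draw new keys until the derived token collides neither with a
+        * pending request nor with an established MPTCP connection.
+        */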
+       do {
+               mptcp_set_key_reqsk(req, skb, mptcp_seed++);
+       } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
+                mptcp_find_token(mtreq->mptcp_loc_token));
+       mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
+       spin_unlock(&mptcp_tk_hashlock);
+       rcu_read_unlock_bh();
+       mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+}
+
+static bool mptcp_reqsk_new_cookie(struct request_sock *req,
+                                 const struct mptcp_options_received *mopt,
+                                 const struct sk_buff *skb)
+{
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+
+       rcu_read_lock_bh();
+       spin_lock(&mptcp_tk_hashlock);
+
+       mptcp_set_key_reqsk(req, skb, tcp_rsk(req)->snt_isn);
+
+       if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
+           mptcp_find_token(mtreq->mptcp_loc_token)) {
+               spin_unlock(&mptcp_tk_hashlock);
+               rcu_read_unlock_bh();
+               return false;
+       }
+
+       inet_rsk(req)->saw_mpc = 1;
+
+       spin_unlock(&mptcp_tk_hashlock);
+       rcu_read_unlock_bh();
+
+       mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+
+       return true;
+}
+
+static void mptcp_set_key_sk(const struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       const struct inet_sock *isk = inet_sk(sk);
+
+       if (sk->sk_family == AF_INET)
+               tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
+                                                    isk->inet_daddr,
+                                                    isk->inet_sport,
+                                                    isk->inet_dport,
+                                                    mptcp_seed++);
+#if IS_ENABLED(CONFIG_IPV6)
+       else
+               tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
+                                                    sk->sk_v6_daddr.s6_addr32,
+                                                    isk->inet_sport,
+                                                    isk->inet_dport,
+                                                    mptcp_seed++);
+#endif
+
+       mptcp_key_sha1(tp->mptcp_loc_key,
+                      &tp->mptcp_loc_token, NULL);
+}
+
+#ifdef HAVE_JUMP_LABEL
+/* We are not allowed to call static_key_slow_dec() from irq context.
+ * If mptcp_enable/disable_static_key() is called from irq context,
+ * defer the static_key_slow_dec() calls.
+ */
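+/* Positive values count deferred enables, negative values deferred
+ * disables; they are replayed on the next call from process context.
+ */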
+static atomic_t mptcp_enable_deferred;
+#endif
+
+void mptcp_enable_static_key(void)
+{
+#ifdef HAVE_JUMP_LABEL
+       int deferred;
+
+       if (in_interrupt()) {
+               atomic_inc(&mptcp_enable_deferred);
+               return;
+       }
+
+       deferred = atomic_xchg(&mptcp_enable_deferred, 0);
+
+       if (deferred > 0) {
+               while (deferred--)
+                       static_key_slow_inc(&mptcp_static_key);
+       } else if (deferred < 0) {
+               /* Do exactly one dec less than necessary */
+               while (++deferred)
+                       static_key_slow_dec(&mptcp_static_key);
+               return;
+       }
+#endif
+       static_key_slow_inc(&mptcp_static_key);
+       WARN_ON(atomic_read(&mptcp_static_key.enabled) == 0);
+}
+
+void mptcp_disable_static_key(void)
+{
+#ifdef HAVE_JUMP_LABEL
+       int deferred;
+
+       if (in_interrupt()) {
+               atomic_dec(&mptcp_enable_deferred);
+               return;
+       }
+
+       deferred = atomic_xchg(&mptcp_enable_deferred, 0);
+
+       if (deferred > 0) {
+               /* Do exactly one inc less than necessary */
+               while (--deferred)
+                       static_key_slow_inc(&mptcp_static_key);
+               return;
+       } else if (deferred < 0) {
+               while (deferred++)
+                       static_key_slow_dec(&mptcp_static_key);
+       }
+#endif
+       static_key_slow_dec(&mptcp_static_key);
+}
+
+void mptcp_enable_sock(struct sock *sk)
+{
+       if (!sock_flag(sk, SOCK_MPTCP)) {
+               sock_set_flag(sk, SOCK_MPTCP);
+               tcp_sk(sk)->mptcp_ver = sysctl_mptcp_version;
+
+               /* Necessary here, because MPTCP can be enabled/disabled through
+                * a setsockopt.
+                */
+               if (sk->sk_family == AF_INET)
+                       inet_csk(sk)->icsk_af_ops = &mptcp_v4_specific;
+#if IS_ENABLED(CONFIG_IPV6)
+               else if (mptcp_v6_is_v4_mapped(sk))
+                       inet_csk(sk)->icsk_af_ops = &mptcp_v6_mapped;
+               else
+                       inet_csk(sk)->icsk_af_ops = &mptcp_v6_specific;
+#endif
+
+               mptcp_enable_static_key();
+       }
+}
+
+void mptcp_disable_sock(struct sock *sk)
+{
+       if (sock_flag(sk, SOCK_MPTCP)) {
+               sock_reset_flag(sk, SOCK_MPTCP);
+
+               /* Necessary here, because MPTCP can be enabled/disabled through
+                * a setsockopt.
+                */
+               if (sk->sk_family == AF_INET)
+                       inet_csk(sk)->icsk_af_ops = &ipv4_specific;
+#if IS_ENABLED(CONFIG_IPV6)
+               else if (mptcp_v6_is_v4_mapped(sk))
+                       inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
+               else
+                       inet_csk(sk)->icsk_af_ops = &ipv6_specific;
+#endif
+
+               mptcp_disable_static_key();
+       }
+}
+
+void mptcp_connect_init(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       rcu_read_lock_bh();
+       spin_lock(&mptcp_tk_hashlock);
+       do {
+               mptcp_set_key_sk(sk);
+       } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
+                mptcp_find_token(tp->mptcp_loc_token));
+
+       __mptcp_hash_insert(tp, tp->mptcp_loc_token);
+       spin_unlock(&mptcp_tk_hashlock);
+       rcu_read_unlock_bh();
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE);
+}
+
+/**
+ * This function takes a reference on the meta-socket (its sk_refcnt,
+ * not on the mpcb itself). It is the responsibility of the caller to
+ * drop that reference when done with the socket.
+ */
+struct sock *mptcp_hash_find(const struct net *net, const u32 token)
+{
+       const u32 hash = mptcp_hash_tk(token);
+       const struct tcp_sock *meta_tp;
+       struct sock *meta_sk = NULL;
+       const struct hlist_nulls_node *node;
+
+       rcu_read_lock_bh();
+begin:
+       hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
+                                      tk_table) {
+               meta_sk = (struct sock *)meta_tp;
+               if (token == meta_tp->mptcp_loc_token &&
+                   net_eq(net, sock_net(meta_sk))) {
+                       if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt)))
+                               goto out;
+                       if (unlikely(token != meta_tp->mptcp_loc_token ||
+                                    !net_eq(net, sock_net(meta_sk)))) {
+                               sock_gen_put(meta_sk);
+                               goto begin;
+                       }
+                       goto found;
+               }
+       }
+       /* A TCP-socket is destroyed by RCU, so it might have been recycled
+        * and put into another hash-table list. After the lookup we may
+        * therefore end up in a different list and need to restart.
+        *
+        * See also the comment in __inet_lookup_established.
+        */
+       if (get_nulls_value(node) != hash)
+               goto begin;
+out:
+       meta_sk = NULL;
+found:
+       rcu_read_unlock_bh();
+       return meta_sk;
+}
+
+void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
+{
+       /* remove from the token hashtable */
+       rcu_read_lock_bh();
+       spin_lock(&mptcp_tk_hashlock);
+       hlist_nulls_del_init_rcu(&meta_tp->tk_table);
+       meta_tp->inside_tk_table = 0;
+       spin_unlock(&mptcp_tk_hashlock);
+       rcu_read_unlock_bh();
+}
+
+struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
+{
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sock *sk, *rttsk = NULL, *lastsk = NULL;
+       u32 min_time = 0, last_active = 0;
+
+       mptcp_for_each_sk(meta_tp->mpcb, sk) {
+               struct tcp_sock *tp = tcp_sk(sk);
+               u32 elapsed;
+
+               if (!mptcp_sk_can_send_ack(sk) || tp->pf)
+                       continue;
+
+               elapsed = keepalive_time_elapsed(tp);
+
+               /* We take the one with the lowest RTT within a reasonable
+                * (meta-RTO)-timeframe
+                */
+               if (elapsed < inet_csk(meta_sk)->icsk_rto) {
+                       if (!min_time || tp->srtt_us < min_time) {
+                               min_time = tp->srtt_us;
+                               rttsk = sk;
+                       }
+                       continue;
+               }
+
+               /* Otherwise, we just take the most recent active */
+               if (!rttsk && (!last_active || elapsed < last_active)) {
+                       last_active = elapsed;
+                       lastsk = sk;
+               }
+       }
+
+       if (rttsk)
+               return rttsk;
+
+       return lastsk;
+}
+EXPORT_SYMBOL(mptcp_select_ack_sock);
+
+static void mptcp_sock_def_error_report(struct sock *sk)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (!sock_flag(sk, SOCK_DEAD)) {
+               if (tp->send_mp_fclose && sk->sk_err == ETIMEDOUT) {
+                       /* Called by the keep alive timer (tcp_write_timeout),
+                        * when the limit of fastclose retransmissions has been
+                        * reached. Send a TCP RST to clear the status of any
+                        * stateful firewall (typically conntrack) that is
+                        * not aware of MPTCP and cannot understand the
+                        * fastclose option.
+                        */
+                       tp->ops->send_active_reset(sk, GFP_ATOMIC);
+               }
+       }
+
+       if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
+           mpcb->send_infinite_mapping) {
+               meta_sk->sk_err = sk->sk_err;
+               meta_sk->sk_err_soft = sk->sk_err_soft;
+
+               if (!sock_flag(meta_sk, SOCK_DEAD))
+                       meta_sk->sk_error_report(meta_sk);
+
+               WARN(meta_sk->sk_state == TCP_CLOSE,
+                    "Meta already closed i_rcv %u i_snd %u send_i %u flags %#lx\n",
+                    mpcb->infinite_mapping_rcv, mpcb->infinite_mapping_snd,
+                    mpcb->send_infinite_mapping, meta_sk->sk_flags);
+
+               if (meta_sk->sk_state != TCP_CLOSE)
+                       tcp_done(meta_sk);
+       }
+
+       if (mpcb->pm_ops->subflow_error)
+               mpcb->pm_ops->subflow_error(meta_sk, sk);
+
+       sk->sk_err = 0;
+}
+
+static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
+{
+       if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
+               mptcp_cleanup_path_manager(mpcb);
+               mptcp_cleanup_scheduler(mpcb);
+               kfree(mpcb->master_info);
+               kmem_cache_free(mptcp_cb_cache, mpcb);
+       }
+}
+
+void mptcp_sock_destruct(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (!is_meta_sk(sk) && !tp->was_meta_sk) {
+               WARN_ON(!hlist_unhashed(&tp->mptcp->cb_list));
+
+               kmem_cache_free(mptcp_sock_cache, tp->mptcp);
+               tp->mptcp = NULL;
+
+               /* Taken when mpcb pointer was set */
+               sock_put(mptcp_meta_sk(sk));
+               mptcp_mpcb_put(tp->mpcb);
+       } else {
+               struct mptcp_cb *mpcb = tp->mpcb;
+               struct mptcp_tw *mptw;
+
+               /* The mpcb is disappearing - we can make the final
+                * update to the rcv_nxt of the time-wait-sock and remove
+                * its reference to the mpcb.
+                */
+               spin_lock_bh(&mpcb->tw_lock);
+               list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
+                       list_del_rcu(&mptw->list);
+                       mptw->in_list = 0;
+                       mptcp_mpcb_put(mpcb);
+                       rcu_assign_pointer(mptw->mpcb, NULL);
+               }
+               spin_unlock_bh(&mpcb->tw_lock);
+
+               mptcp_mpcb_put(mpcb);
+
+               mptcp_debug("%s destroying meta-sk\n", __func__);
+       }
+
+       WARN_ON(!static_key_false(&mptcp_static_key));
+
+       /* Must be called here, because this will decrement the jump-label. */
+       inet_sock_destruct(sk);
+}
+
+void mptcp_destroy_sock(struct sock *sk)
+{
+       if (is_meta_sk(sk)) {
+               struct sock *sk_it, *tmpsk;
+
+               __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
+
+               /* We have to close all remaining subflows. Normally, they
+                * should all be about to get closed. But, if the kernel is
+                * forcing a closure (e.g., tcp_write_err), the subflows might
+                * not have been closed properly (as we are waiting for the
+                * DATA_ACK of the DATA_FIN).
+                */
+               mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
+                       /* tcp_close has already been called - we are waiting
+                        * for the graceful closure, or we are retransmitting
+                        * fast-close on the subflow. The reset (or timeout)
+                        * will kill the subflow.
+                        */
+                       if (tcp_sk(sk_it)->closing ||
+                           tcp_sk(sk_it)->send_mp_fclose)
+                               continue;
+
+                       /* Let the pending delayed work run first, to prevent the time-wait state */
+                       if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
+                               continue;
+
+                       mptcp_sub_close(sk_it, 0);
+               }
+       } else {
+               mptcp_del_sock(sk);
+       }
+}
+
+static void mptcp_set_state(struct sock *sk)
+{
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+
+       /* Meta is not yet established - wake up the application */
+       if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
+           sk->sk_state == TCP_ESTABLISHED) {
+               tcp_set_state(meta_sk, TCP_ESTABLISHED);
+
+               if (!sock_flag(meta_sk, SOCK_DEAD)) {
+                       meta_sk->sk_state_change(meta_sk);
+                       sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
+               }
+
+               tcp_sk(meta_sk)->lsndtime = tcp_jiffies32;
+       }
+
+       if (sk->sk_state == TCP_ESTABLISHED) {
+               tcp_sk(sk)->mptcp->establish_increased = 1;
+               tcp_sk(sk)->mpcb->cnt_established++;
+       }
+
+       if (sk->sk_state == TCP_CLOSE) {
+               if (!sock_flag(sk, SOCK_DEAD))
+                       mptcp_sub_close(sk, 0);
+       }
+}
+
+static void mptcp_assign_congestion_control(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
+       const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
+
+       /* Congestion control is the same as meta. Thus, it has been
+        * try_module_get'd by tcp_assign_congestion_control.
+        */
+       if (icsk->icsk_ca_ops == ca)
+               return;
+
+       /* Use the same congestion control as set on the meta-sk */
+       if (!try_module_get(ca->owner)) {
+               /* This should never happen. The congestion control is linked
+                * to the meta-socket (through tcp_assign_congestion_control)
+                * who "holds" the refcnt on the module.
+                */
+               WARN(1, "Could not get the congestion control!");
+               return;
+       }
+       icsk->icsk_ca_ops = ca;
+
+       /* Clear out private data before diag gets at it, since
+        * the ca has not been initialized yet.
+        */
+       if (ca->get_info)
+               memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+}
+
+siphash_key_t mptcp_secret __read_mostly;
+u32 mptcp_seed = 0;
+
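+/* Derive the 32-bit token and the 64-bit initial data sequence number from
+ * the 64-bit key, as specified by RFC 6824: the token is the most
+ * significant 32 bits of SHA-1(key), the IDSN the least significant 64
+ * bits. The input block is the key followed by standard SHA-1 padding.
+ */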
+void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
+{
+       u32 workspace[SHA_WORKSPACE_WORDS];
+       u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
+       u8 input[64];
+       int i;
+
+       memset(workspace, 0, sizeof(workspace));
+
+       /* Initialize input with appropriate padding */
+       memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
+                                                  * is explicitly set too
+                                                  */
+       memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
+       input[8] = 0x80; /* Padding: First bit after message = 1 */
+       input[63] = 0x40; /* Padding: Length of the message = 64 bits */
+
+       sha_init(mptcp_hashed_key);
+       sha_transform(mptcp_hashed_key, input, workspace);
+
+       for (i = 0; i < 5; i++)
+               mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
+
+       if (token)
+               *token = mptcp_hashed_key[0];
+       if (idsn)
+               *idsn = *((u64 *)&mptcp_hashed_key[3]);
+}
+
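+/* HMAC-SHA1 per RFC 2104, with the two 64-bit keys concatenated as the
+ * HMAC key: the key is XORed with ipad (0x36) and opad (0x5c) into fixed
+ * 512-bit blocks, and the SHA-1 padding and length bytes are precomputed,
+ * since the message is known to fit into a single block.
+ */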
+void mptcp_hmac_sha1(const u8 *key_1, const u8 *key_2, u32 *hash_out,
+                    int arg_num, ...)
+{
+       u32 workspace[SHA_WORKSPACE_WORDS];
+       u8 input[128]; /* 2 512-bit blocks */
+       int i;
+       int index;
+       int length;
+       u8 *msg;
+       va_list list;
+
+       memset(workspace, 0, sizeof(workspace));
+
+       /* Generate key xored with ipad */
+       memset(input, 0x36, 64);
+       for (i = 0; i < 8; i++)
+               input[i] ^= key_1[i];
+       for (i = 0; i < 8; i++)
+               input[i + 8] ^= key_2[i];
+
+       va_start(list, arg_num);
+       index = 64;
+       for (i = 0; i < arg_num; i++) {
+               length = va_arg(list, int);
+               msg = va_arg(list, u8 *);
+               WARN_ON(index + length > 125); /* Message is too long */
+               memcpy(&input[index], msg, length);
+               index += length;
+       }
+       va_end(list);
+
+       input[index] = 0x80; /* Padding: First bit after message = 1 */
+       memset(&input[index + 1], 0, (126 - index));
+
+       /* Padding: Length of the message = 512 + message length (bits) */
+       input[126] = 0x02;
+       input[127] = ((index - 64) * 8); /* Message length (bits) */
+
+       sha_init(hash_out);
+       sha_transform(hash_out, input, workspace);
+       memset(workspace, 0, sizeof(workspace));
+
+       sha_transform(hash_out, &input[64], workspace);
+       memset(workspace, 0, sizeof(workspace));
+
+       for (i = 0; i < 5; i++)
+               hash_out[i] = cpu_to_be32(hash_out[i]);
+
+       /* Prepare second part of hmac */
+       memset(input, 0x5C, 64);
+       for (i = 0; i < 8; i++)
+               input[i] ^= key_1[i];
+       for (i = 0; i < 8; i++)
+               input[i + 8] ^= key_2[i];
+
+       memcpy(&input[64], hash_out, 20);
+       input[84] = 0x80;
+       memset(&input[85], 0, 41);
+
+       /* Padding: Length of the message = 512 + 160 bits */
+       input[126] = 0x02;
+       input[127] = 0xA0;
+
+       sha_init(hash_out);
+       sha_transform(hash_out, input, workspace);
+       memset(workspace, 0, sizeof(workspace));
+
+       sha_transform(hash_out, &input[64], workspace);
+
+       for (i = 0; i < 5; i++)
+               hash_out[i] = cpu_to_be32(hash_out[i]);
+}
+EXPORT_SYMBOL(mptcp_hmac_sha1);
+
+static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
+{
+       /* Socket-options handled by sk_clone_lock while creating the meta-sk.
+        * ======
+        * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
+        * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
+        * TCP_NODELAY, TCP_CORK
+        *
+        * Socket-options handled in this function here
+        * ======
+        * TCP_DEFER_ACCEPT
+        * SO_KEEPALIVE
+        *
+        * Socket-options on the todo-list
+        * ======
+        * SO_BINDTODEVICE - should probably prevent creation of new subsocks
+        *                   across other devices. - what about the api-draft?
+        * SO_DEBUG
+        * SO_REUSEADDR - probably we don't care about this
+        * SO_DONTROUTE, SO_BROADCAST
+        * SO_OOBINLINE
+        * SO_LINGER
+        * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
+        * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
+        * SO_RXQ_OVFL
+        * TCP_COOKIE_TRANSACTIONS
+        * TCP_MAXSEG
+        * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
+        *              in mptcp_meta_retransmit_timer. AND we need to check
+        *              what is about the subsockets.
+        * TCP_LINGER2
+        * TCP_WINDOW_CLAMP
+        * TCP_USER_TIMEOUT
+        * TCP_MD5SIG
+        *
+        * Socket-options of no concern for the meta-socket (but for the subsocket)
+        * ======
+        * SO_PRIORITY
+        * SO_MARK
+        * TCP_CONGESTION
+        * TCP_SYNCNT
+        * TCP_QUICKACK
+        */
+
+       /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
+       inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
+
+       /* Keepalives are handled entirely at the MPTCP-layer */
+       if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
+               inet_csk_reset_keepalive_timer(meta_sk,
+                                              keepalive_time_when(tcp_sk(meta_sk)));
+               sock_reset_flag(master_sk, SOCK_KEEPOPEN);
+               inet_csk_delete_keepalive_timer(master_sk);
+       }
+
+       /* Do not propagate subflow-errors up to the MPTCP-layer */
+       inet_sk(master_sk)->recverr = 0;
+}
+
+static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
+{
+       /* IP_TOS also goes to the subflow. */
+       if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
+               inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
+               sub_sk->sk_priority = meta_sk->sk_priority;
+               sk_dst_reset(sub_sk);
+       }
+
+       /* Inherit SO_REUSEADDR */
+       sub_sk->sk_reuse = meta_sk->sk_reuse;
+
+       /* Inherit SO_MARK: can be used for routing or filtering */
+       sub_sk->sk_mark = meta_sk->sk_mark;
+
+       /* Inherit snd/rcv-buffer locks */
+       sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+       /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
+       tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+
+       /* Keepalives are handled entirely at the MPTCP-layer */
+       if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
+               sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
+               inet_csk_delete_keepalive_timer(sub_sk);
+       }
+
+       /* Do not propagate subflow-errors up to the MPTCP-layer */
+       inet_sk(sub_sk)->recverr = 0;
+}
+
+int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
+{
+       /* skb->sk may be NULL if we receive a packet immediately after the
+        * SYN/ACK + MP_CAPABLE.
+        */
+       struct sock *sk = skb->sk ? skb->sk : meta_sk;
+       int ret = 0;
+
+       skb->sk = NULL;
+
+       if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) {
+               kfree_skb(skb);
+               return 0;
+       }
+
+       if (sk->sk_family == AF_INET)
+               ret = tcp_v4_do_rcv(sk, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+       else
+               ret = tcp_v6_do_rcv(sk, skb);
+#endif
+
+       sock_put(sk);
+       return ret;
+}
+
+struct lock_class_key meta_key;
+char *meta_key_name = "sk_lock-AF_INET-MPTCP";
+struct lock_class_key meta_slock_key;
+char *meta_slock_key_name = "slock-AF_INET-MPTCP";
+
+static const struct tcp_sock_ops mptcp_meta_specific = {
+       .__select_window                = __mptcp_select_window,
+       .select_window                  = mptcp_select_window,
+       .select_initial_window          = mptcp_select_initial_window,
+       .select_size                    = mptcp_select_size,
+       .init_buffer_space              = mptcp_init_buffer_space,
+       .set_rto                        = mptcp_tcp_set_rto,
+       .should_expand_sndbuf           = mptcp_should_expand_sndbuf,
+       .send_fin                       = mptcp_send_fin,
+       .write_xmit                     = mptcp_write_xmit,
+       .send_active_reset              = mptcp_send_active_reset,
+       .write_wakeup                   = mptcp_write_wakeup,
+       .retransmit_timer               = mptcp_meta_retransmit_timer,
+       .time_wait                      = mptcp_time_wait,
+       .cleanup_rbuf                   = mptcp_cleanup_rbuf,
+};
+
+static const struct tcp_sock_ops mptcp_sub_specific = {
+       .__select_window                = __mptcp_select_window,
+       .select_window                  = mptcp_select_window,
+       .select_initial_window          = mptcp_select_initial_window,
+       .select_size                    = mptcp_select_size,
+       .init_buffer_space              = mptcp_init_buffer_space,
+       .set_rto                        = mptcp_tcp_set_rto,
+       .should_expand_sndbuf           = mptcp_should_expand_sndbuf,
+       .send_fin                       = tcp_send_fin,
+       .write_xmit                     = tcp_write_xmit,
+       .send_active_reset              = tcp_send_active_reset,
+       .write_wakeup                   = tcp_write_wakeup,
+       .retransmit_timer               = mptcp_sub_retransmit_timer,
+       .time_wait                      = tcp_time_wait,
+       .cleanup_rbuf                   = tcp_cleanup_rbuf,
+};
+
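+/* Create the MPTCP control-block (mpcb) for meta_sk. The meta-socket stays
+ * the one visible to the application, while the master-subflow is cloned
+ * from it and carries the actual first TCP connection. The initial
+ * data-sequence-numbers for both directions are derived from the local and
+ * remote keys exchanged in the MP_CAPABLE handshake.
+ */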
+static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key,
+                           __u8 mptcp_ver, u32 window)
+{
+       struct mptcp_cb *mpcb;
+       struct sock *master_sk;
+       struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
+       struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
+       u64 idsn;
+
+       dst_release(meta_sk->sk_rx_dst);
+       meta_sk->sk_rx_dst = NULL;
+       /* This flag tells sock_lock_init to reclassify
+        * the lock-class of the master socket.
+        */
+       meta_tp->is_master_sk = 1;
+       master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
+       meta_tp->is_master_sk = 0;
+       if (!master_sk)
+               return -ENOBUFS;
+
+       master_tp = tcp_sk(master_sk);
+
+       mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
+       if (!mpcb) {
+               /* sk_free (and __sk_free) requires wmem_alloc to be 1.
+                * All the rest is set to 0 thanks to __GFP_ZERO above.
+                */
+               refcount_set(&master_sk->sk_wmem_alloc, 1);
+               sk_free(master_sk);
+               return -ENOBUFS;
+       }
+
+#if IS_ENABLED(CONFIG_IPV6)
+       if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
+               struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
+
+               inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
+
+               newnp = inet6_sk(master_sk);
+               memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+               newnp->ipv6_mc_list = NULL;
+               newnp->ipv6_ac_list = NULL;
+               newnp->ipv6_fl_list = NULL;
+               newnp->opt = NULL;
+               newnp->pktoptions = NULL;
+               (void)xchg(&newnp->rxpmtu, NULL);
+       } else if (meta_sk->sk_family == AF_INET6) {
+               struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
+
+               inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
+
+               newnp = inet6_sk(master_sk);
+               memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+
+               newnp->hop_limit        = -1;
+               newnp->mcast_hops       = IPV6_DEFAULT_MCASTHOPS;
+               newnp->mc_loop  = 1;
+               newnp->pmtudisc = IPV6_PMTUDISC_WANT;
+               master_sk->sk_ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
+       }
+#endif
+
+       meta_tp->mptcp = NULL;
+
+       /* Store the mptcp version agreed upon in the initial handshake */
+       mpcb->mptcp_ver = mptcp_ver;
+
+       /* Store the keys and generate the peer's token */
+       mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
+       mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
+
+       /* Generate Initial data-sequence-numbers */
+       mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
+       idsn = ntohll(idsn) + 1;
+       mpcb->snd_high_order[0] = idsn >> 32;
+       mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
+
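+       /* The data-sequence space is 64 bits wide, while write_seq only
+        * carries the lower 32 bits of the IDSN. The snd_high_order[]
+        * entries keep the upper 32 bits on either side of a wrap of the
+        * lower half, so the full 64-bit DSN can be reconstructed later.
+        */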
+       meta_tp->write_seq = (u32)idsn;
+       meta_tp->snd_sml = meta_tp->write_seq;
+       meta_tp->snd_una = meta_tp->write_seq;
+       meta_tp->snd_nxt = meta_tp->write_seq;
+       meta_tp->pushed_seq = meta_tp->write_seq;
+       meta_tp->snd_up = meta_tp->write_seq;
+
+       mpcb->mptcp_rem_key = remote_key;
+       mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
+       idsn = ntohll(idsn) + 1;
+       mpcb->rcv_high_order[0] = idsn >> 32;
+       mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
+       meta_tp->copied_seq = (u32) idsn;
+       meta_tp->rcv_nxt = (u32) idsn;
+       meta_tp->rcv_wup = (u32) idsn;
+
+       meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
+       meta_tp->snd_wnd = window;
+       meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
+
+       meta_tp->packets_out = 0;
+       meta_icsk->icsk_probes_out = 0;
+
+       /* Set mptcp-pointers */
+       master_tp->mpcb = mpcb;
+       master_tp->meta_sk = meta_sk;
+       meta_tp->mpcb = mpcb;
+       meta_tp->meta_sk = meta_sk;
+       mpcb->meta_sk = meta_sk;
+       mpcb->master_sk = master_sk;
+
+       meta_tp->was_meta_sk = 0;
+
+       /* Initialize the queues */
+       skb_queue_head_init(&mpcb->reinject_queue);
+       master_tp->out_of_order_queue = RB_ROOT;
+       INIT_LIST_HEAD(&master_tp->tsq_node);
+
+       master_sk->sk_tsq_flags = 0;
+
+       mutex_init(&mpcb->mpcb_mutex);
+
+       /* Init the accept_queue structure. We support a queue of 32 pending
+        * connections; it does not need to be huge, since we only store
+        * pending subflow creations here.
+        */
+       reqsk_queue_alloc(&meta_icsk->icsk_accept_queue);
+       meta_sk->sk_max_ack_backlog = 32;
+       meta_sk->sk_ack_backlog = 0;
+
+       if (!sock_flag(meta_sk, SOCK_MPTCP)) {
+               mptcp_enable_static_key();
+               sock_set_flag(meta_sk, SOCK_MPTCP);
+       }
+
+       /* Redefine function-pointers as the meta-sk is now fully ready */
+       meta_tp->mpc = 1;
+       meta_tp->ops = &mptcp_meta_specific;
+
+       meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
+       meta_sk->sk_destruct = mptcp_sock_destruct;
+
+       /* Meta-level retransmit timer */
+       meta_icsk->icsk_rto *= 2; /* Double the initial RTO */
+
+       tcp_init_xmit_timers(master_sk);
+       /* Has been set for sending out the SYN */
+       inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
+
+       if (!meta_tp->inside_tk_table) {
+               /* Adding the meta_tp to the token hashtable - coming from server-side */
+               rcu_read_lock_bh();
+               spin_lock(&mptcp_tk_hashlock);
+
+               /* With lockless listeners, we might process two ACKs at the
+                * same time. With TCP, inet_csk_complete_hashdance takes care
+                * of this. But, for MPTCP this would be too late if we added
+                * this MPTCP-socket to the token table (new subflows might
+                * come in and match on this socket here).
+                * So, we need to check if someone else already added the token
+                * and revert in that case - the other side won the race...
+                */
+               if (mptcp_find_token(mpcb->mptcp_loc_token)) {
+                       spin_unlock(&mptcp_tk_hashlock);
+                       rcu_read_unlock_bh();
+
+                       inet_put_port(master_sk);
+                       kmem_cache_free(mptcp_cb_cache, mpcb);
+                       sk_free(master_sk);
+
+                       return -ENOBUFS;
+               }
+               __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
+
+               spin_unlock(&mptcp_tk_hashlock);
+               rcu_read_unlock_bh();
+       }
+       master_tp->inside_tk_table = 0;
+
+       /* Init time-wait stuff */
+       INIT_LIST_HEAD(&mpcb->tw_list);
+       spin_lock_init(&mpcb->tw_lock);
+
+       INIT_HLIST_HEAD(&mpcb->callback_list);
+
+       mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
+
+       mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
+       mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
+       mpcb->orig_window_clamp = meta_tp->window_clamp;
+
+       /* The meta is directly linked - set refcnt to 1 */
+       atomic_set(&mpcb->mpcb_refcnt, 1);
+
+       mptcp_init_path_manager(mpcb);
+       mptcp_init_scheduler(mpcb);
+
+       if (!try_module_get(inet_csk(master_sk)->icsk_ca_ops->owner))
+               tcp_assign_congestion_control(master_sk);
+
+       master_tp->saved_syn = NULL;
+
+       mptcp_debug("%s: created mpcb with token %#x\n",
+                   __func__, mpcb->mptcp_loc_token);
+
+       return 0;
+}
+
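+/* Release the control-block again, e.g. when attaching the master-subflow
+ * failed and the connection cannot proceed as MPTCP.
+ */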
+void mptcp_fallback_meta_sk(struct sock *meta_sk)
+{
+       kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
+}
+
+int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
+                  gfp_t flags)
+{
+       struct mptcp_cb *mpcb   = tcp_sk(meta_sk)->mpcb;
+       struct tcp_sock *tp     = tcp_sk(sk);
+
+       tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
+       if (!tp->mptcp)
+               return -ENOMEM;
+
+       tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
+       /* No more space for more subflows? */
+       if (!tp->mptcp->path_index) {
+               kmem_cache_free(mptcp_sock_cache, tp->mptcp);
+               return -EPERM;
+       }
+
+       INIT_HLIST_NODE(&tp->mptcp->cb_list);
+
+       tp->mptcp->tp = tp;
+       tp->mpcb = mpcb;
+       tp->meta_sk = meta_sk;
+
+       if (!sock_flag(sk, SOCK_MPTCP)) {
+               mptcp_enable_static_key();
+               sock_set_flag(sk, SOCK_MPTCP);
+       }
+
+       tp->mpc = 1;
+       tp->ops = &mptcp_sub_specific;
+
+       tp->mptcp->loc_id = loc_id;
+       tp->mptcp->rem_id = rem_id;
+       if (mpcb->sched_ops->init)
+               mpcb->sched_ops->init(sk);
+
+       /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
+        * included in mptcp_del_sock(), because the mpcb must remain alive
+        * until the last subsocket is completely destroyed.
+        */
+       sock_hold(meta_sk);
+       atomic_inc(&mpcb->mpcb_refcnt);
+
+       tp->mptcp->next = mpcb->connection_list;
+       mpcb->connection_list = tp;
+       tp->mptcp->attached = 1;
+
+       mpcb->cnt_subflows++;
+       atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
+                  &meta_sk->sk_rmem_alloc);
+
+       mptcp_sub_inherit_sockopts(meta_sk, sk);
+       INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
+
+       /* Properly inherit CC from the meta-socket */
+       mptcp_assign_congestion_control(sk);
+
+       /* As we successfully allocated the mptcp_tcp_sock, we have to
+        * change the function-pointers here (for sk_destruct to work correctly)
+        */
+       sk->sk_error_report = mptcp_sock_def_error_report;
+       sk->sk_data_ready = mptcp_data_ready;
+       sk->sk_write_space = mptcp_write_space;
+       sk->sk_state_change = mptcp_set_state;
+       sk->sk_destruct = mptcp_sock_destruct;
+
+       if (sk->sk_family == AF_INET)
+               mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
+                           __func__, mpcb->mptcp_loc_token,
+                           tp->mptcp->path_index,
+                           &((struct inet_sock *)tp)->inet_saddr,
+                           ntohs(((struct inet_sock *)tp)->inet_sport),
+                           &((struct inet_sock *)tp)->inet_daddr,
+                           ntohs(((struct inet_sock *)tp)->inet_dport),
+                           mpcb->cnt_subflows);
+#if IS_ENABLED(CONFIG_IPV6)
+       else
+               mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
+                           __func__, mpcb->mptcp_loc_token,
+                           tp->mptcp->path_index, &inet6_sk(sk)->saddr,
+                           ntohs(((struct inet_sock *)tp)->inet_sport),
+                           &sk->sk_v6_daddr,
+                           ntohs(((struct inet_sock *)tp)->inet_dport),
+                           mpcb->cnt_subflows);
+#endif
+
+       return 0;
+}
+
+void mptcp_del_sock(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
+       struct mptcp_cb *mpcb;
+
+       if (!tp->mptcp || !tp->mptcp->attached)
+               return;
+
+       mpcb = tp->mpcb;
+       tp_prev = mpcb->connection_list;
+
+       if (mpcb->sched_ops->release)
+               mpcb->sched_ops->release(sk);
+
+       if (mpcb->pm_ops->delete_subflow)
+               mpcb->pm_ops->delete_subflow(sk);
+
+       mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
+                   __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
+                   sk->sk_state, is_meta_sk(sk));
+
+       if (tp_prev == tp) {
+               mpcb->connection_list = tp->mptcp->next;
+       } else {
+               for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
+                       if (tp_prev->mptcp->next == tp) {
+                               tp_prev->mptcp->next = tp->mptcp->next;
+                               break;
+                       }
+               }
+       }
+       mpcb->cnt_subflows--;
+       if (tp->mptcp->establish_increased)
+               mpcb->cnt_established--;
+
+       tp->mptcp->next = NULL;
+       tp->mptcp->attached = 0;
+       mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
+
+       if (!skb_queue_empty(&sk->sk_write_queue))
+               mptcp_reinject_data(sk, 0);
+
+       if (is_master_tp(tp)) {
+               struct sock *meta_sk = mptcp_meta_sk(sk);
+               struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+               if (meta_tp->record_master_info &&
+                   !sock_flag(meta_sk, SOCK_DEAD)) {
+                       mpcb->master_info = kmalloc(sizeof(*mpcb->master_info),
+                                                   GFP_ATOMIC);
+
+                       if (mpcb->master_info)
+                               tcp_get_info(sk, mpcb->master_info);
+               }
+
+               mpcb->master_sk = NULL;
+       } else if (tp->mptcp->pre_established) {
+               sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
+       }
+
+       rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
+}
+
+/* Updates the MPTCP-session based on path-manager information (e.g., addresses,
+ * low-prio flows,...).
+ */
+void mptcp_update_metasocket(const struct sock *meta_sk)
+{
+       if (tcp_sk(meta_sk)->mpcb->pm_ops->new_session)
+               tcp_sk(meta_sk)->mpcb->pm_ops->new_session(meta_sk);
+}
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary.  COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far; it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ * (inspired by tcp_cleanup_rbuf())
+ */
+void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sock *sk;
+       bool recheck_rcv_window = false;
+       __u32 rcv_window_now = 0;
+
+       if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
+               rcv_window_now = tcp_receive_window(meta_tp);
+
+               /* Optimize, __mptcp_select_window() is not cheap. */
+               if (2 * rcv_window_now <= meta_tp->window_clamp)
+                       recheck_rcv_window = true;
+       }
+
+       mptcp_for_each_sk(meta_tp->mpcb, sk) {
+               struct tcp_sock *tp = tcp_sk(sk);
+               const struct inet_connection_sock *icsk = inet_csk(sk);
+
+               if (!mptcp_sk_can_send_ack(sk))
+                       continue;
+
+               if (!inet_csk_ack_scheduled(sk))
+                       goto second_part;
+               /* Delayed ACKs frequently hit locked sockets during bulk
+                * receive.
+                */
+               if (icsk->icsk_ack.blocked ||
+                   /* Once-per-two-segments ACK was not sent by tcp_input.c */
+                   tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
+                   /* If this read emptied read buffer, we send ACK, if
+                    * connection is not bidirectional, user drained
+                    * receive buffer and there was a small segment
+                    * in queue.
+                    */
+                   (copied > 0 &&
+                    ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
+                     ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
+                      !icsk->icsk_ack.pingpong)) &&
+                    !atomic_read(&meta_sk->sk_rmem_alloc))) {
+                       tcp_send_ack(sk);
+                       continue;
+               }
+
+second_part:
+               /* This is the second part of tcp_cleanup_rbuf */
+               if (recheck_rcv_window) {
+                       __u32 new_window = tp->ops->__select_window(sk);
+
+                       /* Send ACK now, if this read freed lots of space
+                        * in our buffer. We can advertise the new window now,
+                        * if it is not smaller than the current one.
+                        * "Lots" means "at least twice" here.
+                        */
+                       if (new_window && new_window >= 2 * rcv_window_now)
+                               tcp_send_ack(sk);
+               }
+       }
+}
+
+static int mptcp_sub_send_fin(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *skb = tcp_write_queue_tail(sk);
+       int mss_now;
+
+       /* Optimization, tack on the FIN if we have a queue of
+        * unsent frames.  But be careful about outgoing SACKS
+        * and IP options.
+        */
+       mss_now = tcp_current_mss(sk);
+
+       if (tcp_send_head(sk) != NULL) {
+               TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
+               TCP_SKB_CB(skb)->end_seq++;
+               tp->write_seq++;
+       } else {
+               skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
+               if (!skb)
+                       return 1;
+
+               /* Reserve space for headers and prepare control bits. */
+               skb_reserve(skb, MAX_TCP_HEADER);
+               /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+               tcp_init_nondata_skb(skb, tp->write_seq,
+                                    TCPHDR_ACK | TCPHDR_FIN);
+               tcp_queue_skb(sk, skb);
+       }
+       __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+
+       return 0;
+}
+
+static void mptcp_sub_close_doit(struct sock *sk)
+{
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (sock_flag(sk, SOCK_DEAD))
+               return;
+
+       /* We come from tcp_disconnect. We are sure that meta_sk is set */
+       if (!mptcp(tp)) {
+               tp->closing = 1;
+               tcp_close(sk, 0);
+               return;
+       }
+
+       if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
+               tp->closing = 1;
+               tcp_close(sk, 0);
+       } else if (tcp_close_state(sk)) {
+               sk->sk_shutdown |= SEND_SHUTDOWN;
+               tcp_send_fin(sk);
+       }
+}
+
+void mptcp_sub_close_wq(struct work_struct *work)
+{
+       struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
+       struct sock *sk = (struct sock *)tp;
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+
+       mutex_lock(&tp->mpcb->mpcb_mutex);
+       lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
+
+       mptcp_sub_close_doit(sk);
+
+       release_sock(meta_sk);
+       mutex_unlock(&tp->mpcb->mpcb_mutex);
+       sock_put(sk);
+}
+
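+/* Close a subflow, either immediately or via the delayed work-queue. With
+ * zero delay we try to send the FIN from this context right away; the
+ * queued work then only finishes the remaining close-transitions.
+ */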
+void mptcp_sub_close(struct sock *sk, unsigned long delay)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
+
+       /* We are already closing - e.g., call from sock_def_error_report upon
+        * tcp_disconnect in tcp_close.
+        */
+       if (tp->closing)
+               return;
+
+       /* Work already scheduled? */
+       if (work_pending(&work->work)) {
+               /* Work present - who will be first? */
+               if (jiffies + delay > work->timer.expires)
+                       return;
+
+               /* Try canceling - if it fails, work will be executed soon */
+               if (!cancel_delayed_work(work))
+                       return;
+               sock_put(sk);
+       }
+
+       if (!delay) {
+               unsigned char old_state = sk->sk_state;
+
+               /* We send the FIN directly, because it may take a long time
+                * until the work-queue gets scheduled...
+                *
+                * If mptcp_sub_send_fin returns 1, it failed and thus we reset
+                * the old state so that tcp_close will finally send the FIN
+                * in user-context.
+                */
+               if (!sk->sk_err && old_state != TCP_CLOSE &&
+                   tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
+                       if (old_state == TCP_ESTABLISHED)
+                               TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+                       sk->sk_state = old_state;
+               }
+       }
+
+       sock_hold(sk);
+       queue_delayed_work(mptcp_wq, work, delay);
+}
+
+void mptcp_sub_force_close(struct sock *sk)
+{
+       /* The below tcp_done may have freed the socket, if it is already dead.
+        * Thus, we are not allowed to access it afterwards. That's why
+        * we have to store the dead-state in this local variable.
+        */
+       int sock_is_dead = sock_flag(sk, SOCK_DEAD);
+
+       tcp_sk(sk)->mp_killed = 1;
+
+       if (sk->sk_state != TCP_CLOSE)
+               tcp_done(sk);
+
+       if (!sock_is_dead)
+               mptcp_sub_close(sk, 0);
+}
+EXPORT_SYMBOL(mptcp_sub_force_close);
+
+/* Update the mpcb send buffer, based on the contributions
+ * of each subflow.
+ */
+void mptcp_update_sndbuf(const struct tcp_sock *tp)
+{
+       struct sock *meta_sk = tp->meta_sk, *sk;
+       int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
+
+       mptcp_for_each_sk(tp->mpcb, sk) {
+               if (!mptcp_sk_can_send(sk))
+                       continue;
+
+               new_sndbuf += sk->sk_sndbuf;
+
+               if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
+                       new_sndbuf = sysctl_tcp_wmem[2];
+                       break;
+               }
+       }
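+
+       /* Cap the sum at tcp_wmem[2], but never let the meta-level send
+        * buffer shrink below its current value.
+        */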
+       meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
+
+       /* The subflow's call to sk_write_space in tcp_new_space ends up in
+        * mptcp_write_space, which has nothing to do with waking up the
+        * application. So, we do the wakeup here.
+        */
+       if (old_sndbuf != meta_sk->sk_sndbuf)
+               meta_sk->sk_write_space(meta_sk);
+}
+
+/* Similar to: tcp_close */
+void mptcp_close(struct sock *meta_sk, long timeout)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sock *sk_it, *tmpsk;
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct sk_buff *skb;
+       int data_was_unread = 0;
+       int state;
+
+       mptcp_debug("%s: Close of meta_sk with tok %#x\n",
+                   __func__, mpcb->mptcp_loc_token);
+
+       mutex_lock(&mpcb->mpcb_mutex);
+       lock_sock(meta_sk);
+
+       if (meta_tp->inside_tk_table)
+               /* Detach the mpcb from the token hashtable */
+               mptcp_hash_remove_bh(meta_tp);
+
+       meta_sk->sk_shutdown = SHUTDOWN_MASK;
+       /* We need to flush the recv. buffs.  We do this only on the
+        * descriptor close, not protocol-sourced closes, because the
+        * reader process may not have drained the data yet!
+        */
+       while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
+               u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+
+               if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+                       len--;
+               data_was_unread += len;
+               __kfree_skb(skb);
+       }
+
+       sk_mem_reclaim(meta_sk);
+
+       /* If the socket has already been reset (e.g. in tcp_reset()) - kill it. */
+       if (meta_sk->sk_state == TCP_CLOSE) {
+               mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
+                       if (tcp_sk(sk_it)->send_mp_fclose)
+                               continue;
+                       mptcp_sub_close(sk_it, 0);
+               }
+               goto adjudge_to_death;
+       }
+
+       if (data_was_unread) {
+               /* Unread data was tossed, zap the connection. */
+               NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
+               tcp_set_state(meta_sk, TCP_CLOSE);
+               tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
+                                                       meta_sk->sk_allocation);
+       } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
+               /* Check zero linger _after_ checking for unread data. */
+               meta_sk->sk_prot->disconnect(meta_sk, 0);
+               NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
+       } else if (tcp_close_state(meta_sk)) {
+               mptcp_send_fin(meta_sk);
+       } else if (meta_tp->snd_una == meta_tp->write_seq) {
+               /* The DATA_FIN has been sent and acknowledged
+                * (e.g., by sk_shutdown). Close all the other subflows
+                */
+               mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
+                       unsigned long delay = 0;
+                       /* If we are the passive closer, don't trigger
+                        * the subflow-fin until the subflow has been finned
+                        * by the peer - thus we add a delay.
+                        */
+                       if (mpcb->passive_close &&
+                           sk_it->sk_state == TCP_ESTABLISHED)
+                               delay = inet_csk(sk_it)->icsk_rto << 3;
+
+                       mptcp_sub_close(sk_it, delay);
+               }
+       }
+
+       sk_stream_wait_close(meta_sk, timeout);
+
+adjudge_to_death:
+       state = meta_sk->sk_state;
+       sock_hold(meta_sk);
+       sock_orphan(meta_sk);
+
+       /* socket will be freed after mptcp_close - we have to prevent
+        * access from the subflows.
+        */
+       mptcp_for_each_sk(mpcb, sk_it) {
+               /* Similar to sock_orphan, but we don't set it DEAD, because
+                * the callbacks are still set and must be called.
+                */
+               write_lock_bh(&sk_it->sk_callback_lock);
+               sk_set_socket(sk_it, NULL);
+               sk_it->sk_wq  = NULL;
+               write_unlock_bh(&sk_it->sk_callback_lock);
+       }
+
+       /* It is the last release_sock in its life. It will remove backlog. */
+       release_sock(meta_sk);
+
+       /* Now socket is owned by kernel and we acquire BH lock
+        * to finish close. No need to check for user refs.
+        */
+       local_bh_disable();
+       bh_lock_sock(meta_sk);
+       WARN_ON(sock_owned_by_user(meta_sk));
+
+       percpu_counter_inc(meta_sk->sk_prot->orphan_count);
+
+       /* Have we already been destroyed by a softirq or backlog? */
+       if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
+               goto out;
+
+       /*      This is a (useful) BSD violation of the RFC. There is a
+        *      problem with TCP as specified in that the other end could
+        *      keep a socket open forever with no application left at this
+        *      end. We use a 3 minute timeout (about the same as BSD) then
+        *      kill our end. If they send after that then tough - BUT: long
+        *      enough that we won't make the old 4*rto = almost no time -
+        *      whoops reset mistake.
+        *
+        *      Nope, it was not a mistake. It is really desired behaviour,
+        *      e.g. on http servers, when such sockets are useless but
+        *      consume significant resources. Let's do it with a special
+        *      linger2 option.                                 --ANK
+        */
+
+       if (meta_sk->sk_state == TCP_FIN_WAIT2) {
+               if (meta_tp->linger2 < 0) {
+                       tcp_set_state(meta_sk, TCP_CLOSE);
+                       meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
+                       __NET_INC_STATS(sock_net(meta_sk),
+                                       LINUX_MIB_TCPABORTONLINGER);
+               } else {
+                       const int tmo = tcp_fin_time(meta_sk);
+
+                       if (tmo > TCP_TIMEWAIT_LEN) {
+                               inet_csk_reset_keepalive_timer(meta_sk,
+                                                              tmo - TCP_TIMEWAIT_LEN);
+                       } else {
+                               meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
+                                                       tmo);
+                               goto out;
+                       }
+               }
+       }
+       if (meta_sk->sk_state != TCP_CLOSE) {
+               sk_mem_reclaim(meta_sk);
+               if (tcp_check_oom(meta_sk, 0)) {
+                       if (net_ratelimit())
+                               pr_info("MPTCP: out of memory: force closing socket\n");
+                       tcp_set_state(meta_sk, TCP_CLOSE);
+                       meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
+                       __NET_INC_STATS(sock_net(meta_sk),
+                                       LINUX_MIB_TCPABORTONMEMORY);
+               }
+       }
+
+       if (meta_sk->sk_state == TCP_CLOSE)
+               inet_csk_destroy_sock(meta_sk);
+       /* Otherwise, socket is reprieved until protocol close. */
+
+out:
+       bh_unlock_sock(meta_sk);
+       local_bh_enable();
+       mutex_unlock(&mpcb->mpcb_mutex);
+       sock_put(meta_sk); /* Taken by sock_hold */
+}
+
+void mptcp_disconnect(struct sock *sk)
+{
+       struct sock *subsk, *tmpsk;
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       __skb_queue_purge(&tp->mpcb->reinject_queue);
+
+       if (tp->inside_tk_table)
+               mptcp_hash_remove_bh(tp);
+
+       local_bh_disable();
+       mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
+               /* The socket will get removed from the subsocket-list
+                * and made non-mptcp by setting mpc to 0.
+                *
+                * This is necessary, because tcp_disconnect assumes
+                * that the connection is completely dead afterwards.
+                * Thus we need to do a mptcp_del_sock, and because of
+                * that call we have to make the socket non-mptcp.
+                *
+                * We have to lock the socket, because we set mpc to 0.
+                * Otherwise, an incoming packet could take the subsocket's
+                * lock and go on into the receive-path - a race.
+                */
+
+               bh_lock_sock(subsk);
+               mptcp_del_sock(subsk);
+               tcp_sk(subsk)->mpc = 0;
+               tcp_sk(subsk)->ops = &tcp_specific;
+               mptcp_sub_force_close(subsk);
+               bh_unlock_sock(subsk);
+       }
+       local_bh_enable();
+
+       tp->was_meta_sk = 1;
+       tp->mpc = 0;
+       tp->ops = &tcp_specific;
+}
+
+/* Returns 1 if we should enable MPTCP for that socket. */
+int mptcp_doit(struct sock *sk)
+{
+       /* Don't do mptcp over loopback */
+       if (sk->sk_family == AF_INET &&
+           (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
+            ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
+               return 0;
+#if IS_ENABLED(CONFIG_IPV6)
+       if (sk->sk_family == AF_INET6 &&
+           (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
+            ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
+               return 0;
+#endif
+       if (mptcp_v6_is_v4_mapped(sk) &&
+           ipv4_is_loopback(inet_sk(sk)->inet_saddr))
+               return 0;
+
+#ifdef CONFIG_TCP_MD5SIG
+       /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no option-space */
+       if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
+               return 0;
+#endif
+
+       return 1;
+}
+
+int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key,
+                          __u8 mptcp_ver, u32 window)
+{
+       struct tcp_sock *master_tp;
+       struct sock *master_sk;
+
+       if (mptcp_alloc_mpcb(meta_sk, remote_key, mptcp_ver, window))
+               goto err_alloc_mpcb;
+
+       master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
+       master_tp = tcp_sk(master_sk);
+
+       if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
+               goto err_add_sock;
+
+       if (__inet_inherit_port(meta_sk, master_sk) < 0)
+               goto err_add_sock;
+
+       meta_sk->sk_prot->unhash(meta_sk);
+       inet_ehash_nolisten(master_sk, NULL);
+
+       master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
+
+       return 0;
+
+err_add_sock:
+       mptcp_fallback_meta_sk(meta_sk);
+
+       inet_csk_prepare_forced_close(master_sk);
+       tcp_done(master_sk);
+       inet_csk_prepare_forced_close(meta_sk);
+       tcp_done(meta_sk);
+
+err_alloc_mpcb:
+       return -ENOBUFS;
+}
+
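+/* Returns 0 if the meta/master setup succeeded, 1 if the connection must
+ * fall back to regular TCP, and a negative error if allocating the
+ * control-block failed.
+ */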
+static int __mptcp_check_req_master(struct sock *child,
+                                   struct request_sock *req)
+{
+       struct tcp_sock *child_tp = tcp_sk(child);
+       struct sock *meta_sk = child;
+       struct mptcp_cb *mpcb;
+       struct mptcp_request_sock *mtreq;
+
+       /* Never contained an MP_CAPABLE */
+       if (!inet_rsk(req)->mptcp_rqsk)
+               return 1;
+
+       if (!inet_rsk(req)->saw_mpc) {
+               /* Fallback to regular TCP, because we saw one SYN without
+                * MP_CAPABLE. In tcp_check_req we continue the regular path.
+                * But, the socket has been added to the reqsk_tk_htb, so we
+                * must still remove it.
+                */
+               MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
+               mptcp_reqsk_remove_tk(req);
+               return 1;
+       }
+
+       MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
+
+       /* Just set these values to pass them to mptcp_alloc_mpcb */
+       mtreq = mptcp_rsk(req);
+       child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
+       child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
+
+       if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
+                                  mtreq->mptcp_ver, child_tp->snd_wnd))
+               return -ENOBUFS;
+
+       child = tcp_sk(child)->mpcb->master_sk;
+       child_tp = tcp_sk(child);
+       mpcb = child_tp->mpcb;
+
+       child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
+       child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
+
+       mpcb->dss_csum = mtreq->dss_csum;
+       mpcb->server_side = 1;
+
+       /* Needs to be done here additionally, because when accepting a
+        * new connection we go through __reqsk_free and not reqsk_free.
+        */
+       mptcp_reqsk_remove_tk(req);
+
+       /* Held since the meta-sk creation in tcp_vX_syn_recv_sock. */
+       sock_put(meta_sk);
+
+       return 0;
+}
+
+int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
+{
+       struct sock *meta_sk = child, *master_sk;
+       struct sk_buff *skb;
+       u32 new_mapping;
+       int ret;
+
+       ret = __mptcp_check_req_master(child, req);
+       if (ret)
+               return ret;
+
+       master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
+
+       /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
+        * pre-MPTCP data in the receive queue.
+        */
+       tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
+                                      tcp_rsk(req)->rcv_isn - 1;
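+
+       /* For example (hypothetical numbers): with rcv_isn = 1000 and 10
+        * bytes of data in the SYN, rcv_nxt is 1011 as the SYN itself
+        * consumes one sequence number, so copied_seq is rewound by exactly
+        * the 10 bytes that sit in the receive queue.
+        */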
+
+       /* Map subflow sequence numbers to data sequence numbers. We need to
+        * map this data to [IDSN - len - 1, IDSN).
+        */
+       new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
+
+       /* There should be only one skb: the SYN + data. */
+       skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
+               TCP_SKB_CB(skb)->seq += new_mapping;
+               TCP_SKB_CB(skb)->end_seq += new_mapping;
+       }
+
+       /* With fastopen we change the semantics of the relative subflow
+        * sequence numbers to deal with middleboxes that could add/remove
+        * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
+        * instead of the regular TCP ISN.
+        */
+       tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
+
+       /* We need to update copied_seq of the master_sk to account for the
+        * already moved data to the meta receive queue.
+        */
+       tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
+
+       /* Handled by the master_sk */
+       tcp_sk(meta_sk)->fastopen_rsk = NULL;
+
+       return 0;
+}
+
+int mptcp_check_req_master(struct sock *sk, struct sock *child,
+                          struct request_sock *req, const struct sk_buff *skb,
+                          int drop)
+{
+       struct sock *meta_sk = child;
+       int ret;
+
+       ret = __mptcp_check_req_master(child, req);
+       if (ret)
+               return ret;
+       child = tcp_sk(child)->mpcb->master_sk;
+
+       sock_rps_save_rxhash(child, skb);
+
+       /* drop indicates that we come from tcp_check_req and thus need to
+        * handle the request-socket fully.
+        */
+       if (drop) {
+               tcp_synack_rtt_meas(child, req);
+               inet_csk_complete_hashdance(sk, meta_sk, req, true);
+       } else {
+               /* Otherwise, we come from syn-cookies */
+               refcount_set(&req->rsk_refcnt, 1);
+               inet_csk_reqsk_queue_add(sk, req, meta_sk);
+       }
+
+       return 0;
+}
+
+struct sock *mptcp_check_req_child(struct sock *meta_sk,
+                                  struct sock *child,
+                                  struct request_sock *req,
+                                  struct sk_buff *skb,
+                                  const struct mptcp_options_received *mopt)
+{
+       struct tcp_sock *child_tp = tcp_sk(child);
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       u8 hash_mac_check[20];
+
+       child_tp->inside_tk_table = 0;
+
+       if (!mopt->join_ack) {
+               MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKFAIL);
+               goto teardown;
+       }
+
+       mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
+                       (u8 *)&mpcb->mptcp_loc_key,
+                       (u32 *)hash_mac_check, 2,
+                       4, (u8 *)&mtreq->mptcp_rem_nonce,
+                       4, (u8 *)&mtreq->mptcp_loc_nonce);
+
+       if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20)) {
+               MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKMAC);
+               goto teardown;
+       }
+
+       /* Point it to the same struct socket and wq as the meta_sk */
+       sk_set_socket(child, meta_sk->sk_socket);
+       child->sk_wq = meta_sk->sk_wq;
+
+       if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
+               /* Has been inherited, but now child_tp->mptcp is NULL */
+               child_tp->mpc = 0;
+               child_tp->ops = &tcp_specific;
+
+               /* TODO: when we support acking the third ack for new subflows,
+                * we should silently discard this third ack by returning NULL.
+                *
+                * Maybe, at the retransmission we will have enough memory to
+                * fully add the socket to the meta-sk.
+                */
+               goto teardown;
+       }
+
+       /* The child is a clone of the meta socket; we must now reset
+        * some of the fields.
+        */
+       child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
+
+       /* We should allow proper increase of the snd/rcv-buffers. Thus, we
+        * use the original values instead of the bloated-up ones from the
+        * clone.
+        */
+       child->sk_sndbuf = mpcb->orig_sk_sndbuf;
+       child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
+
+       child_tp->mptcp->slave_sk = 1;
+       child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
+       child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
+       child_tp->mptcp->init_rcv_wnd = req->rsk_rcv_wnd;
+
+       child->sk_tsq_flags = 0;
+       child_tp->out_of_order_queue = RB_ROOT;
+
+       sock_rps_save_rxhash(child, skb);
+       tcp_synack_rtt_meas(child, req);
+
+       /* Subflows do not use the accept queue, as they
+        * are attached immediately to the mpcb.
+        */
+       inet_csk_reqsk_queue_drop(meta_sk, req);
+       reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
+
+       /* The refcnt is initialized to 2, because regular TCP would put it
+        * in the socket's listener queue. However, we do not have a
+        * listener-queue. So, we need to make sure that this request-sock
+        * indeed gets destroyed.
+        */
+       reqsk_put(req);
+
+       MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINACKRX);
+       return child;
+
+teardown:
+       req->rsk_ops->send_reset(meta_sk, skb);
+
+       /* Drop this request - sock creation failed. */
+       inet_csk_reqsk_queue_drop(meta_sk, req);
+       reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
+       inet_csk_prepare_forced_close(child);
+       tcp_done(child);
+       return meta_sk;
+}
+
+int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
+{
+       struct mptcp_tw *mptw;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_cb *mpcb = tp->mpcb;
+
+       /* A subsocket in tw can only receive data. So, if we are in
+        * infinite-receive, then we should not reply with a data-ack or act
+        * upon general MPTCP-signaling. We prevent this by simply not creating
+        * the mptcp_tw_sock.
+        */
+       if (mpcb->infinite_mapping_rcv) {
+               tw->mptcp_tw = NULL;
+               return 0;
+       }
+
+       /* Alloc MPTCP-tw-sock */
+       mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
+       if (!mptw) {
+               tw->mptcp_tw = NULL;
+               return -ENOBUFS;
+       }
+
+       atomic_inc(&mpcb->mpcb_refcnt);
+
+       tw->mptcp_tw = mptw;
+       mptw->loc_key = mpcb->mptcp_loc_key;
+       mptw->meta_tw = mpcb->in_time_wait;
+       mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
+       if (mptw->meta_tw && mpcb->mptw_state != TCP_TIME_WAIT)
+               mptw->rcv_nxt++;
+       rcu_assign_pointer(mptw->mpcb, mpcb);
+
+       spin_lock(&mpcb->tw_lock);
+       list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
+       mptw->in_list = 1;
+       spin_unlock(&mpcb->tw_lock);
+
+       return 0;
+}
+
+void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
+{
+       struct mptcp_cb *mpcb;
+
+       rcu_read_lock_bh();
+       mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
+
+       /* If we are still holding a ref to the mpcb, we have to remove ourselves
+        * from the list and drop the ref properly.
+        */
+       if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
+               spin_lock(&mpcb->tw_lock);
+               if (tw->mptcp_tw->in_list) {
+                       list_del_rcu(&tw->mptcp_tw->list);
+                       tw->mptcp_tw->in_list = 0;
+               }
+               spin_unlock(&mpcb->tw_lock);
+
+               /* Twice, because we increased it above */
+               mptcp_mpcb_put(mpcb);
+               mptcp_mpcb_put(mpcb);
+       }
+
+       rcu_read_unlock_bh();
+
+       kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
+}
+
+/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
+ * data-fin.
+ */
+void mptcp_time_wait(struct sock *meta_sk, int state, int timeo)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_tw *mptw;
+
+       /* Used for sockets that go into tw after the meta
+        * (see mptcp_init_tw_sock())
+        */
+       meta_tp->mpcb->in_time_wait = 1;
+       meta_tp->mpcb->mptw_state = state;
+
+       /* Update the time-wait-sock's information */
+       rcu_read_lock_bh();
+       list_for_each_entry_rcu(mptw, &meta_tp->mpcb->tw_list, list) {
+               mptw->meta_tw = 1;
+               mptw->rcv_nxt = mptcp_get_rcv_nxt_64(meta_tp);
+
+               /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
+                * pretend that the DATA_FIN has already reached us; that way
+                * the checks in tcp_timewait_state_process will pass when the
+                * DATA_FIN comes in.
+                */
+               if (state != TCP_TIME_WAIT)
+                       mptw->rcv_nxt++;
+       }
+       rcu_read_unlock_bh();
+
+       if (meta_sk->sk_state != TCP_CLOSE)
+               tcp_done(meta_sk);
+}
+
+void mptcp_tsq_flags(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+
+       /* It will be handled as a regular deferred-call */
+       if (is_meta_sk(sk))
+               return;
+
+       if (hlist_unhashed(&tp->mptcp->cb_list)) {
+               hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
+               /* We need to hold it here, as the sock_hold is not assured
+                * by release_sock as it is in regular TCP.
+                *
+                * The subsocket may get inet_csk_destroy'd while it is inside
+                * the callback_list.
+                */
+               sock_hold(sk);
+       }
+
+       if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &meta_sk->sk_tsq_flags))
+               sock_hold(meta_sk);
+}
+
+void mptcp_tsq_sub_deferred(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_tcp_sock *mptcp;
+       struct hlist_node *tmp;
+
+       WARN_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
+
+       __sock_put(meta_sk);
+       hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
+               struct tcp_sock *tp = mptcp->tp;
+               struct sock *sk = (struct sock *)tp;
+
+               hlist_del_init(&mptcp->cb_list);
+               sk->sk_prot->release_cb(sk);
+               /* Final sock_put (cfr. mptcp_tsq_flags) */
+               sock_put(sk);
+       }
+}
+
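+/* Initialize the request-sock for an incoming MP_JOIN SYN: record the
+ * peer's nonce and precompute the truncated HMAC (local key and local
+ * nonce first) that our SYN/ACK will carry.
+ */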
+void mptcp_join_reqsk_init(const struct mptcp_cb *mpcb,
+                          const struct request_sock *req,
+                          struct sk_buff *skb)
+{
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+       struct mptcp_options_received mopt;
+       u8 mptcp_hash_mac[20];
+
+       mptcp_init_mp_opt(&mopt);
+       tcp_parse_mptcp_options(skb, &mopt);
+
+       mtreq->is_sub = 1;
+       inet_rsk(req)->mptcp_rqsk = 1;
+
+       mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
+
+       mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
+                       (u8 *)&mpcb->mptcp_rem_key,
+                       (u32 *)mptcp_hash_mac, 2,
+                       4, (u8 *)&mtreq->mptcp_loc_nonce,
+                       4, (u8 *)&mtreq->mptcp_rem_nonce);
+       mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
+
+       mtreq->rem_id = mopt.rem_id;
+       mtreq->rcv_low_prio = mopt.low_prio;
+       inet_rsk(req)->saw_mpc = 1;
+
+       MPTCP_INC_STATS(sock_net(mpcb->meta_sk), MPTCP_MIB_JOINSYNRX);
+}
+
+void mptcp_reqsk_init(struct request_sock *req, const struct sock *sk,
+                     const struct sk_buff *skb, bool want_cookie)
+{
+       struct mptcp_options_received mopt;
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+
+       mptcp_init_mp_opt(&mopt);
+       tcp_parse_mptcp_options(skb, &mopt);
+
+       mtreq->dss_csum = mopt.dss_csum;
+
+       if (want_cookie) {
+               if (!mptcp_reqsk_new_cookie(req, &mopt, skb))
+                       /* No key available - back to regular TCP */
+                       inet_rsk(req)->mptcp_rqsk = 0;
+               return;
+       }
+
+       mptcp_reqsk_new_mptcp(req, sk, &mopt, skb);
+}
+
+void mptcp_cookies_reqsk_init(struct request_sock *req,
+                             struct mptcp_options_received *mopt,
+                             struct sk_buff *skb)
+{
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+
+       /* Absolutely need to always initialize this. */
+       mtreq->hash_entry.pprev = NULL;
+
+       mtreq->mptcp_rem_key = mopt->mptcp_sender_key;
+       mtreq->mptcp_loc_key = mopt->mptcp_receiver_key;
+
+       /* Generate the token */
+       mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
+
+       rcu_read_lock_bh();
+       spin_lock(&mptcp_tk_hashlock);
+
+       /* Check if the key is still free */
+       if (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
+           mptcp_find_token(mtreq->mptcp_loc_token))
+               goto out;
+
+       inet_rsk(req)->saw_mpc = 1;
+       mtreq->is_sub = 0;
+       inet_rsk(req)->mptcp_rqsk = 1;
+       mtreq->dss_csum = mopt->dss_csum;
+
+out:
+       spin_unlock(&mptcp_tk_hashlock);
+       rcu_read_unlock_bh();
+}
+
+int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+       struct mptcp_options_received mopt;
+
+       mptcp_init_mp_opt(&mopt);
+       tcp_parse_mptcp_options(skb, &mopt);
+
+       if (mopt.is_mp_join)
+               return mptcp_do_join_short(skb, &mopt, sock_net(sk));
+       if (mopt.drop_me)
+               goto drop;
+
+       if (!sock_flag(sk, SOCK_MPTCP))
+               mopt.saw_mpc = 0;
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               if (mopt.saw_mpc) {
+                       if (skb_rtable(skb)->rt_flags &
+                           (RTCF_BROADCAST | RTCF_MULTICAST))
+                               goto drop;
+
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE);
+                       return tcp_conn_request(&mptcp_request_sock_ops,
+                                               &mptcp_request_sock_ipv4_ops,
+                                               sk, skb);
+               }
+
+               return tcp_v4_conn_request(sk, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+       } else {
+               if (mopt.saw_mpc) {
+                       if (!ipv6_unicast_destination(skb))
+                               goto drop;
+
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVE);
+                       return tcp_conn_request(&mptcp6_request_sock_ops,
+                                               &mptcp_request_sock_ipv6_ops,
+                                               sk, skb);
+               }
+
+               return tcp_v6_conn_request(sk, skb);
+#endif
+       }
+drop:
+       __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
+       return 0;
+}
+
+static void __mptcp_get_info(const struct sock *meta_sk,
+                            struct mptcp_meta_info *info)
+{
+       const struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       u32 now = tcp_jiffies32;
+
+       memset(info, 0, sizeof(*info));
+
+       info->mptcpi_state = meta_sk->sk_state;
+       info->mptcpi_retransmits = meta_icsk->icsk_retransmits;
+       info->mptcpi_probes = meta_icsk->icsk_probes_out;
+       info->mptcpi_backoff = meta_icsk->icsk_backoff;
+
+       info->mptcpi_rto = jiffies_to_usecs(meta_icsk->icsk_rto);
+
+       info->mptcpi_unacked = meta_tp->packets_out;
+
+       info->mptcpi_last_data_sent = jiffies_to_msecs(now - meta_tp->lsndtime);
+       info->mptcpi_last_data_recv = jiffies_to_msecs(now - meta_icsk->icsk_ack.lrcvtime);
+       info->mptcpi_last_ack_recv = jiffies_to_msecs(now - meta_tp->rcv_tstamp);
+
+       info->mptcpi_total_retrans = meta_tp->total_retrans;
+
+       info->mptcpi_bytes_acked = meta_tp->bytes_acked;
+       info->mptcpi_bytes_received = meta_tp->bytes_received;
+}
+
+static void mptcp_get_sub_info(struct sock *sk, struct mptcp_sub_info *info)
+{
+       struct inet_sock *inet = inet_sk(sk);
+
+       memset(info, 0, sizeof(*info));
+
+       if (sk->sk_family == AF_INET) {
+               info->src_v4.sin_family = AF_INET;
+               info->src_v4.sin_port = inet->inet_sport;
+
+               info->src_v4.sin_addr.s_addr = inet->inet_rcv_saddr;
+               if (!info->src_v4.sin_addr.s_addr)
+                       info->src_v4.sin_addr.s_addr = inet->inet_saddr;
+
+               info->dst_v4.sin_family = AF_INET;
+               info->dst_v4.sin_port = inet->inet_dport;
+               info->dst_v4.sin_addr.s_addr = inet->inet_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+       } else {
+               struct ipv6_pinfo *np = inet6_sk(sk);
+
+               info->src_v6.sin6_family = AF_INET6;
+               info->src_v6.sin6_port = inet->inet_sport;
+
+               if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+                       info->src_v6.sin6_addr = np->saddr;
+               else
+                       info->src_v6.sin6_addr = sk->sk_v6_rcv_saddr;
+
+               info->dst_v6.sin6_family = AF_INET6;
+               info->dst_v6.sin6_port = inet->inet_dport;
+               info->dst_v6.sin6_addr = sk->sk_v6_daddr;
+#endif
+       }
+}
+
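+/* Copy MPTCP-level and per-subflow information to userspace. The passed-in
+ * struct mptcp_info holds pointers (meta_info, initial, subflows,
+ * subflow_info) selecting what to copy out, plus length fields that are
+ * clamped to what the kernel can provide and written back. A rough usage
+ * sketch (assuming this is wired up as an MPTCP_INFO getsockopt at the
+ * IPPROTO_TCP level):
+ *
+ *     struct mptcp_meta_info meta;
+ *     struct mptcp_info minfo = { .meta_info = &meta,
+ *                                 .meta_len  = sizeof(meta) };
+ *     socklen_t len = sizeof(minfo);
+ *     getsockopt(fd, IPPROTO_TCP, MPTCP_INFO, &minfo, &len);
+ */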
+int mptcp_get_info(const struct sock *meta_sk, char __user *optval, int optlen)
+{
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sock *sk;
+
+       struct mptcp_meta_info meta_info;
+       struct mptcp_info m_info;
+
+       unsigned int info_len;
+
+       if (copy_from_user(&m_info, optval, optlen))
+               return -EFAULT;
+
+       if (m_info.meta_info) {
+               unsigned int len;
+
+               __mptcp_get_info(meta_sk, &meta_info);
+
+               /* Need to set this in case the user thinks tcp_info is bigger than ours */
+               len = min_t(unsigned int, m_info.meta_len, sizeof(meta_info));
+               m_info.meta_len = len;
+
+               if (copy_to_user((void __user *)m_info.meta_info, &meta_info, len))
+                       return -EFAULT;
+       }
+
+       /* Need to set this in case the user thinks tcp_info is bigger than ours */
+       info_len = min_t(unsigned int, m_info.tcp_info_len, sizeof(struct tcp_info));
+       m_info.tcp_info_len = info_len;
+
+       if (m_info.initial) {
+               struct mptcp_cb *mpcb = meta_tp->mpcb;
+
+               if (mpcb->master_sk) {
+                       struct tcp_info info;
+
+                       tcp_get_info(mpcb->master_sk, &info);
+                       if (copy_to_user((void __user *)m_info.initial, &info, info_len))
+                               return -EFAULT;
+               } else if (meta_tp->record_master_info && mpcb->master_info) {
+                       if (copy_to_user((void __user *)m_info.initial, mpcb->master_info, info_len))
+                               return -EFAULT;
+               } else {
+                       return meta_tp->record_master_info ? -ENOMEM : -EINVAL;
+               }
+       }
+
+       if (m_info.subflows) {
+               unsigned int len, sub_len = 0;
+               char __user *ptr;
+
+               ptr = (char __user *)m_info.subflows;
+               len = m_info.sub_len;
+
+               mptcp_for_each_sk(meta_tp->mpcb, sk) {
+                       struct tcp_info t_info;
+                       unsigned int tmp_len;
+
+                       tcp_get_info(sk, &t_info);
+
+                       tmp_len = min_t(unsigned int, len, info_len);
+                       len -= tmp_len;
+
+                       if (copy_to_user(ptr, &t_info, tmp_len))
+                               return -EFAULT;
+
+                       ptr += tmp_len;
+                       sub_len += tmp_len;
+
+                       if (len == 0)
+                               break;
+               }
+
+               m_info.sub_len = sub_len;
+       }
+
+       if (m_info.subflow_info) {
+               unsigned int len, sub_info_len, total_sub_info_len = 0;
+               char __user *ptr;
+
+               ptr = (char __user *)m_info.subflow_info;
+               len = m_info.total_sub_info_len;
+
+               sub_info_len = min_t(unsigned int, m_info.sub_info_len,
+                                    sizeof(struct mptcp_sub_info));
+               m_info.sub_info_len = sub_info_len;
+
+               mptcp_for_each_sk(meta_tp->mpcb, sk) {
+                       struct mptcp_sub_info m_sub_info;
+                       unsigned int tmp_len;
+
+                       mptcp_get_sub_info(sk, &m_sub_info);
+
+                       tmp_len = min_t(unsigned int, len, sub_info_len);
+                       len -= tmp_len;
+
+                       if (copy_to_user(ptr, &m_sub_info, tmp_len))
+                               return -EFAULT;
+
+                       ptr += tmp_len;
+                       total_sub_info_len += tmp_len;
+
+                       if (len == 0)
+                               break;
+               }
+
+               m_info.total_sub_info_len = total_sub_info_len;
+       }
+
+       if (copy_to_user(optval, &m_info, optlen))
+               return -EFAULT;
+
+       return 0;
+}
+
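+/* Clear the socket for reuse without touching tk_table.next: the
+ * hlist-nulls next pointer must stay intact so that concurrent RCU lookups
+ * racing with socket recycling can still walk the chain safely.
+ */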
+void mptcp_clear_sk(struct sock *sk, int size)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       /* We must not clear the tk_table field, because concurrent RCU lookups depend on it */
+       sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table.next));
+
+       size -= offsetof(struct tcp_sock, tk_table.pprev);
+       memset((char *)&tp->tk_table.pprev, 0, size);
+}
+
+static const struct snmp_mib mptcp_snmp_list[] = {
+       SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
+       SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE),
+       SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK),
+       SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
+       SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
+       SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+       SNMP_MIB_ITEM("MPCapableRetransFallback", MPTCP_MIB_MPCAPABLERETRANSFALLBACK),
+       SNMP_MIB_ITEM("MPTCPCsumEnabled", MPTCP_MIB_CSUMENABLED),
+       SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
+       SNMP_MIB_ITEM("MPFailRX", MPTCP_MIB_MPFAILRX),
+       SNMP_MIB_ITEM("MPCsumFail", MPTCP_MIB_CSUMFAIL),
+       SNMP_MIB_ITEM("MPFastcloseRX", MPTCP_MIB_FASTCLOSERX),
+       SNMP_MIB_ITEM("MPFastcloseTX", MPTCP_MIB_FASTCLOSETX),
+       SNMP_MIB_ITEM("MPFallbackAckSub", MPTCP_MIB_FBACKSUB),
+       SNMP_MIB_ITEM("MPFallbackAckInit", MPTCP_MIB_FBACKINIT),
+       SNMP_MIB_ITEM("MPFallbackDataSub", MPTCP_MIB_FBDATASUB),
+       SNMP_MIB_ITEM("MPFallbackDataInit", MPTCP_MIB_FBDATAINIT),
+       SNMP_MIB_ITEM("MPRemoveAddrSubDelete", MPTCP_MIB_REMADDRSUB),
+       SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
+       SNMP_MIB_ITEM("MPJoinAlreadyFallenback", MPTCP_MIB_JOINFALLBACK),
+       SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX),
+       SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+       SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+       SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
+       SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
+       SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+       SNMP_MIB_ITEM("MPJoinAckMissing", MPTCP_MIB_JOINACKFAIL),
+       SNMP_MIB_ITEM("MPJoinAckRTO", MPTCP_MIB_JOINACKRTO),
+       SNMP_MIB_ITEM("MPJoinAckRexmit", MPTCP_MIB_JOINACKRXMIT),
+       SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW),
+       SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+       SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+       SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH),
+       SNMP_MIB_ITEM("DSSTrimHead", MPTCP_MIB_DSSTRIMHEAD),
+       SNMP_MIB_ITEM("DSSSplitTail", MPTCP_MIB_DSSSPLITTAIL),
+       SNMP_MIB_ITEM("DSSPurgeOldSubSegs", MPTCP_MIB_PURGEOLD),
+       SNMP_MIB_ITEM("AddAddrRx", MPTCP_MIB_ADDADDRRX),
+       SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX),
+       SNMP_MIB_ITEM("RemAddrRx", MPTCP_MIB_REMADDRRX),
+       SNMP_MIB_ITEM("RemAddrTx", MPTCP_MIB_REMADDRTX),
+       SNMP_MIB_SENTINEL
+};
+
+struct workqueue_struct *mptcp_wq;
+EXPORT_SYMBOL(mptcp_wq);
+
+/* Output /proc/net/mptcp */
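+/* One line per MPTCP connection: slot number, local and remote token
+ * (only shown with CAP_NET_ADMIN), v6 flag, local and remote address:port,
+ * meta-socket state, number of subflows, tx/rx queue sizes and inode.
+ */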
+static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
+{
+       struct tcp_sock *meta_tp;
+       const struct net *net = seq->private;
+       int i, n = 0;
+
+       seq_printf(seq, "  sl  loc_tok  rem_tok  v6 local_address                         remote_address                        st ns tx_queue rx_queue inode");
+       seq_putc(seq, '\n');
+
+       for (i = 0; i < MPTCP_HASH_SIZE; i++) {
+               struct hlist_nulls_node *node;
+
+               rcu_read_lock_bh();
+               hlist_nulls_for_each_entry_rcu(meta_tp, node,
+                                              &tk_hashtable[i], tk_table) {
+                       struct mptcp_cb *mpcb = meta_tp->mpcb;
+                       struct sock *meta_sk = (struct sock *)meta_tp;
+                       struct inet_sock *isk = inet_sk(meta_sk);
+
+                       if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
+                               continue;
+
+                       if (capable(CAP_NET_ADMIN)) {
+                               seq_printf(seq, "%4d: %04X %04X ", n++,
+                                               mpcb->mptcp_loc_token,
+                                               mpcb->mptcp_rem_token);
+                       } else {
+                               seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
+                       }
+                       if (meta_sk->sk_family == AF_INET ||
+                           mptcp_v6_is_v4_mapped(meta_sk)) {
+                               seq_printf(seq, " 0 %08X:%04X                         %08X:%04X                        ",
+                                          isk->inet_rcv_saddr,
+                                          ntohs(isk->inet_sport),
+                                          isk->inet_daddr,
+                                          ntohs(isk->inet_dport));
+#if IS_ENABLED(CONFIG_IPV6)
+                       } else if (meta_sk->sk_family == AF_INET6) {
+                               struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
+                               struct in6_addr *dst = &meta_sk->sk_v6_daddr;
+
+                               seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
+                                          src->s6_addr32[0], src->s6_addr32[1],
+                                          src->s6_addr32[2], src->s6_addr32[3],
+                                          ntohs(isk->inet_sport),
+                                          dst->s6_addr32[0], dst->s6_addr32[1],
+                                          dst->s6_addr32[2], dst->s6_addr32[3],
+                                          ntohs(isk->inet_dport));
+#endif
+                       }
+                       seq_printf(seq, " %02X %02X %08X:%08X %lu",
+                                  meta_sk->sk_state, mpcb->cnt_subflows,
+                                  meta_tp->write_seq - meta_tp->snd_una,
+                                  max_t(int, meta_tp->rcv_nxt -
+                                        meta_tp->copied_seq, 0),
+                                  sock_i_ino(meta_sk));
+                       seq_putc(seq, '\n');
+               }
+
+               rcu_read_unlock_bh();
+       }
+
+       return 0;
+}
+
+static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open_net(inode, file, mptcp_pm_seq_show);
+}
+
+static const struct file_operations mptcp_pm_seq_fops = {
+       .owner = THIS_MODULE,
+       .open = mptcp_pm_seq_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release_net,
+};
+
+static int mptcp_snmp_seq_show(struct seq_file *seq, void *v)
+{
+       struct net *net = seq->private;
+       int i;
+
+       for (i = 0; mptcp_snmp_list[i].name != NULL; i++)
+               seq_printf(seq, "%-32s\t%ld\n", mptcp_snmp_list[i].name,
+                          snmp_fold_field(net->mptcp.mptcp_statistics,
+                                     mptcp_snmp_list[i].entry));
+
+       return 0;
+}
+
+static int mptcp_snmp_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open_net(inode, file, mptcp_snmp_seq_show);
+}
+
+static const struct file_operations mptcp_snmp_seq_fops = {
+       .owner = THIS_MODULE,
+       .open = mptcp_snmp_seq_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release_net,
+};
+
+static int mptcp_pm_init_net(struct net *net)
+{
+       net->mptcp.mptcp_statistics = alloc_percpu(struct mptcp_mib);
+       if (!net->mptcp.mptcp_statistics)
+               goto out_mptcp_mibs;
+
+#ifdef CONFIG_PROC_FS
+       net->mptcp.proc_net_mptcp = proc_net_mkdir(net, "mptcp_net", net->proc_net);
+       if (!net->mptcp.proc_net_mptcp)
+               goto out_proc_net_mptcp;
+       if (!proc_create("mptcp", 0444, net->mptcp.proc_net_mptcp,
+                        &mptcp_pm_seq_fops))
+               goto out_mptcp_net_mptcp;
+       if (!proc_create("snmp", 0444, net->mptcp.proc_net_mptcp,
+                        &mptcp_snmp_seq_fops))
+               goto out_mptcp_net_snmp;
+#endif
+
+       return 0;
+
+#ifdef CONFIG_PROC_FS
+out_mptcp_net_snmp:
+       remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp);
+out_mptcp_net_mptcp:
+       remove_proc_subtree("mptcp_net", net->proc_net);
+       net->mptcp.proc_net_mptcp = NULL;
+out_proc_net_mptcp:
+       free_percpu(net->mptcp.mptcp_statistics);
+#endif
+out_mptcp_mibs:
+       return -ENOMEM;
+}
+
+static void mptcp_pm_exit_net(struct net *net)
+{
+       remove_proc_entry("snmp", net->mptcp.proc_net_mptcp);
+       remove_proc_entry("mptcp", net->mptcp.proc_net_mptcp);
+       remove_proc_subtree("mptcp_net", net->proc_net);
+       free_percpu(net->mptcp.mptcp_statistics);
+}
+
+static struct pernet_operations mptcp_pm_proc_ops = {
+       .init = mptcp_pm_init_net,
+       .exit = mptcp_pm_exit_net,
+};
+
+/* General initialization of mptcp */
+void __init mptcp_init(void)
+{
+       int i;
+       struct ctl_table_header *mptcp_sysctl;
+
+       mptcp_sock_cache = kmem_cache_create("mptcp_sock",
+                                            sizeof(struct mptcp_tcp_sock),
+                                            0, SLAB_HWCACHE_ALIGN,
+                                            NULL);
+       if (!mptcp_sock_cache)
+               goto mptcp_sock_cache_failed;
+
+       mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
+                                          0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
+                                          NULL);
+       if (!mptcp_cb_cache)
+               goto mptcp_cb_cache_failed;
+
+       mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
+                                          0, SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
+                                          NULL);
+       if (!mptcp_tw_cache)
+               goto mptcp_tw_cache_failed;
+
+       get_random_bytes(&mptcp_secret, sizeof(mptcp_secret));
+
+       mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
+       if (!mptcp_wq)
+               goto alloc_workqueue_failed;
+
+       for (i = 0; i < MPTCP_HASH_SIZE; i++) {
+               INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
+               INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
+       }
+
+       spin_lock_init(&mptcp_tk_hashlock);
+
+       if (register_pernet_subsys(&mptcp_pm_proc_ops))
+               goto pernet_failed;
+
+#if IS_ENABLED(CONFIG_IPV6)
+       if (mptcp_pm_v6_init())
+               goto mptcp_pm_v6_failed;
+#endif
+       if (mptcp_pm_v4_init())
+               goto mptcp_pm_v4_failed;
+
+       mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
+       if (!mptcp_sysctl)
+               goto register_sysctl_failed;
+
+       if (mptcp_register_path_manager(&mptcp_pm_default))
+               goto register_pm_failed;
+
+       if (mptcp_register_scheduler(&mptcp_sched_default))
+               goto register_sched_failed;
+
+       pr_info("MPTCP: Unstable branch\n");
+
+       mptcp_init_failed = false;
+
+       return;
+
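+/* Error unwinding: tear everything down in reverse order of the setup above */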
+register_sched_failed:
+       mptcp_unregister_path_manager(&mptcp_pm_default);
+register_pm_failed:
+       unregister_net_sysctl_table(mptcp_sysctl);
+register_sysctl_failed:
+       mptcp_pm_v4_undo();
+mptcp_pm_v4_failed:
+#if IS_ENABLED(CONFIG_IPV6)
+       mptcp_pm_v6_undo();
+mptcp_pm_v6_failed:
+#endif
+       unregister_pernet_subsys(&mptcp_pm_proc_ops);
+pernet_failed:
+       destroy_workqueue(mptcp_wq);
+alloc_workqueue_failed:
+       kmem_cache_destroy(mptcp_tw_cache);
+mptcp_tw_cache_failed:
+       kmem_cache_destroy(mptcp_cb_cache);
+mptcp_cb_cache_failed:
+       kmem_cache_destroy(mptcp_sock_cache);
+mptcp_sock_cache_failed:
+       mptcp_init_failed = true;
+}
diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
new file mode 100644 (file)
index 0000000..d50b9de
--- /dev/null
@@ -0,0 +1,1988 @@
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/mptcp_v6.h>
+#include <net/addrconf.h>
+#endif
+
+enum {
+       MPTCP_EVENT_ADD = 1,
+       MPTCP_EVENT_DEL,
+       MPTCP_EVENT_MOD,
+};
+
+#define MPTCP_SUBFLOW_RETRY_DELAY      1000
+
+/* Max number of local or remote addresses we can store.
+ * When changing, see the bitfield below in fullmesh_rem4/6.
+ */
+#define MPTCP_MAX_ADDR 8
+
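+/* Per remote address, 'bitfield' tracks which of our local addresses
+ * already have a subflow towards it, and 'retry_bitfield' which
+ * combinations still need a delayed retry (e.g. after -ENETUNREACH).
+ */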
+struct fullmesh_rem4 {
+       u8              rem4_id;
+       u8              bitfield;
+       u8              retry_bitfield;
+       __be16          port;
+       struct in_addr  addr;
+};
+
+struct fullmesh_rem6 {
+       u8              rem6_id;
+       u8              bitfield;
+       u8              retry_bitfield;
+       __be16          port;
+       struct in6_addr addr;
+};
+
+struct mptcp_loc_addr {
+       struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
+       u8 loc4_bits;
+       u8 next_v4_index;
+
+       struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
+       u8 loc6_bits;
+       u8 next_v6_index;
+       struct rcu_head rcu;
+};
+
+struct mptcp_addr_event {
+       struct list_head list;
+       unsigned short  family;
+       u8      code:7,
+               low_prio:1;
+       int     if_idx;
+       union inet_addr addr;
+};
+
+struct fullmesh_priv {
+       /* Worker struct for subflow establishment */
+       struct work_struct subflow_work;
+       /* Delayed worker, when the routing-tables are not yet ready. */
+       struct delayed_work subflow_retry_work;
+
+       /* Remote addresses */
+       struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
+       struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
+
+       struct mptcp_cb *mpcb;
+
+       u16 remove_addrs; /* Addresses to remove */
+       u8 announced_addrs_v4; /* IPv4 addresses we announced */
+       u8 announced_addrs_v6; /* IPv6 addresses we announced */
+
+       u8      add_addr; /* Are we sending an add_addr? */
+
+       u8 rem4_bits;
+       u8 rem6_bits;
+
+       /* Have we established the additional subflows for the primary pair? */
+       u8 first_pair:1;
+};
+
+struct mptcp_fm_ns {
+       struct mptcp_loc_addr __rcu *local;
+       spinlock_t local_lock; /* Protecting the above pointer */
+       struct list_head events;
+       struct delayed_work address_worker;
+
+       struct net *net;
+};
+
+static int num_subflows __read_mostly = 1;
+module_param(num_subflows, int, 0644);
+MODULE_PARM_DESC(num_subflows, "choose the number of subflows per pair of IP addresses of an MPTCP connection");
+
+static int create_on_err __read_mostly;
+module_param(create_on_err, int, 0644);
+MODULE_PARM_DESC(create_on_err, "recreate the subflow upon a timeout");
+
+static struct mptcp_pm_ops full_mesh __read_mostly;
+
+static void full_mesh_create_subflows(struct sock *meta_sk);
+
+static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
+{
+       return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
+}
+
+static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
+{
+       return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
+}
+
+/* Find the first free index in the bitfield */
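+/* E.g. a bitfield of 0b00000111 with base 0 yields 3, the first zero bit.
+ * If no zero bit is found at or above 'base', the search wraps to bit 0.
+ */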
+static int __mptcp_find_free_index(u8 bitfield, u8 base)
+{
+       int i;
+
+       /* There are no free bits anyway... */
+       if (bitfield == 0xff)
+               goto exit;
+
+       i = ffs(~(bitfield >> base)) - 1;
+       if (i < 0)
+               goto exit;
+
+       /* No free bits when starting at base, try from 0 on */
+       if (i + base >= sizeof(bitfield) * 8)
+               return __mptcp_find_free_index(bitfield, 0);
+
+       return i + base;
+exit:
+       return -1;
+}
+
+static int mptcp_find_free_index(u8 bitfield)
+{
+       return __mptcp_find_free_index(bitfield, 0);
+}
+
+static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
+                             const struct in_addr *addr,
+                             __be16 port, u8 id)
+{
+       int i;
+       struct fullmesh_rem4 *rem4;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+               rem4 = &fmp->remaddr4[i];
+
+               /* Address is already in the list --- nothing to do */
+               if (rem4->rem4_id == id &&
+                   rem4->addr.s_addr == addr->s_addr && rem4->port == port)
+                       return;
+
+               /* This may be the case when the peer is behind a NAT. It is
+                * trying to JOIN and thus sends the JOIN with a certain ID.
+                * However, the src-addr of the IP packet has been changed. We
+                * update the addr in the list, because this is the address as
+                * our box sees it.
+                */
+               if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
+                       /* update the address */
+                       mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
+                                   __func__, &rem4->addr.s_addr,
+                                   &addr->s_addr, id);
+                       rem4->addr.s_addr = addr->s_addr;
+                       rem4->port = port;
+                       mpcb->list_rcvd = 1;
+                       return;
+               }
+       }
+
+       i = mptcp_find_free_index(fmp->rem4_bits);
+       /* Do we already have the maximum number of remote addresses? */
+       if (i < 0) {
+               mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
+                           __func__, MPTCP_MAX_ADDR, &addr->s_addr);
+               return;
+       }
+
+       rem4 = &fmp->remaddr4[i];
+
+       /* Address is not known yet, store it */
+       rem4->addr.s_addr = addr->s_addr;
+       rem4->port = port;
+       rem4->bitfield = 0;
+       rem4->retry_bitfield = 0;
+       rem4->rem4_id = id;
+       mpcb->list_rcvd = 1;
+       fmp->rem4_bits |= (1 << i);
+}
+
+static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
+                             const struct in6_addr *addr,
+                             __be16 port, u8 id)
+{
+       int i;
+       struct fullmesh_rem6 *rem6;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+               rem6 = &fmp->remaddr6[i];
+
+               /* Address is already in the list --- nothing to do */
+               if (rem6->rem6_id == id &&
+                   ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
+                       return;
+
+               /* This may be the case when the peer is behind a NAT. It is
+                * trying to JOIN and thus sends the JOIN with a certain ID.
+                * However, the src-addr of the IP packet has been changed. We
+                * update the addr in the list, because this is the address as
+                * our box sees it.
+                */
+               if (rem6->rem6_id == id) {
+                       /* update the address */
+                       mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
+                                   __func__, &rem6->addr, addr, id);
+                       rem6->addr = *addr;
+                       rem6->port = port;
+                       mpcb->list_rcvd = 1;
+                       return;
+               }
+       }
+
+       i = mptcp_find_free_index(fmp->rem6_bits);
+       /* Do we already have the maximum number of remote addresses? */
+       if (i < 0) {
+               mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
+                           __func__, MPTCP_MAX_ADDR, addr);
+               return;
+       }
+
+       rem6 = &fmp->remaddr6[i];
+
+       /* Address is not known yet, store it */
+       rem6->addr = *addr;
+       rem6->port = port;
+       rem6->bitfield = 0;
+       rem6->retry_bitfield = 0;
+       rem6->rem6_id = id;
+       mpcb->list_rcvd = 1;
+       fmp->rem6_bits |= (1 << i);
+}
+
+static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
+{
+       int i;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+               if (fmp->remaddr4[i].rem4_id == id) {
+                       /* remove address from bitfield */
+                       fmp->rem4_bits &= ~(1 << i);
+
+                       break;
+               }
+       }
+}
+
+static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
+{
+       int i;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+               if (fmp->remaddr6[i].rem6_id == id) {
+                       /* remove address from bitfield */
+                       fmp->rem6_bits &= ~(1 << i);
+
+                       break;
+               }
+       }
+}
+
+/* Sets the bitfield of the remote-address field */
+static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
+                                      const struct in_addr *addr, u8 index)
+{
+       int i;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+               if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
+                       fmp->remaddr4[i].bitfield |= (1 << index);
+                       return;
+               }
+       }
+}
+
+/* Sets the bitfield of the remote-address field */
+static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
+                                      const struct in6_addr *addr, u8 index)
+{
+       int i;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+               if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
+                       fmp->remaddr6[i].bitfield |= (1 << index);
+                       return;
+               }
+       }
+}
+
+static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
+                                   const union inet_addr *addr,
+                                   sa_family_t family, u8 id)
+{
+       if (family == AF_INET)
+               mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
+       else
+               mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
+}
+
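+/* Create the additional (num_subflows - 1) subflows for this local/remote
+ * pair; the first subflow has already been created by the caller.
+ */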
+static void mptcp_v4_subflows(struct sock *meta_sk,
+                             const struct mptcp_loc4 *loc,
+                             struct mptcp_rem4 *rem)
+{
+       int i;
+
+       for (i = 1; i < num_subflows; i++)
+               mptcp_init4_subsockets(meta_sk, loc, rem);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void mptcp_v6_subflows(struct sock *meta_sk,
+                             const struct mptcp_loc6 *loc,
+                             struct mptcp_rem6 *rem)
+{
+       int i;
+
+       for (i = 1; i < num_subflows; i++)
+               mptcp_init6_subsockets(meta_sk, loc, rem);
+}
+#endif
+
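+/* Worker that retries the local/remote address combinations whose subflow
+ * creation previously failed with -ENETUNREACH (routing not ready yet).
+ * Like create_subflow_worker() below, it uses the "goto next_subflow"
+ * pattern to drop and re-take the meta-socket lock between subflows.
+ */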
+static void retry_subflow_worker(struct work_struct *work)
+{
+       struct delayed_work *delayed_work = container_of(work,
+                                                        struct delayed_work,
+                                                        work);
+       struct fullmesh_priv *fmp = container_of(delayed_work,
+                                                struct fullmesh_priv,
+                                                subflow_retry_work);
+       struct mptcp_cb *mpcb = fmp->mpcb;
+       struct sock *meta_sk = mpcb->meta_sk;
+       struct mptcp_loc_addr *mptcp_local;
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
+       int iter = 0, i;
+
+       /* We need a local (stable) copy of the address-list. It is not a big
+        * deal if the address-list is not 100% up-to-date.
+        */
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference_bh(fm_ns->local);
+       mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
+       rcu_read_unlock_bh();
+
+       if (!mptcp_local) {
+               sock_put(meta_sk); /* Mirror the sock_put() on the exit path */
+               return;
+       }
+
+next_subflow:
+       if (iter) {
+               release_sock(meta_sk);
+               mutex_unlock(&mpcb->mpcb_mutex);
+
+               cond_resched();
+       }
+       mutex_lock(&mpcb->mpcb_mutex);
+       lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
+
+       iter++;
+
+       if (sock_flag(meta_sk, SOCK_DEAD))
+               goto exit;
+
+       mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+               struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
+               /* Do we need to retry establishing a subflow? */
+               if (rem->retry_bitfield) {
+                       int i = mptcp_find_free_index(~rem->retry_bitfield);
+                       struct mptcp_rem4 rem4;
+
+                       rem->bitfield |= (1 << i);
+                       rem->retry_bitfield &= ~(1 << i);
+
+                       rem4.addr = rem->addr;
+                       rem4.port = rem->port;
+                       rem4.rem4_id = rem->rem4_id;
+
+                       mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
+                       mptcp_v4_subflows(meta_sk,
+                                         &mptcp_local->locaddr4[i],
+                                         &rem4);
+                       goto next_subflow;
+               }
+       }
+
+#if IS_ENABLED(CONFIG_IPV6)
+       mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+               struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
+
+               /* Do we need to retry establishing a subflow? */
+               if (rem->retry_bitfield) {
+                       int i = mptcp_find_free_index(~rem->retry_bitfield);
+                       struct mptcp_rem6 rem6;
+
+                       rem->bitfield |= (1 << i);
+                       rem->retry_bitfield &= ~(1 << i);
+
+                       rem6.addr = rem->addr;
+                       rem6.port = rem->port;
+                       rem6.rem6_id = rem->rem6_id;
+
+                       mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
+                       mptcp_v6_subflows(meta_sk,
+                                         &mptcp_local->locaddr6[i],
+                                         &rem6);
+                       goto next_subflow;
+               }
+       }
+#endif
+
+exit:
+       kfree(mptcp_local);
+       release_sock(meta_sk);
+       mutex_unlock(&mpcb->mpcb_mutex);
+       sock_put(meta_sk);
+}
+
+/**
+ * Create all new subflows by calling mptcp_initX_subsockets()
+ *
+ * This function uses a "goto next_subflow" to allow releasing the lock
+ * between new subflows, giving other processes a chance to do some work on
+ * the socket and potentially finish the communication.
+ **/
+static void create_subflow_worker(struct work_struct *work)
+{
+       struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
+                                                subflow_work);
+       struct mptcp_cb *mpcb = fmp->mpcb;
+       struct sock *meta_sk = mpcb->meta_sk;
+       struct mptcp_loc_addr *mptcp_local;
+       const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
+       int iter = 0, retry = 0;
+       int i;
+
+       /* We need a local (stable) copy of the address-list. It is not a big
+        * deal if the address-list is not 100% up-to-date.
+        */
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference_bh(fm_ns->local);
+       mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
+       rcu_read_unlock_bh();
+
+       if (!mptcp_local) {
+               sock_put(meta_sk); /* Mirror the sock_put() on the exit path */
+               return;
+       }
+
+next_subflow:
+       if (iter) {
+               release_sock(meta_sk);
+               mutex_unlock(&mpcb->mpcb_mutex);
+
+               cond_resched();
+       }
+       mutex_lock(&mpcb->mpcb_mutex);
+       lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
+
+       if (sock_flag(meta_sk, SOCK_DEAD))
+               goto exit;
+
+       if (mpcb->master_sk &&
+           !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
+               goto exit;
+
+       /* Create the additional subflows for the first pair */
+       if (fmp->first_pair == 0 && mpcb->master_sk) {
+               struct mptcp_loc4 loc;
+               struct mptcp_rem4 rem;
+
+               loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
+               loc.loc4_id = 0;
+               loc.low_prio = 0;
+               loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
+
+               rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
+               rem.port = inet_sk(meta_sk)->inet_dport;
+               rem.rem4_id = 0; /* Default 0 */
+
+               mptcp_v4_subflows(meta_sk, &loc, &rem);
+
+               fmp->first_pair = 1;
+       }
+       iter++;
+
+       mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+               struct fullmesh_rem4 *rem;
+               u8 remaining_bits;
+
+               rem = &fmp->remaddr4[i];
+               remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
+
+               /* Are there still combinations to handle? */
+               if (remaining_bits) {
+                       int i = mptcp_find_free_index(~remaining_bits);
+                       struct mptcp_rem4 rem4;
+
+                       rem->bitfield |= (1 << i);
+
+                       rem4.addr = rem->addr;
+                       rem4.port = rem->port;
+                       rem4.rem4_id = rem->rem4_id;
+
+                       /* If a route is not yet available then retry once */
+                       if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
+                                                  &rem4) == -ENETUNREACH)
+                               retry = rem->retry_bitfield |= (1 << i);
+                       else
+                               mptcp_v4_subflows(meta_sk,
+                                                 &mptcp_local->locaddr4[i],
+                                                 &rem4);
+                       goto next_subflow;
+               }
+       }
+
+#if IS_ENABLED(CONFIG_IPV6)
+       if (fmp->first_pair == 0 && mpcb->master_sk) {
+               struct mptcp_loc6 loc;
+               struct mptcp_rem6 rem;
+
+               loc.addr = inet6_sk(meta_sk)->saddr;
+               loc.loc6_id = 0;
+               loc.low_prio = 0;
+               loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
+
+               rem.addr = meta_sk->sk_v6_daddr;
+               rem.port = inet_sk(meta_sk)->inet_dport;
+               rem.rem6_id = 0; /* Default 0 */
+
+               mptcp_v6_subflows(meta_sk, &loc, &rem);
+
+               fmp->first_pair = 1;
+       }
+       mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+               struct fullmesh_rem6 *rem;
+               u8 remaining_bits;
+
+               rem = &fmp->remaddr6[i];
+               remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
+
+               /* Are there still combinations to handle? */
+               if (remaining_bits) {
+                       int i = mptcp_find_free_index(~remaining_bits);
+                       struct mptcp_rem6 rem6;
+
+                       rem->bitfield |= (1 << i);
+
+                       rem6.addr = rem->addr;
+                       rem6.port = rem->port;
+                       rem6.rem6_id = rem->rem6_id;
+
+                       /* If a route is not yet available then retry once */
+                       if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
+                                                  &rem6) == -ENETUNREACH)
+                               retry = rem->retry_bitfield |= (1 << i);
+                       else
+                               mptcp_v6_subflows(meta_sk,
+                                                 &mptcp_local->locaddr6[i],
+                                                 &rem6);
+                       goto next_subflow;
+               }
+       }
+#endif
+
+       if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
+               sock_hold(meta_sk);
+               queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
+                                  msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
+       }
+
+exit:
+       kfree(mptcp_local);
+       release_sock(meta_sk);
+       mutex_unlock(&mpcb->mpcb_mutex);
+       sock_put(meta_sk);
+}
+
+static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+       struct sock *sk = mptcp_select_ack_sock(meta_sk);
+
+       fmp->remove_addrs |= (1 << addr_id);
+       mpcb->addr_signal = 1;
+
+       if (sk)
+               tcp_send_ack(sk);
+}
+
+static void update_addr_bitfields(struct sock *meta_sk,
+                                 const struct mptcp_loc_addr *mptcp_local)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+       int i;
+
+       /* The bits in announced_addrs_* always match loc*_bits, so a simple
+        * & operation unsets exactly those bits whose addresses went from
+        * announced to non-announced.
+        */
+       fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
+
+       mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+               fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
+               fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
+       }
+
+       fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
+
+       mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+               fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
+               fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
+       }
+}
+
+static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
+                             sa_family_t family, const union inet_addr *addr,
+                             int if_idx)
+{
+       int i;
+       u8 loc_bits;
+       bool found = false;
+
+       if (family == AF_INET)
+               loc_bits = mptcp_local->loc4_bits;
+       else
+               loc_bits = mptcp_local->loc6_bits;
+
+       mptcp_for_each_bit_set(loc_bits, i) {
+               if (family == AF_INET &&
+                   (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) &&
+                   mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
+                       found = true;
+                       break;
+               }
+               if (family == AF_INET6 &&
+                   (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) &&
+                   ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
+                                   &addr->in6)) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found)
+               return -1;
+
+       return i;
+}
+
+static int mptcp_find_address_transp(const struct mptcp_loc_addr *mptcp_local,
+                                    sa_family_t family, int if_idx)
+{
+       bool found = false;
+       u8 loc_bits;
+       int i;
+
+       if (family == AF_INET)
+               loc_bits = mptcp_local->loc4_bits;
+       else
+               loc_bits = mptcp_local->loc6_bits;
+
+       mptcp_for_each_bit_set(loc_bits, i) {
+               if (family == AF_INET &&
+                   (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx)) {
+                       found = true;
+                       break;
+               }
+               if (family == AF_INET6 &&
+                   (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx)) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found)
+               return -1;
+
+       return i;
+}
+
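+/* Deferred handler for queued address events.  For each event it first
+ * updates the per-namespace address-list via an RCU copy-on-write of
+ * fm_ns->local, then walks the token hashtable and applies the
+ * ADD/DEL/MOD event to every MPTCP socket of this namespace.
+ */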
+static void mptcp_address_worker(struct work_struct *work)
+{
+       const struct delayed_work *delayed_work = container_of(work,
+                                                        struct delayed_work,
+                                                        work);
+       struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
+                                                struct mptcp_fm_ns,
+                                                address_worker);
+       struct net *net = fm_ns->net;
+       struct mptcp_addr_event *event = NULL;
+       struct mptcp_loc_addr *mptcp_local, *old;
+       int i, id = -1; /* id is used in the socket-code on a delete-event */
+       bool success; /* Used to indicate if we succeeded handling the event */
+
+next_event:
+       success = false;
+       kfree(event);
+
+       /* First, let's dequeue an event from our event-list */
+       rcu_read_lock_bh();
+       spin_lock(&fm_ns->local_lock);
+
+       event = list_first_entry_or_null(&fm_ns->events,
+                                        struct mptcp_addr_event, list);
+       if (!event) {
+               spin_unlock(&fm_ns->local_lock);
+               rcu_read_unlock_bh();
+               return;
+       }
+
+       list_del(&event->list);
+
+       mptcp_local = rcu_dereference_bh(fm_ns->local);
+
+       if (event->code == MPTCP_EVENT_DEL) {
+               id = mptcp_find_address(mptcp_local, event->family,
+                                       &event->addr, event->if_idx);
+
+               /* Not in the list - so we don't care */
+               if (id < 0) {
+                       mptcp_debug("%s could not find id\n", __func__);
+                       goto duno;
+               }
+
+               old = mptcp_local;
+               mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
+                                     GFP_ATOMIC);
+               if (!mptcp_local)
+                       goto duno;
+
+               if (event->family == AF_INET)
+                       mptcp_local->loc4_bits &= ~(1 << id);
+               else
+                       mptcp_local->loc6_bits &= ~(1 << id);
+
+               rcu_assign_pointer(fm_ns->local, mptcp_local);
+               kfree_rcu(old, rcu);
+       } else {
+               int i = mptcp_find_address(mptcp_local, event->family,
+                                          &event->addr, event->if_idx);
+               int j = i;
+
+               if (j < 0) {
+                       /* Not in the list, so we have to find an empty slot */
+                       if (event->family == AF_INET)
+                               i = __mptcp_find_free_index(mptcp_local->loc4_bits,
+                                                           mptcp_local->next_v4_index);
+                       if (event->family == AF_INET6)
+                               i = __mptcp_find_free_index(mptcp_local->loc6_bits,
+                                                           mptcp_local->next_v6_index);
+
+                       if (i < 0) {
+                               mptcp_debug("%s no more space\n", __func__);
+                               goto duno;
+                       }
+
+                       /* It might have been a MOD-event. */
+                       event->code = MPTCP_EVENT_ADD;
+               } else {
+                       /* Let's check if anything changed */
+                       if (event->family == AF_INET &&
+                           event->low_prio == mptcp_local->locaddr4[i].low_prio)
+                               goto duno;
+
+                       if (event->family == AF_INET6 &&
+                           event->low_prio == mptcp_local->locaddr6[i].low_prio)
+                               goto duno;
+               }
+
+               old = mptcp_local;
+               mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
+                                     GFP_ATOMIC);
+               if (!mptcp_local)
+                       goto duno;
+
+               if (event->family == AF_INET) {
+                       mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
+                       mptcp_local->locaddr4[i].loc4_id = i + 1;
+                       mptcp_local->locaddr4[i].low_prio = event->low_prio;
+                       mptcp_local->locaddr4[i].if_idx = event->if_idx;
+               } else {
+                       mptcp_local->locaddr6[i].addr = event->addr.in6;
+                       mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
+                       mptcp_local->locaddr6[i].low_prio = event->low_prio;
+                       mptcp_local->locaddr6[i].if_idx = event->if_idx;
+               }
+
+               if (j < 0) {
+                       if (event->family == AF_INET) {
+                               mptcp_local->loc4_bits |= (1 << i);
+                               mptcp_local->next_v4_index = i + 1;
+                       } else {
+                               mptcp_local->loc6_bits |= (1 << i);
+                               mptcp_local->next_v6_index = i + 1;
+                       }
+               }
+
+               rcu_assign_pointer(fm_ns->local, mptcp_local);
+               kfree_rcu(old, rcu);
+       }
+       success = true;
+
+duno:
+       spin_unlock(&fm_ns->local_lock);
+       rcu_read_unlock_bh();
+
+       if (!success)
+               goto next_event;
+
+       /* Now we iterate over the MPTCP-sockets and apply the event. */
+       for (i = 0; i < MPTCP_HASH_SIZE; i++) {
+               const struct hlist_nulls_node *node;
+               struct tcp_sock *meta_tp;
+
+               rcu_read_lock_bh();
+               hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
+                                              tk_table) {
+                       struct mptcp_cb *mpcb = meta_tp->mpcb;
+                       struct sock *meta_sk = (struct sock *)meta_tp, *sk;
+                       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+                       bool meta_v4 = meta_sk->sk_family == AF_INET;
+
+                       if (sock_net(meta_sk) != net)
+                               continue;
+
+                       if (meta_v4) {
+                               /* skip IPv6 events if meta is IPv4 */
+                               if (event->family == AF_INET6)
+                                       continue;
+                       } else if (event->family == AF_INET &&
+                                  meta_sk->sk_ipv6only) {
+                               /* skip IPv4 events if IPV6_V6ONLY is set */
+                               continue;
+                       }
+
+                       if (unlikely(!refcount_inc_not_zero(&meta_sk->sk_refcnt)))
+                               continue;
+
+                       bh_lock_sock(meta_sk);
+
+                       if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
+                           mpcb->infinite_mapping_snd ||
+                           mpcb->infinite_mapping_rcv ||
+                           mpcb->send_infinite_mapping)
+                               goto next;
+
+                       /* The path-manager may have changed in the meantime */
+                       if (mpcb->pm_ops != &full_mesh)
+                               goto next;
+
+                       if (sock_owned_by_user(meta_sk)) {
+                               if (!test_and_set_bit(MPTCP_PATH_MANAGER_DEFERRED,
+                                                     &meta_sk->sk_tsq_flags))
+                                       sock_hold(meta_sk);
+
+                               goto next;
+                       }
+
+                       if (event->code == MPTCP_EVENT_ADD) {
+                               fmp->add_addr++;
+                               mpcb->addr_signal = 1;
+
+                               sk = mptcp_select_ack_sock(meta_sk);
+                               if (sk)
+                                       tcp_send_ack(sk);
+
+                               full_mesh_create_subflows(meta_sk);
+                       }
+
+                       if (event->code == MPTCP_EVENT_DEL) {
+                               struct sock *sk, *tmpsk;
+                               struct mptcp_loc_addr *mptcp_local;
+                               bool found = false;
+
+                               mptcp_local = rcu_dereference_bh(fm_ns->local);
+
+                               /* In any case, we need to update our bitfields */
+                               if (id >= 0)
+                                       update_addr_bitfields(meta_sk, mptcp_local);
+
+                               /* Look for the matching subflows and remove them */
+                               mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
+                                       if ((event->family == AF_INET6 &&
+                                            (sk->sk_family == AF_INET ||
+                                             mptcp_v6_is_v4_mapped(sk))) ||
+                                           (event->family == AF_INET &&
+                                            (sk->sk_family == AF_INET6 &&
+                                             !mptcp_v6_is_v4_mapped(sk))))
+                                               continue;
+
+                                       if (event->family == AF_INET &&
+                                           (sk->sk_family == AF_INET ||
+                                            mptcp_v6_is_v4_mapped(sk)) &&
+                                            inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
+                                               continue;
+
+                                       if (event->family == AF_INET6 &&
+                                           sk->sk_family == AF_INET6 &&
+                                           !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
+                                               continue;
+
+                                       /* Reinject, so that pf = 1 and so we
+                                        * won't select this one as the
+                                        * ack-sock.
+                                        */
+                                       mptcp_reinject_data(sk, 0);
+
+                                       /* We announce the removal of this id */
+                                       announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
+
+                                       mptcp_sub_force_close(sk);
+                                       found = true;
+                               }
+
+                               if (found)
+                                       goto next;
+
+                               /* The id may have been given by the event,
+                                * matching on a local address. It may not have
+                                * matched any of the above sockets, because
+                                * the client never created a subflow. So, we
+                                * finally announce the removal here.
+                                */
+                               if (id > 0)
+                                       announce_remove_addr(id, meta_sk);
+                       }
+
+                       if (event->code == MPTCP_EVENT_MOD) {
+                               struct sock *sk;
+
+                               mptcp_for_each_sk(mpcb, sk) {
+                                       struct tcp_sock *tp = tcp_sk(sk);
+
+                                       if (event->family == AF_INET &&
+                                           (sk->sk_family == AF_INET ||
+                                            mptcp_v6_is_v4_mapped(sk)) &&
+                                            inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
+                                               if (event->low_prio != tp->mptcp->low_prio) {
+                                                       tp->mptcp->send_mp_prio = 1;
+                                                       tp->mptcp->low_prio = event->low_prio;
+
+                                                       tcp_send_ack(sk);
+                                               }
+                                       }
+
+                                       if (event->family == AF_INET6 &&
+                                           sk->sk_family == AF_INET6 &&
+                                           !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
+                                               if (event->low_prio != tp->mptcp->low_prio) {
+                                                       tp->mptcp->send_mp_prio = 1;
+                                                       tp->mptcp->low_prio = event->low_prio;
+
+                                                       tcp_send_ack(sk);
+                                               }
+                                       }
+                               }
+                       }
+next:
+                       bh_unlock_sock(meta_sk);
+                       sock_put(meta_sk);
+               }
+               rcu_read_unlock_bh();
+       }
+       goto next_event;
+}
+
+static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
+                                                    const struct mptcp_addr_event *event)
+{
+       struct mptcp_addr_event *eventq;
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
+
+       list_for_each_entry(eventq, &fm_ns->events, list) {
+               if (eventq->family != event->family)
+                       continue;
+               if (event->family == AF_INET) {
+                       if (eventq->addr.in.s_addr == event->addr.in.s_addr)
+                               return eventq;
+               } else {
+                       if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
+                               return eventq;
+               }
+       }
+       return NULL;
+}
+
+/* We already hold the net-namespace MPTCP-lock */
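+/* Queue an event for the address worker.  Events for the same address are
+ * coalesced: a DEL replaces a queued event, while an ADD or MOD updates the
+ * queued event's code and priority in place.
+ */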
+static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
+{
+       struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
+
+       if (eventq) {
+               switch (event->code) {
+               case MPTCP_EVENT_DEL:
+                       mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
+                       list_del(&eventq->list);
+                       kfree(eventq);
+                       break;
+               case MPTCP_EVENT_ADD:
+                       mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
+                       eventq->low_prio = event->low_prio;
+                       eventq->code = MPTCP_EVENT_ADD;
+                       return;
+               case MPTCP_EVENT_MOD:
+                       mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
+                       eventq->low_prio = event->low_prio;
+                       eventq->code = MPTCP_EVENT_MOD;
+                       return;
+               }
+       }
+
+       /* OK, we have to add the new event to the event-list */
+       eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
+       if (!eventq)
+               return;
+
+       list_add_tail(&eventq->list, &fm_ns->events);
+
+       /* Schedule the address worker if it is not already pending */
+       if (!delayed_work_pending(&fm_ns->address_worker))
+               queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
+                                  msecs_to_jiffies(500));
+}
+
+static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
+                               struct net *net)
+{
+       const struct net_device *netdev = ifa->ifa_dev->dev;
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
+       struct mptcp_addr_event mpevent;
+
+       if (ifa->ifa_scope > RT_SCOPE_LINK ||
+           ipv4_is_loopback(ifa->ifa_local))
+               return;
+
+       spin_lock_bh(&fm_ns->local_lock);
+
+       mpevent.family = AF_INET;
+       mpevent.addr.in.s_addr = ifa->ifa_local;
+       mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
+       mpevent.if_idx  = netdev->ifindex;
+
+       if (event == NETDEV_DOWN || !netif_running(netdev) ||
+           (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
+               mpevent.code = MPTCP_EVENT_DEL;
+       else if (event == NETDEV_UP)
+               mpevent.code = MPTCP_EVENT_ADD;
+       else if (event == NETDEV_CHANGE)
+               mpevent.code = MPTCP_EVENT_MOD;
+
+       mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__,
+                   &ifa->ifa_local, mpevent.code, mpevent.low_prio);
+       add_pm_event(net, &mpevent);
+
+       spin_unlock_bh(&fm_ns->local_lock);
+}
+
+/* React on IPv4-addr add/rem-events */
+static int mptcp_pm_inetaddr_event(struct notifier_block *this,
+                                  unsigned long event, void *ptr)
+{
+       const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+       struct net *net = dev_net(ifa->ifa_dev->dev);
+
+       if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
+             event == NETDEV_CHANGE))
+               return NOTIFY_DONE;
+
+       addr4_event_handler(ifa, event, net);
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block mptcp_pm_inetaddr_notifier = {
+               .notifier_call = mptcp_pm_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+/* IPV6-related address/interface watchers */
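+/* IPv6 addresses must not be announced before Duplicate Address Detection
+ * (DAD) has finished.  The timer below re-checks the address periodically
+ * and replays the NETDEV_UP event once DAD completes.
+ */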
+struct mptcp_dad_data {
+       struct timer_list timer;
+       struct inet6_ifaddr *ifa;
+};
+
+static void dad_callback(unsigned long arg);
+static int inet6_addr_event(struct notifier_block *this,
+                                    unsigned long event, void *ptr);
+
+static bool ipv6_dad_finished(const struct inet6_ifaddr *ifa)
+{
+       return !(ifa->flags & IFA_F_TENTATIVE) ||
+              ifa->state > INET6_IFADDR_STATE_DAD;
+}
+
+static void dad_init_timer(struct mptcp_dad_data *data,
+                                struct inet6_ifaddr *ifa)
+{
+       data->ifa = ifa;
+       data->timer.data = (unsigned long)data;
+       data->timer.function = dad_callback;
+       if (ifa->idev->cnf.rtr_solicit_delay)
+               data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
+       else
+               data->timer.expires = jiffies + (HZ/10);
+}
+
+static void dad_callback(unsigned long arg)
+{
+       struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
+
+       /* DAD failed or IP brought down? */
+       if (data->ifa->state == INET6_IFADDR_STATE_ERRDAD ||
+           data->ifa->state == INET6_IFADDR_STATE_DEAD)
+               goto exit;
+
+       if (!ipv6_dad_finished(data->ifa)) {
+               dad_init_timer(data, data->ifa);
+               add_timer(&data->timer);
+               return;
+       }
+
+       inet6_addr_event(NULL, NETDEV_UP, data->ifa);
+
+exit:
+       in6_ifa_put(data->ifa);
+       kfree(data);
+}
+
+static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
+{
+       struct mptcp_dad_data *data;
+
+       data = kmalloc(sizeof(*data), GFP_ATOMIC);
+
+       if (!data)
+               return;
+
+       init_timer(&data->timer);
+       dad_init_timer(data, ifa);
+       /* Hold the address before arming the timer: dad_callback() drops
+        * this reference when it is done.
+        */
+       in6_ifa_hold(ifa);
+       add_timer(&data->timer);
+}
+
+static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
+                               struct net *net)
+{
+       const struct net_device *netdev = ifa->idev->dev;
+       int addr_type = ipv6_addr_type(&ifa->addr);
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
+       struct mptcp_addr_event mpevent;
+
+       if (ifa->scope > RT_SCOPE_LINK ||
+           addr_type == IPV6_ADDR_ANY ||
+           (addr_type & IPV6_ADDR_LOOPBACK) ||
+           (addr_type & IPV6_ADDR_LINKLOCAL))
+               return;
+
+       spin_lock_bh(&fm_ns->local_lock);
+
+       mpevent.family = AF_INET6;
+       mpevent.addr.in6 = ifa->addr;
+       mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
+       mpevent.if_idx = netdev->ifindex;
+
+       if (event == NETDEV_DOWN || !netif_running(netdev) ||
+           (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
+               mpevent.code = MPTCP_EVENT_DEL;
+       else if (event == NETDEV_UP)
+               mpevent.code = MPTCP_EVENT_ADD;
+       else if (event == NETDEV_CHANGE)
+               mpevent.code = MPTCP_EVENT_MOD;
+
+       mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__,
+                   &ifa->addr, mpevent.code, mpevent.low_prio);
+       add_pm_event(net, &mpevent);
+
+       spin_unlock_bh(&fm_ns->local_lock);
+}
+
+/* React to IPv6 address add/remove events */
+static int inet6_addr_event(struct notifier_block *this, unsigned long event,
+                           void *ptr)
+{
+       struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
+       struct net *net = dev_net(ifa6->idev->dev);
+
+       if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
+             event == NETDEV_CHANGE))
+               return NOTIFY_DONE;
+
+       if (!ipv6_dad_finished(ifa6))
+               dad_setup_timer(ifa6);
+       else
+               addr6_event_handler(ifa6, event, net);
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block inet6_addr_notifier = {
+               .notifier_call = inet6_addr_event,
+};
+
+#endif
+
+/* React to ifup/down events */
+static int netdev_event(struct notifier_block *this, unsigned long event,
+                       void *ptr)
+{
+       const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       struct in_device *in_dev;
+#if IS_ENABLED(CONFIG_IPV6)
+       struct inet6_dev *in6_dev;
+#endif
+
+       if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
+             event == NETDEV_CHANGE))
+               return NOTIFY_DONE;
+
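+       /* Replay the event for every IPv4/IPv6 address configured on the
+        * device, so the per-address handlers above see the change.
+        */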
+       rcu_read_lock();
+       in_dev = __in_dev_get_rtnl(dev);
+
+       if (in_dev) {
+               for_ifa(in_dev) {
+                       mptcp_pm_inetaddr_event(NULL, event, ifa);
+               } endfor_ifa(in_dev);
+       }
+
+#if IS_ENABLED(CONFIG_IPV6)
+       in6_dev = __in6_dev_get(dev);
+
+       if (in6_dev) {
+               struct inet6_ifaddr *ifa6;
+
+               list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
+                       inet6_addr_event(NULL, event, ifa6);
+       }
+#endif
+
+       rcu_read_unlock();
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block mptcp_pm_netdev_notifier = {
+               .notifier_call = netdev_event,
+};
+
+static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
+                               const union inet_addr *addr,
+                               sa_family_t family, __be16 port, u8 id)
+{
+       if (family == AF_INET)
+               mptcp_addv4_raddr(mpcb, &addr->in, port, id);
+       else
+               mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
+}
+
+static void full_mesh_new_session(const struct sock *meta_sk)
+{
+       struct mptcp_loc_addr *mptcp_local;
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+       const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
+       struct tcp_sock *master_tp = tcp_sk(mpcb->master_sk);
+       int i, index, if_idx;
+       union inet_addr saddr, daddr;
+       sa_family_t family;
+       bool meta_v4 = meta_sk->sk_family == AF_INET;
+
+       /* Init local variables necessary for the rest */
+       if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
+               saddr.ip = inet_sk(meta_sk)->inet_saddr;
+               daddr.ip = inet_sk(meta_sk)->inet_daddr;
+               if_idx = mpcb->master_sk->sk_bound_dev_if;
+               family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+       } else {
+               saddr.in6 = inet6_sk(meta_sk)->saddr;
+               daddr.in6 = meta_sk->sk_v6_daddr;
+               if_idx = mpcb->master_sk->sk_bound_dev_if;
+               family = AF_INET6;
+#endif
+       }
+
+       if (inet_sk(meta_sk)->transparent)
+               if_idx = inet_sk(meta_sk)->rx_dst_ifindex;
+
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference(fm_ns->local);
+
+       if (inet_sk(meta_sk)->transparent)
+               index = mptcp_find_address_transp(mptcp_local, family, if_idx);
+       else
+               index = mptcp_find_address(mptcp_local, family, &saddr, if_idx);
+       if (index < 0)
+               goto fallback;
+
+       if (family == AF_INET)
+               master_tp->mptcp->low_prio = mptcp_local->locaddr4[index].low_prio;
+       else
+               master_tp->mptcp->low_prio = mptcp_local->locaddr6[index].low_prio;
+       master_tp->mptcp->send_mp_prio = master_tp->mptcp->low_prio;
+
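+       /* Register the peer's address of the initial subflow and remember
+        * which local address (index) it is paired with.
+        */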
+       full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
+       mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
+
+       /* Initialize workqueue-struct */
+       INIT_WORK(&fmp->subflow_work, create_subflow_worker);
+       INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
+       fmp->mpcb = mpcb;
+
+       if (!meta_v4 && meta_sk->sk_ipv6only)
+               goto skip_ipv4;
+
+       /* Look for the address among the local addresses */
+       mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
+               __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
+
+               /* We do not need to announce the initial subflow's address again */
+               if (family == AF_INET &&
+                   (!if_idx || mptcp_local->locaddr4[i].if_idx == if_idx) &&
+                   saddr.ip == ifa_address)
+                       continue;
+
+               fmp->add_addr++;
+               mpcb->addr_signal = 1;
+       }
+
+skip_ipv4:
+#if IS_ENABLED(CONFIG_IPV6)
+       /* skip IPv6 addresses if meta-socket is IPv4 */
+       if (meta_v4)
+               goto skip_ipv6;
+
+       mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
+               const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
+
+               /* We do not need to announce the initial subflow's address again */
+               if (family == AF_INET6 &&
+                   (!if_idx || mptcp_local->locaddr6[i].if_idx == if_idx) &&
+                   ipv6_addr_equal(&saddr.in6, ifa6))
+                       continue;
+
+               fmp->add_addr++;
+               mpcb->addr_signal = 1;
+       }
+
+skip_ipv6:
+#endif
+
+       rcu_read_unlock_bh();
+
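+       /* The initial subflow's local address needs no ADD_ADDR - mark it
+        * as already announced.
+        */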
+       if (family == AF_INET)
+               fmp->announced_addrs_v4 |= (1 << index);
+       else
+               fmp->announced_addrs_v6 |= (1 << index);
+
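+       /* addr_signal places at most one ADD_ADDR per packet - trigger one
+        * ACK for each address that still has to be announced.
+        */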
+       for (i = fmp->add_addr; i && fmp->add_addr; i--)
+               tcp_send_ack(mpcb->master_sk);
+
+       if (master_tp->mptcp->send_mp_prio)
+               tcp_send_ack(mpcb->master_sk);
+
+       return;
+
+fallback:
+       rcu_read_unlock_bh();
+       mptcp_fallback_default(mpcb);
+}
+
+static void full_mesh_create_subflows(struct sock *meta_sk)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+
+       if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
+           mpcb->send_infinite_mapping ||
+           mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
+               return;
+
+       if (mpcb->master_sk &&
+           !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
+               return;
+
+       if (!work_pending(&fmp->subflow_work)) {
+               sock_hold(meta_sk);
+               queue_work(mptcp_wq, &fmp->subflow_work);
+       }
+}
+
+static void full_mesh_subflow_error(struct sock *meta_sk, struct sock *sk)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+
+       if (!create_on_err)
+               return;
+
+       if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
+           mpcb->send_infinite_mapping ||
+           mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
+               return;
+
+       if (sk->sk_err != ETIMEDOUT)
+               return;
+
+       full_mesh_create_subflows(meta_sk);
+}
+
+/* Called upon release_sock, if the socket was owned by the user during
+ * a path-management event.
+ */
+static void full_mesh_release_sock(struct sock *meta_sk)
+{
+       struct mptcp_loc_addr *mptcp_local;
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+       const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
+       struct sock *sk, *tmpsk;
+       bool meta_v4 = meta_sk->sk_family == AF_INET;
+       int i;
+
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference(fm_ns->local);
+
+       if (!meta_v4 && meta_sk->sk_ipv6only)
+               goto skip_ipv4;
+
+       /* First, detect modifications or additions */
+       mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
+               struct in_addr ifa = mptcp_local->locaddr4[i].addr;
+               bool found = false;
+
+               mptcp_for_each_sk(mpcb, sk) {
+                       struct tcp_sock *tp = tcp_sk(sk);
+
+                       if (sk->sk_family == AF_INET6 &&
+                           !mptcp_v6_is_v4_mapped(sk))
+                               continue;
+
+                       if (inet_sk(sk)->inet_saddr != ifa.s_addr)
+                               continue;
+
+                       found = true;
+
+                       if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
+                               tp->mptcp->send_mp_prio = 1;
+                               tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
+
+                               tcp_send_ack(sk);
+                       }
+               }
+
+               if (!found) {
+                       fmp->add_addr++;
+                       mpcb->addr_signal = 1;
+
+                       sk = mptcp_select_ack_sock(meta_sk);
+                       if (sk)
+                               tcp_send_ack(sk);
+                       full_mesh_create_subflows(meta_sk);
+               }
+       }
+
+skip_ipv4:
+#if IS_ENABLED(CONFIG_IPV6)
+       /* skip IPv6 addresses if meta-socket is IPv4 */
+       if (meta_v4)
+               goto removal;
+
+       mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
+               struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
+               bool found = false;
+
+               mptcp_for_each_sk(mpcb, sk) {
+                       struct tcp_sock *tp = tcp_sk(sk);
+
+                       if (sk->sk_family == AF_INET ||
+                           mptcp_v6_is_v4_mapped(sk))
+                               continue;
+
+                       if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
+                               continue;
+
+                       found = true;
+
+                       if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
+                               tp->mptcp->send_mp_prio = 1;
+                               tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
+
+                               tcp_send_ack(sk);
+                       }
+               }
+
+               if (!found) {
+                       fmp->add_addr++;
+                       mpcb->addr_signal = 1;
+
+                       sk = mptcp_select_ack_sock(meta_sk);
+                       if (sk)
+                               tcp_send_ack(sk);
+                       full_mesh_create_subflows(meta_sk);
+               }
+       }
+
+removal:
+#endif
+
+       /* Now, detect address-removals */
+       mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
+               bool shall_remove = true;
+
+               if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
+                       mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
+                               if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
+                                       shall_remove = false;
+                                       break;
+                               }
+                       }
+               } else {
+                       mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
+                               if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
+                                       shall_remove = false;
+                                       break;
+                               }
+                       }
+               }
+
+               if (shall_remove) {
+                       /* Reinject, so that pf = 1 and we won't select
+                        * this one as the ack-sock.
+                        */
+                       mptcp_reinject_data(sk, 0);
+
+                       announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
+                                            meta_sk);
+
+                       mptcp_sub_force_close(sk);
+               }
+       }
+
+       /* Just call it optimistically. It actually cannot do any harm */
+       update_addr_bitfields(meta_sk, mptcp_local);
+
+       rcu_read_unlock_bh();
+}
+
+static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
+                                 struct net *net, bool *low_prio)
+{
+       struct mptcp_loc_addr *mptcp_local;
+       const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
+       int index, id = -1;
+
+       /* Look up the address and report its backup-flow (low_prio) flag */
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference(fm_ns->local);
+
+       index = mptcp_find_address(mptcp_local, family, addr, 0);
+
+       if (index != -1) {
+               if (family == AF_INET) {
+                       id = mptcp_local->locaddr4[index].loc4_id;
+                       *low_prio = mptcp_local->locaddr4[index].low_prio;
+               } else {
+                       id = mptcp_local->locaddr6[index].loc6_id;
+                       *low_prio = mptcp_local->locaddr6[index].low_prio;
+               }
+       }
+
+       rcu_read_unlock_bh();
+
+       return id;
+}
+
+static void full_mesh_addr_signal(struct sock *sk, unsigned int *size,
+                                 struct tcp_out_options *opts,
+                                 struct sk_buff *skb)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_cb *mpcb = tp->mpcb;
+       struct sock *meta_sk = mpcb->meta_sk;
+       struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
+       struct mptcp_loc_addr *mptcp_local;
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
+       int remove_addr_len;
+       u8 unannouncedv4 = 0, unannouncedv6 = 0;
+       bool meta_v4 = meta_sk->sk_family == AF_INET;
+
+       mpcb->addr_signal = 0;
+
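+       /* No ADD_ADDR pending? Then only REMOVE_ADDRs may need to be sent. */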
+       if (likely(!fmp->add_addr))
+               goto remove_addr;
+
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference(fm_ns->local);
+
+       if (!meta_v4 && meta_sk->sk_ipv6only)
+               goto skip_ipv4;
+
+       /* IPv4 */
+       unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
+       if (unannouncedv4 &&
+           ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
+           MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) ||
+           (mpcb->mptcp_ver >= MPTCP_VERSION_1 &&
+           MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1))) {
+               int ind = mptcp_find_free_index(~unannouncedv4);
+
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_ADD_ADDR;
+               opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
+               opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
+               opts->add_addr_v4 = 1;
+               if (mpcb->mptcp_ver >= MPTCP_VERSION_1) {
+                       u8 mptcp_hash_mac[20];
+                       u8 no_key[8];
+
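+                       /* Truncated HMAC over the address ID and the
+                        * address, keyed with the local key and a zeroed
+                        * second key.
+                        */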
+                       *(u64 *)no_key = 0;
+                       mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
+                                       (u8 *)no_key,
+                                       (u32 *)mptcp_hash_mac, 2,
+                                       1, (u8 *)&mptcp_local->locaddr4[ind].loc4_id,
+                                       4, (u8 *)&opts->add_addr4.addr.s_addr);
+                       opts->add_addr4.trunc_mac = *(u64 *)mptcp_hash_mac;
+               }
+
+               if (skb) {
+                       fmp->announced_addrs_v4 |= (1 << ind);
+                       fmp->add_addr--;
+               }
+
+               if (mpcb->mptcp_ver < MPTCP_VERSION_1)
+                       *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
+               if (mpcb->mptcp_ver >= MPTCP_VERSION_1)
+                       *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1;
+
+               goto skip_ipv6;
+       }
+
+       if (meta_v4)
+               goto skip_ipv6;
+skip_ipv4:
+       /* IPv6 */
+       unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
+       if (unannouncedv6 &&
+           ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
+           MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) ||
+           (mpcb->mptcp_ver >= MPTCP_VERSION_1 &&
+           MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1))) {
+               int ind = mptcp_find_free_index(~unannouncedv6);
+
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_ADD_ADDR;
+               opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
+               opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
+               opts->add_addr_v6 = 1;
+               if (mpcb->mptcp_ver >= MPTCP_VERSION_1) {
+                       u8 mptcp_hash_mac[20];
+                       u8 no_key[8];
+
+                       *(u64 *)no_key = 0;
+                       mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
+                                       (u8 *)no_key,
+                                       (u32 *)mptcp_hash_mac, 2,
+                                       1, (u8 *)&mptcp_local->locaddr6[ind].loc6_id,
+                                       16, (u8 *)&opts->add_addr6.addr.s6_addr);
+                       opts->add_addr6.trunc_mac = *(u64 *)mptcp_hash_mac;
+               }
+
+               if (skb) {
+                       fmp->announced_addrs_v6 |= (1 << ind);
+                       fmp->add_addr--;
+               }
+               if (mpcb->mptcp_ver < MPTCP_VERSION_1)
+                       *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
+               if (mpcb->mptcp_ver >= MPTCP_VERSION_1)
+                       *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1;
+       }
+
+skip_ipv6:
+       rcu_read_unlock_bh();
+
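+       /* Nothing could be announced although add_addr was set - consume one
+        * credit so that we do not keep requesting option space forever.
+        */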
+       if (!unannouncedv4 && !unannouncedv6 && skb)
+               fmp->add_addr--;
+
+remove_addr:
+       if (likely(!fmp->remove_addrs))
+               goto exit;
+
+       remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
+       if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
+               goto exit;
+
+       opts->options |= OPTION_MPTCP;
+       opts->mptcp_options |= OPTION_REMOVE_ADDR;
+       opts->remove_addrs = fmp->remove_addrs;
+       *size += remove_addr_len;
+       if (skb)
+               fmp->remove_addrs = 0;
+
+exit:
+       mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
+}
+
+static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
+{
+       mptcp_v4_rem_raddress(mpcb, rem_id);
+       mptcp_v6_rem_raddress(mpcb, rem_id);
+}
+
+static void full_mesh_delete_subflow(struct sock *sk)
+{
+       struct fullmesh_priv *fmp = fullmesh_get_priv(tcp_sk(sk)->mpcb);
+       struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
+       struct mptcp_loc_addr *mptcp_local;
+       int index, i;
+
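+       /* Clearing the pair's bit below only matters if lost subflows may be
+        * re-created after an error (create_on_err).
+        */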
+       if (!create_on_err)
+               return;
+
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference_bh(fm_ns->local);
+
+       if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
+               union inet_addr saddr;
+
+               saddr.ip = inet_sk(sk)->inet_saddr;
+               index = mptcp_find_address(mptcp_local, AF_INET, &saddr,
+                                          sk->sk_bound_dev_if);
+               if (index < 0)
+                       goto out;
+
+               mptcp_for_each_bit_set(fmp->rem4_bits, i) {
+                       struct fullmesh_rem4 *rem4 = &fmp->remaddr4[i];
+
+                       if (rem4->addr.s_addr != sk->sk_daddr)
+                               continue;
+
+                       if (rem4->port && rem4->port != inet_sk(sk)->inet_dport)
+                               continue;
+
+                       rem4->bitfield &= ~(1 << index);
+               }
+#if IS_ENABLED(CONFIG_IPV6)
+       } else {
+               union inet_addr saddr;
+
+               saddr.in6 = inet6_sk(sk)->saddr;
+               index = mptcp_find_address(mptcp_local, AF_INET6, &saddr,
+                                          sk->sk_bound_dev_if);
+               if (index < 0)
+                       goto out;
+
+               mptcp_for_each_bit_set(fmp->rem6_bits, i) {
+                       struct fullmesh_rem6 *rem6 = &fmp->remaddr6[i];
+
+                       if (!ipv6_addr_equal(&rem6->addr, &sk->sk_v6_daddr))
+                               continue;
+
+                       if (rem6->port && rem6->port != inet_sk(sk)->inet_dport)
+                               continue;
+
+                       rem6->bitfield &= ~(1 << index);
+               }
+#endif
+       }
+
+out:
+       rcu_read_unlock_bh();
+}
+
+/* Output /proc/net/mptcp_fullmesh */
+static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
+{
+       const struct net *net = seq->private;
+       struct mptcp_loc_addr *mptcp_local;
+       const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
+       int i;
+
+       seq_printf(seq, "Index, Address-ID, Backup, IP-address\n");
+
+       rcu_read_lock_bh();
+       mptcp_local = rcu_dereference(fm_ns->local);
+
+       seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
+
+       mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
+               struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
+
+               seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id,
+                          loc4->low_prio, &loc4->addr);
+       }
+
+       seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
+
+       mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
+               struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
+
+               seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id,
+                          loc6->low_prio, &loc6->addr);
+       }
+       rcu_read_unlock_bh();
+
+       return 0;
+}
+
+static int mptcp_fm_seq_open(struct inode *inode, struct file *file)
+{
+       return single_open_net(inode, file, mptcp_fm_seq_show);
+}
+
+static const struct file_operations mptcp_fm_seq_fops = {
+       .owner = THIS_MODULE,
+       .open = mptcp_fm_seq_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release_net,
+};
+
+static int mptcp_fm_init_net(struct net *net)
+{
+       struct mptcp_loc_addr *mptcp_local;
+       struct mptcp_fm_ns *fm_ns;
+       int err = 0;
+
+       fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
+       if (!fm_ns)
+               return -ENOBUFS;
+
+       mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
+       if (!mptcp_local) {
+               err = -ENOBUFS;
+               goto err_mptcp_local;
+       }
+
+       if (!proc_create("mptcp_fullmesh", 0444, net->proc_net,
+                        &mptcp_fm_seq_fops)) {
+               err = -ENOMEM;
+               goto err_seq_fops;
+       }
+
+       mptcp_local->next_v4_index = 1;
+
+       rcu_assign_pointer(fm_ns->local, mptcp_local);
+       INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
+       INIT_LIST_HEAD(&fm_ns->events);
+       spin_lock_init(&fm_ns->local_lock);
+       fm_ns->net = net;
+       net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
+
+       return 0;
+err_seq_fops:
+       kfree(mptcp_local);
+err_mptcp_local:
+       kfree(fm_ns);
+       return err;
+}
+
+static void mptcp_fm_exit_net(struct net *net)
+{
+       struct mptcp_addr_event *eventq, *tmp;
+       struct mptcp_fm_ns *fm_ns;
+       struct mptcp_loc_addr *mptcp_local;
+
+       fm_ns = fm_get_ns(net);
+       cancel_delayed_work_sync(&fm_ns->address_worker);
+
+       rcu_read_lock_bh();
+
+       mptcp_local = rcu_dereference_bh(fm_ns->local);
+       kfree_rcu(mptcp_local, rcu);
+
+       spin_lock(&fm_ns->local_lock);
+       list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
+               list_del(&eventq->list);
+               kfree(eventq);
+       }
+       spin_unlock(&fm_ns->local_lock);
+
+       rcu_read_unlock_bh();
+
+       remove_proc_entry("mptcp_fullmesh", net->proc_net);
+
+       kfree(fm_ns);
+}
+
+static struct pernet_operations full_mesh_net_ops = {
+       .init = mptcp_fm_init_net,
+       .exit = mptcp_fm_exit_net,
+};
+
+static struct mptcp_pm_ops full_mesh __read_mostly = {
+       .new_session = full_mesh_new_session,
+       .release_sock = full_mesh_release_sock,
+       .fully_established = full_mesh_create_subflows,
+       .new_remote_address = full_mesh_create_subflows,
+       .subflow_error = full_mesh_subflow_error,
+       .get_local_id = full_mesh_get_local_id,
+       .addr_signal = full_mesh_addr_signal,
+       .add_raddr = full_mesh_add_raddr,
+       .rem_raddr = full_mesh_rem_raddr,
+       .delete_subflow = full_mesh_delete_subflow,
+       .name = "fullmesh",
+       .owner = THIS_MODULE,
+};
+
+/* General initialization of MPTCP_PM */
+static int __init full_mesh_register(void)
+{
+       int ret;
+
+       BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
+
+       ret = register_pernet_subsys(&full_mesh_net_ops);
+       if (ret)
+               goto out;
+
+       ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
+       if (ret)
+               goto err_reg_inetaddr;
+       ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
+       if (ret)
+               goto err_reg_netdev;
+
+#if IS_ENABLED(CONFIG_IPV6)
+       ret = register_inet6addr_notifier(&inet6_addr_notifier);
+       if (ret)
+               goto err_reg_inet6addr;
+#endif
+
+       ret = mptcp_register_path_manager(&full_mesh);
+       if (ret)
+               goto err_reg_pm;
+
+out:
+       return ret;
+
+err_reg_pm:
+#if IS_ENABLED(CONFIG_IPV6)
+       unregister_inet6addr_notifier(&inet6_addr_notifier);
+err_reg_inet6addr:
+#endif
+       unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
+err_reg_netdev:
+       unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
+err_reg_inetaddr:
+       unregister_pernet_subsys(&full_mesh_net_ops);
+       goto out;
+}
+
+static void full_mesh_unregister(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       unregister_inet6addr_notifier(&inet6_addr_notifier);
+#endif
+       unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
+       unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
+       unregister_pernet_subsys(&full_mesh_net_ops);
+       mptcp_unregister_path_manager(&full_mesh);
+}
+
+module_init(full_mesh_register);
+module_exit(full_mesh_unregister);
+
+MODULE_AUTHOR("Christoph Paasch");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Full-Mesh MPTCP");
+MODULE_VERSION("0.88");
diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
new file mode 100644 (file)
index 0000000..a58b004
--- /dev/null
@@ -0,0 +1,2463 @@
+/*
+ *     MPTCP implementation - Receiving side
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/unaligned.h>
+
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+#include <net/mptcp_v6.h>
+
+#include <linux/kconfig.h>
+
+/* is seq1 < seq2 ? */
+static inline bool before64(const u64 seq1, const u64 seq2)
+{
+       return (s64)(seq1 - seq2) < 0;
+}
+
+/* is seq1 > seq2 ? */
+#define after64(seq1, seq2)    before64(seq2, seq1)
+
+static inline void mptcp_become_fully_estab(struct sock *sk)
+{
+       tcp_sk(sk)->mptcp->fully_established = 1;
+
+       if (is_master_tp(tcp_sk(sk)) &&
+           tcp_sk(sk)->mpcb->pm_ops->fully_established)
+               tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
+}
+
+/* Similar to tcp_tso_acked without any memory accounting */
+static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
+                                          struct sk_buff *skb)
+{
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       u32 packets_acked, len, delta_truesize;
+
+       WARN_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
+
+       packets_acked = tcp_skb_pcount(skb);
+
+       if (skb_unclone(skb, GFP_ATOMIC))
+               return 0;
+
+       len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
+       delta_truesize = __pskb_trim_head(skb, len);
+
+       TCP_SKB_CB(skb)->seq += len;
+       skb->ip_summed = CHECKSUM_PARTIAL;
+
+       if (delta_truesize)
+               skb->truesize -= delta_truesize;
+
+       /* Any change of skb->len requires recalculation of tso factor. */
+       if (tcp_skb_pcount(skb) > 1)
+               tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
+       packets_acked -= tcp_skb_pcount(skb);
+
+       if (packets_acked) {
+               WARN_ON(tcp_skb_pcount(skb) == 0);
+               WARN_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
+       }
+
+       return packets_acked;
+}
+
+/**
+ * Cleans the meta-socket retransmission queue and the reinject-queue.
+ * @meta_sk must be the meta-socket.
+ */
+static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
+{
+       struct sk_buff *skb, *tmp;
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       bool acked = false;
+       u32 acked_pcount;
+
+       while ((skb = tcp_write_queue_head(meta_sk)) &&
+              skb != tcp_send_head(meta_sk)) {
+               bool fully_acked = true;
+
+               if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
+                       if (tcp_skb_pcount(skb) == 1 ||
+                           !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
+                               break;
+
+                       acked_pcount = tcp_tso_acked(meta_sk, skb);
+                       if (!acked_pcount)
+                               break;
+
+                       fully_acked = false;
+               } else {
+                       acked_pcount = tcp_skb_pcount(skb);
+               }
+
+               acked = true;
+               meta_tp->packets_out -= acked_pcount;
+               meta_tp->retrans_stamp = 0;
+
+               if (!fully_acked)
+                       break;
+
+               tcp_unlink_write_queue(skb, meta_sk);
+
+               if (mptcp_is_data_fin(skb)) {
+                       struct sock *sk_it, *sk_tmp;
+
+                       /* DATA_FIN has been acknowledged - now we can close
+                        * the subflows
+                        */
+                       mptcp_for_each_sk_safe(mpcb, sk_it, sk_tmp) {
+                               unsigned long delay = 0;
+
+                               /* If we are the passive closer, don't trigger
+                                * subflow-fin until the subflow has been finned
+                                * by the peer - thus we add a delay.
+                                */
+                               if (mpcb->passive_close &&
+                                   sk_it->sk_state == TCP_ESTABLISHED)
+                                       delay = inet_csk(sk_it)->icsk_rto << 3;
+
+                               mptcp_sub_close(sk_it, delay);
+                       }
+               }
+               sk_wmem_free_skb(meta_sk, skb);
+       }
+       /* Remove acknowledged data from the reinject queue */
+       skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
+               if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
+                       if (tcp_skb_pcount(skb) == 1 ||
+                           !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
+                               break;
+
+                       mptcp_tso_acked_reinject(meta_sk, skb);
+                       break;
+               }
+
+               __skb_unlink(skb, &mpcb->reinject_queue);
+               __kfree_skb(skb);
+       }
+
+       if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
+               meta_tp->snd_up = meta_tp->snd_una;
+
+       if (acked) {
+               tcp_rearm_rto(meta_sk);
+               /* Normally this is done in tcp_try_undo_loss - but MPTCP
+                * does not call this function.
+                */
+               inet_csk(meta_sk)->icsk_retransmits = 0;
+       }
+}
+
+/* Inspired by tcp_rcv_state_process */
+static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
+                                  const struct sk_buff *skb, u32 data_seq,
+                                  u16 data_len)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
+       const struct tcphdr *th = tcp_hdr(skb);
+
+       /* State-machine handling if the FIN has been enqueued and it has
+        * been acked (snd_una == write_seq) - it is important that this
+        * runs after sk_wmem_free_skb, because otherwise sk_forward_alloc
+        * is wrong upon inet_csk_destroy_sock()
+        */
+       switch (meta_sk->sk_state) {
+       case TCP_FIN_WAIT1: {
+               struct dst_entry *dst;
+               int tmo;
+
+               if (meta_tp->snd_una != meta_tp->write_seq)
+                       break;
+
+               tcp_set_state(meta_sk, TCP_FIN_WAIT2);
+               meta_sk->sk_shutdown |= SEND_SHUTDOWN;
+
+               dst = __sk_dst_get(sk);
+               if (dst)
+                       dst_confirm(dst);
+
+               if (!sock_flag(meta_sk, SOCK_DEAD)) {
+                       /* Wake up lingering close() */
+                       meta_sk->sk_state_change(meta_sk);
+                       break;
+               }
+
+               if (meta_tp->linger2 < 0 ||
+                   (data_len &&
+                    after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
+                          meta_tp->rcv_nxt))) {
+                       mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
+                       tcp_done(meta_sk);
+                       __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
+                       return 1;
+               }
+
+               tmo = tcp_fin_time(meta_sk);
+               if (tmo > TCP_TIMEWAIT_LEN) {
+                       inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
+               } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
+                       /* Bad case. We could lose such FIN otherwise.
+                        * It is not a big problem, but it looks confusing
+                        * and not so rare event. We still can lose it now,
+                        * if it spins in bh_lock_sock(), but it is really
+                        * marginal case.
+                        */
+                       inet_csk_reset_keepalive_timer(meta_sk, tmo);
+               } else {
+                       meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
+               }
+               break;
+       }
+       case TCP_CLOSING:
+       case TCP_LAST_ACK:
+               if (meta_tp->snd_una == meta_tp->write_seq) {
+                       tcp_done(meta_sk);
+                       return 1;
+               }
+               break;
+       }
+
+       /* step 7: process the segment text */
+       switch (meta_sk->sk_state) {
+       case TCP_FIN_WAIT1:
+       case TCP_FIN_WAIT2:
+               /* RFC 793 says to queue data in these states,
+                * RFC 1122 says we MUST send a reset.
+                * BSD 4.4 also does reset.
+                */
+               if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
+                       if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+                           after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
+                           !mptcp_is_data_fin2(skb, tp)) {
+                               __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
+                               mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
+                               tcp_reset(meta_sk);
+                               return 1;
+                       }
+               }
+               break;
+       }
+
+       return 0;
+}
+
+/**
+ * @return:
+ *  i) 1: Everything is fine.
+ *  ii) -1: A reset has been sent on the subflow - csum-failure.
+ *  iii) 0: csum-failure, but no reset has been sent because it is the last
+ *      subflow. The last packet must not be destroyed by the caller,
+ *      as that has already been done here.
+ */
+static int mptcp_verif_dss_csum(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *tmp, *tmp1, *last = NULL;
+       __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
+       int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
+       int iter = 0;
+
+       skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
+               unsigned int csum_len;
+
+               if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
+                       /* Mapping ends in the middle of the packet -
+                        * csum only these bytes
+                        */
+                       csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
+               else
+                       csum_len = tmp->len;
+
+               offset = 0;
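+               /* The previous segment ended on an odd byte - fold this
+                * segment's first byte into the checksum at the correct
+                * (odd) position before summing the rest.
+                */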
+               if (overflowed) {
+                       char first_word[4];
+
+                       first_word[0] = 0;
+                       first_word[1] = 0;
+                       first_word[2] = 0;
+                       first_word[3] = *(tmp->data);
+                       csum_tcp = csum_partial(first_word, 4, csum_tcp);
+                       offset = 1;
+                       csum_len--;
+                       overflowed = 0;
+               }
+
+               csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
+
+               /* Was it an odd length? Then we have to merge the next
+                * segment's first byte correctly (see above)
+                */
+               if (csum_len != (csum_len & (~1)))
+                       overflowed = 1;
+
+               if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
+                       __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
+
+                       /* If a 64-bit DSS is present, we increase the offset
+                        * by 4 bytes, as the high-order 32 bits will be added
+                        * in the final csum_partial() call.
+                        */
+                       u32 offset = skb_transport_offset(tmp) +
+                                    TCP_SKB_CB(tmp)->dss_off;
+                       if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
+                               offset += 4;
+
+                       csum_tcp = skb_checksum(tmp, offset,
+                                               MPTCP_SUB_LEN_SEQ_CSUM,
+                                               csum_tcp);
+
+                       csum_tcp = csum_partial(&data_seq,
+                                               sizeof(data_seq), csum_tcp);
+
+                       dss_csum_added = 1; /* Just do it once */
+               }
+               last = tmp;
+               iter++;
+
+               if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
+                   !before(TCP_SKB_CB(tmp1)->seq,
+                           tp->mptcp->map_subseq + tp->mptcp->map_data_len))
+                       break;
+       }
+
+       /* Now, checksum must be 0 */
+       if (unlikely(csum_fold(csum_tcp))) {
+               pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
+                      __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
+                      dss_csum_added, overflowed, iter);
+
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMFAIL);
+               tp->mptcp->send_mp_fail = 1;
+
+               /* map_data_seq is the data-seq number of the
+                * mapping we are currently checking
+                */
+               tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
+
+               if (tp->mpcb->cnt_subflows > 1) {
+                       mptcp_send_reset(sk);
+                       ans = -1;
+               } else {
+                       tp->mpcb->send_infinite_mapping = 1;
+
+                       /* Need to purge the rcv-queue as it's no longer valid */
+                       while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+                               tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
+                               kfree_skb(tmp);
+                       }
+
+                       ans = 0;
+               }
+       }
+
+       return ans;
+}
+
+static inline void mptcp_prepare_skb(struct sk_buff *skb,
+                                    const struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       u32 inc = 0, end_seq = tcb->end_seq;
+
+       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+               end_seq--;
+       /* If skb is the end of this mapping (the end is always at a mapping
+        * boundary thanks to the splitting/trimming), then we need to increase
+        * the data-end-seq by 1 if this is a data-fin.
+        *
+        * We need to do -1 because end_seq includes the subflow-FIN.
+        */
+       if (tp->mptcp->map_data_fin &&
+           end_seq == tp->mptcp->map_subseq + tp->mptcp->map_data_len) {
+               inc = 1;
+
+               /* We manually set the fin-flag if it is a data-fin. For easy
+                * processing in tcp_recvmsg.
+                */
+               TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
+       } else {
+               /* We may have a subflow-fin with data but without data-fin */
+               TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_FIN;
+       }
+
+       /* Adapt the data-seqs to the packet itself. We effectively transform
+        * the DSS-mapping to per-packet granularity. This is necessary to
+        * correctly handle overlapping mappings coming from different
+        * subflows. Otherwise it would be a complete mess.
+        */
+       tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
+       tcb->end_seq = tcb->seq + skb->len + inc;
+}
+
+static inline void mptcp_reset_mapping(struct tcp_sock *tp, u32 old_copied_seq)
+{
+       tp->mptcp->map_data_len = 0;
+       tp->mptcp->map_data_seq = 0;
+       tp->mptcp->map_subseq = 0;
+       tp->mptcp->map_data_fin = 0;
+       tp->mptcp->mapping_present = 0;
+
+       /* In infinite mapping receiver mode, we have to advance the implied
+        * data-sequence number when we progress the subflow's data.
+        */
+       if (tp->mpcb->infinite_mapping_rcv)
+               tp->mpcb->infinite_rcv_seq += (tp->copied_seq - old_copied_seq);
+}
+
+/* The DSS-mapping received on the sk only covers the second half of the skb
+ * (cut at seq). We trim the head from the skb.
+ * Data will be freed upon kfree().
+ *
+ * Inspired by tcp_trim_head().
+ */
+static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
+{
+       int len = seq - TCP_SKB_CB(skb)->seq;
+       u32 new_seq = TCP_SKB_CB(skb)->seq + len;
+       u32 delta_truesize;
+
+       delta_truesize = __pskb_trim_head(skb, len);
+
+       TCP_SKB_CB(skb)->seq = new_seq;
+
+       if (delta_truesize) {
+               skb->truesize -= delta_truesize;
+               atomic_sub(delta_truesize, &sk->sk_rmem_alloc);
+               sk_mem_uncharge(sk, delta_truesize);
+       }
+}
+
+/* The DSS-mapping received on the sk only covers the first half of the skb
+ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
+ * as further packets may resolve the mapping of the second half of data.
+ *
+ * Inspired by tcp_fragment().
+ */
+static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
+{
+       struct sk_buff *buff;
+       int nsize;
+       int nlen, len;
+       u8 flags;
+
+       len = seq - TCP_SKB_CB(skb)->seq;
+       nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
+       if (nsize < 0)
+               nsize = 0;
+
+       /* Get a new skb... force flag on. */
+       buff = alloc_skb(nsize, GFP_ATOMIC);
+       if (buff == NULL)
+               return -ENOMEM;
+
+       skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
+       skb_reset_transport_header(buff);
+
+       flags = TCP_SKB_CB(skb)->tcp_flags;
+       TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN);
+       TCP_SKB_CB(buff)->tcp_flags = flags;
+
+       /* We absolutely need to call skb_set_owner_r before refreshing the
+        * truesize of buff, otherwise the moved data will be accounted twice.
+        */
+       skb_set_owner_r(buff, sk);
+       nlen = skb->len - len - nsize;
+       buff->truesize += nlen;
+       skb->truesize -= nlen;
+
+       /* Correct the sequence numbers. */
+       TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+       TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+       TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+       skb_split(skb, buff, len);
+
+       __skb_queue_after(&sk->sk_receive_queue, skb, buff);
+
+       return 0;
+}
+
+/* @return: 0  everything is fine - just continue processing
+ *         1  the subflow is broken - stop everything
+ *         -1 this packet was broken - continue with the next one
+ */
+static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_cb *mpcb = tp->mpcb;
+
+       /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
+       if (!skb->len && (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
+           !mptcp_is_data_fin(skb) && !mpcb->infinite_mapping_rcv) {
+               /* Remove a pure subflow-fin from the queue and increase
+                * copied_seq.
+                */
+               tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
+               __skb_unlink(skb, &sk->sk_receive_queue);
+               __kfree_skb(skb);
+               return -1;
+       }
+
+       /* If we are not yet fully established and do not know the mapping for
+        * this segment, this path has to fall back to infinite mapping or be
+        * torn down.
+        */
+       if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
+           !tp->mptcp->mapping_present && !mpcb->infinite_mapping_rcv) {
+               pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
+                      __func__, mpcb->mptcp_loc_token,
+                      tp->mptcp->path_index, __builtin_return_address(0),
+                      TCP_SKB_CB(skb)->seq);
+
+               if (!is_master_tp(tp)) {
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATASUB);
+                       mptcp_send_reset(sk);
+                       return 1;
+               }
+
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FBDATAINIT);
+
+               mpcb->infinite_mapping_snd = 1;
+               mpcb->infinite_mapping_rcv = 1;
+               mpcb->infinite_rcv_seq = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
+
+               mptcp_sub_force_close_all(mpcb, sk);
+
+               /* We do a seamless fallback and should not send an infinite mapping. */
+               mpcb->send_infinite_mapping = 0;
+               tp->mptcp->fully_established = 1;
+       }
+
+       /* The receiver side becomes fully established when a whole rcv-window
+        * has been received without the need to fall back due to the previous
+        * condition.
+        */
+       if (!tp->mptcp->fully_established) {
+               tp->mptcp->init_rcv_wnd -= skb->len;
+               if (tp->mptcp->init_rcv_wnd < 0)
+                       mptcp_become_fully_estab(sk);
+       }
+
+       return 0;
+}
+
+static void mptcp_restart_sending(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+
+       /* We resend everything that has not been acknowledged */
+       meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
+
+       /* We artificially restart the whole send-queue. Thus,
+        * it is as if no packets are in flight
+        */
+       meta_tp->packets_out = 0;
+
+       /* If snd_nxt already wrapped around, we have to undo the
+        * wrapping, as we restart from snd_una.
+        */
+       if (meta_tp->snd_nxt < meta_tp->snd_una) {
+               mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
+               mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
+       }
+       meta_tp->snd_nxt = meta_tp->snd_una;
+
+       /* Trigger a sending on the meta. */
+       mptcp_push_pending_frames(meta_sk);
+}
+
+/* @return: 0  everything is fine - just continue processing
+ *         1  the subflow is broken - stop everything
+ *         -1 this packet was broken - continue with the next one
+ */
+static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
+       struct mptcp_cb *mpcb = tp->mpcb;
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       u32 *ptr;
+       u32 data_seq, sub_seq, data_len, tcp_end_seq;
+       bool set_infinite_rcv = false;
+
+       /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
+        * in-order at the data-level. Thus data-seq-numbers can be inferred
+        * from what is expected at the data-level.
+        */
+       if (mpcb->infinite_mapping_rcv) {
+               /* copied_seq may be bigger than tcb->seq (e.g., when the peer
+                * retransmits data that has already been acknowledged by
+                * newer data, because it did not receive our ACKs). Thus, we
+                * need to account for this overlap as well.
+                */
+               tp->mptcp->map_data_seq = mpcb->infinite_rcv_seq - (tp->copied_seq - tcb->seq);
+               tp->mptcp->map_subseq = tcb->seq;
+               tp->mptcp->map_data_len = skb->len;
+               tp->mptcp->map_data_fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN);
+               tp->mptcp->mapping_present = 1;
+               return 0;
+       }
+
+       /* No mapping here? Exit - it is either already set or still on its way */
+       if (!mptcp_is_data_seq(skb)) {
+               /* Too many packets without a mapping - this subflow is broken */
+               if (!tp->mptcp->mapping_present &&
+                   tp->rcv_nxt - tp->copied_seq > 65536) {
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
+                       mptcp_send_reset(sk);
+                       return 1;
+               }
+
+               return 0;
+       }
+
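+       /* Parse the DSS option: data-level sequence number, subflow sequence
+        * number (made absolute by adding rcv_isn) and data-level length.
+        */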
+       ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
+       ptr++;
+       sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
+       ptr++;
+       data_len = get_unaligned_be16(ptr);
+
+       /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
+        * The draft sets it to 0, but we really would like the real value,
+        * to ease the handling later in this function.
+        */
+       if (mptcp_is_data_fin(skb) && skb->len == 0)
+               sub_seq = TCP_SKB_CB(skb)->seq;
+
+       /* If there is already a mapping - we check if it maps with the current
+        * one. If not - we reset.
+        */
+       if (tp->mptcp->mapping_present &&
+           (data_seq != (u32)tp->mptcp->map_data_seq ||
+            sub_seq != tp->mptcp->map_subseq ||
+            data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
+            mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
+               /* Mapping in packet is different from what we want */
+               pr_err("%s Mappings do not match!\n", __func__);
+               pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
+                      __func__, data_seq, (u32)tp->mptcp->map_data_seq,
+                      sub_seq, tp->mptcp->map_subseq, data_len,
+                      tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
+                      tp->mptcp->map_data_fin);
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSNOMATCH);
+               mptcp_send_reset(sk);
+               return 1;
+       }
+
+       /* If the previous check was good, the current mapping is valid and we exit. */
+       if (tp->mptcp->mapping_present)
+               return 0;
+
+       /* Mapping not yet set on this subflow - we set it here! */
+
+       if (!data_len) {
+               mpcb->infinite_mapping_rcv = 1;
+               mpcb->send_infinite_mapping = 1;
+               tp->mptcp->fully_established = 1;
+               /* We need to repeat MP_FAILs until the sender has fallen
+                * back to the infinite mapping - here we stop repeating them.
+                */
+               tp->mptcp->send_mp_fail = 0;
+
+               /* We have to fixup data_len - it must be the same as skb->len */
+               data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
+               sub_seq = tcb->seq;
+
+               mptcp_restart_sending(tp->meta_sk);
+
+               mptcp_sub_force_close_all(mpcb, sk);
+
+               /* data_seq and so on are set correctly */
+
+               /* At this point, the meta-ofo-queue has to be emptied,
+                * as the following data is guaranteed to be in-order at
+                * the data and subflow-level
+                */
+               skb_rbtree_purge(&meta_tp->out_of_order_queue);
+
+               set_infinite_rcv = true;
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_INFINITEMAPRX);
+       }
+
+       /* We are sending MP_FAILs and are thus in fallback mode.
+        * Ignore packets which do not announce the fallback yet still
+        * provide a mapping.
+        */
+       if (tp->mptcp->send_mp_fail) {
+               tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
+               __skb_unlink(skb, &sk->sk_receive_queue);
+               __kfree_skb(skb);
+               return -1;
+       }
+
+       /* FIN increased the mapping-length by 1 */
+       if (mptcp_is_data_fin(skb))
+               data_len--;
+
+       /* The packet's subflow-sequences must (at least partially) be part
+        * of the DSS-mapping's subflow-sequence space.
+        *
+        * Basically the mapping is not valid, if either of the
+        * following conditions is true:
+        *
+        * 1. It's not a data_fin and
+        *    MPTCP-sub_seq >= TCP-end_seq
+        *
+        * 2. It's a data_fin and TCP-end_seq > TCP-seq and
+        *    MPTCP-sub_seq >= TCP-end_seq
+        *
+        * The previous two can be merged into:
+        *    TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
+        *    Because if it's not a data-fin, TCP-end_seq > TCP-seq
+        *
+        * 3. It's a data_fin and skb->len == 0 and
+        *    MPTCP-sub_seq > TCP-end_seq
+        *
+        * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
+        *    MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
+        */
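+       /* Worked example (editorial sketch; the numbers are hypothetical):
+        * a mapping with sub_seq=1000 and data_len=500 covers subflow bytes
+        * 1000..1499. An skb with seq=1600 and end_seq=1700 hits condition 4
+        * above (1000 + 500 <= 1600 while 1700 > 1600): it lies entirely
+        * outside the announced mapping, so the subflow is reset below.
+        */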
+
+       /* subflow-fin is not part of the mapping - ignore it here ! */
+       tcp_end_seq = tcb->end_seq;
+       if (tcb->tcp_flags & TCPHDR_FIN)
+               tcp_end_seq--;
+       if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
+           (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
+           (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq))) {
+               /* The packet's subflow sequences differ from what the
+                * packet's dss-mapping announces. The peer is misbehaving -
+                * reset.
+                */
+               pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
+                      "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u "
+                      "copied_seq %u\n", __func__, sub_seq, tcb->end_seq,
+                      tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
+                      skb->len, data_len, tp->copied_seq);
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTCPMISMATCH);
+               mptcp_send_reset(sk);
+               return 1;
+       }
+
+       /* Does the DSS carry 64-bit seqnums? */
+       if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
+               /* Wrapped around? */
+               if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
+                       tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
+               } else {
+                       /* Else, access the default high-order bits */
+                       tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
+               }
+       } else {
+               tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
+
+               if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
+                       /* We make sure that the data_seq is invalid.
+                        * It will be dropped later.
+                        */
+                       tp->mptcp->map_data_seq += 0xFFFFFFFF;
+                       tp->mptcp->map_data_seq += 0xFFFFFFFF;
+               }
+       }
+
+       if (set_infinite_rcv)
+               mpcb->infinite_rcv_seq = tp->mptcp->map_data_seq;
+
+       tp->mptcp->map_data_len = data_len;
+       tp->mptcp->map_subseq = sub_seq;
+       tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
+       tp->mptcp->mapping_present = 1;
+
+       return 0;
+}
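+
+/* Editorial sketch of the 32->64-bit expansion in mptcp_detect_mapping()
+ * (hypothetical values; assumes rcv_high_order[] holds the current and the
+ * next high-order word, say 1 and 2, with rcv_hiseq_index = 0): if the meta
+ * rcv_nxt is 0xfffffff0 and a DSS announces data_seq 0x00000010, the 32-bit
+ * space just wrapped, so the other index is used and the 64-bit mapping
+ * becomes 0x0000000200000010 instead of 0x0000000100000010.
+ */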
+
+/* Similar to tcp_sequence(...) */
+static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
+                                u64 data_seq, u64 end_data_seq)
+{
+       const struct mptcp_cb *mpcb = meta_tp->mpcb;
+       u64 rcv_wup64;
+
+       /* Wrap-around? */
+       if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
+               rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
+                               meta_tp->rcv_wup;
+       } else {
+               rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
+                                                 meta_tp->rcv_wup);
+       }
+
+       return  !before64(end_data_seq, rcv_wup64) &&
+               !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
+}
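+
+/* Editorial example for mptcp_sequence() (hypothetical numbers): with
+ * rcv_wup64 = 5000, rcv_nxt64 = 6000 and a 4000-byte receive window, a
+ * mapping spanning data_seq 5500..5999 is accepted (its end is not before
+ * 5000 and its start is not after 10000), while one starting at 10001 is
+ * out of window.
+ */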
+
+/* @return: 0  everything is fine. Just continue processing
+ *         -1 this packet was broken - continue with the next one.
+ */
+static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *tmp, *tmp1;
+       u32 tcp_end_seq;
+
+       if (!tp->mptcp->mapping_present)
+               return 0;
+
+       /* Either the new skb gave us the mapping, and the first segment
+        * in the sub-rcv-queue has to be trimmed ...
+        */
+       tmp = skb_peek(&sk->sk_receive_queue);
+       if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
+           after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq)) {
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSTRIMHEAD);
+               mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
+       }
+
+       /* ... or the new skb (tail) has to be split at the end. */
+       tcp_end_seq = TCP_SKB_CB(skb)->end_seq;
+       if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+               tcp_end_seq--;
+       if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
+               u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
+
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DSSSPLITTAIL);
+               if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
+                       /* TODO : maybe handle this here better.
+                        * We now just force meta-retransmission.
+                        */
+                       tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
+                       __skb_unlink(skb, &sk->sk_receive_queue);
+                       __kfree_skb(skb);
+                       return -1;
+               }
+       }
+
+       /* Now, remove old sk_buff's from the receive-queue.
+        * This may happen if the mapping has been lost for these segments and
+        * the next mapping has already been received.
+        */
+       if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
+               skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
+                       if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
+                               break;
+
+                       tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
+                       __skb_unlink(tmp1, &sk->sk_receive_queue);
+
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PURGEOLD);
+                       /* Impossible that we could free a still-needed skb
+                        * here, because its mapping is known to be valid from
+                        * the previous checks.
+                        */
+                       __kfree_skb(tmp1);
+               }
+       }
+
+       return 0;
+}
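+
+/* Editorial example of the trim/split above (hypothetical values): for a
+ * mapping with map_subseq=100 and map_data_len=50 (subflow bytes 100..149),
+ * a queued skb covering 80..120 gets its first 20 bytes trimmed, and an skb
+ * covering 140..200 is split at 150 so that only the mapped part is pushed
+ * in this round; bytes 150..199 wait for the next mapping.
+ */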
+
+/* @return: 0  everything is fine. Just continue processing
+ *         1  subflow is broken stop everything
+ *         -1 this mapping has been put in the meta-receive-queue
+ *         -2 this mapping has been eaten by the application
+ */
+static int mptcp_queue_skb(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct mptcp_cb *mpcb = tp->mpcb;
+       struct sk_buff *tmp, *tmp1;
+       u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
+       u32 old_copied_seq = tp->copied_seq;
+       bool data_queued = false;
+
+       /* Have we not yet received the full mapping? */
+       if (!tp->mptcp->mapping_present ||
+           before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
+               return 0;
+
+       /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
+        * OR
+        * This mapping is out of window
+        */
+       if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
+           !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
+                           tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
+               skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
+                       __skb_unlink(tmp1, &sk->sk_receive_queue);
+                       tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
+                       __kfree_skb(tmp1);
+
+                       if (!skb_queue_empty(&sk->sk_receive_queue) &&
+                           !before(TCP_SKB_CB(tmp)->seq,
+                                   tp->mptcp->map_subseq + tp->mptcp->map_data_len))
+                               break;
+               }
+
+               mptcp_reset_mapping(tp, old_copied_seq);
+
+               return -1;
+       }
+
+       /* Record it, because we want to send our data_fin on the same path */
+       if (tp->mptcp->map_data_fin) {
+               mpcb->dfin_path_index = tp->mptcp->path_index;
+               mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
+       }
+
+       /* Verify the checksum */
+       if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
+               int ret = mptcp_verif_dss_csum(sk);
+
+               if (ret <= 0) {
+                       mptcp_reset_mapping(tp, old_copied_seq);
+                       return 1;
+               }
+       }
+
+       if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
+               /* Segments have to go to the meta-ofo-queue */
+               skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
+                       tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
+                       mptcp_prepare_skb(tmp1, sk);
+                       __skb_unlink(tmp1, &sk->sk_receive_queue);
+                       /* MUST be done here, because fragstolen may be true later.
+                        * Then, kfree_skb_partial will not account the memory.
+                        */
+                       skb_orphan(tmp1);
+
+                       if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
+                               tcp_data_queue_ofo(meta_sk, tmp1);
+                       else
+                               __kfree_skb(tmp1);
+
+                       if (!skb_queue_empty(&sk->sk_receive_queue) &&
+                           !before(TCP_SKB_CB(tmp)->seq,
+                                   tp->mptcp->map_subseq + tp->mptcp->map_data_len))
+                               break;
+               }
+
+               /* Quick ACK if more than 3/4 of the receive window is filled */
+               if (after64(tp->mptcp->map_data_seq,
+                           rcv_nxt64 + 3 * (tcp_receive_window(meta_tp) >> 2)))
+                       tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
+
+       } else {
+               /* Ready for the meta-rcv-queue */
+               skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
+                       int eaten = 0;
+                       bool fragstolen = false;
+                       u32 old_rcv_nxt = meta_tp->rcv_nxt;
+
+                       tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
+                       mptcp_prepare_skb(tmp1, sk);
+                       __skb_unlink(tmp1, &sk->sk_receive_queue);
+                       /* MUST be done here, because fragstolen may be true.
+                        * Then, kfree_skb_partial will not account the memory.
+                        */
+                       skb_orphan(tmp1);
+
+                       /* This segment has already been received */
+                       if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
+                               __kfree_skb(tmp1);
+                               goto next;
+                       }
+
+                       if (mpcb->in_time_wait) /* In time-wait, do not receive data */
+                               eaten = 1;
+
+                       if (!eaten)
+                               eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
+
+                       meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
+
+                       if (TCP_SKB_CB(tmp1)->tcp_flags & TCPHDR_FIN)
+                               mptcp_fin(meta_sk);
+
+                       /* Check if this fills a gap in the ofo queue */
+                       if (!RB_EMPTY_ROOT(&meta_tp->out_of_order_queue))
+                               tcp_ofo_queue(meta_sk);
+
+                       mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
+
+                       if (eaten)
+                               kfree_skb_partial(tmp1, fragstolen);
+
+                       data_queued = true;
+next:
+                       if (!skb_queue_empty(&sk->sk_receive_queue) &&
+                           !before(TCP_SKB_CB(tmp)->seq,
+                                   tp->mptcp->map_subseq + tp->mptcp->map_data_len))
+                               break;
+               }
+       }
+
+       inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_jiffies32;
+       mptcp_reset_mapping(tp, old_copied_seq);
+
+       return data_queued ? -1 : -2;
+}
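+
+/* Editorial note on the quick-ACK heuristic in mptcp_queue_skb()
+ * (hypothetical numbers): with a 64 KB meta receive window and rcv_nxt64 = N,
+ * a mapping starting beyond N + 49152 (3/4 of the window) triggers quick-ACK
+ * mode, so the peer learns quickly about the growing reordering gap.
+ */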
+
+void mptcp_data_ready(struct sock *sk)
+{
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct sk_buff *skb, *tmp;
+       int queued = 0;
+
+       /* restart before the check, because mptcp_fin might have changed the
+        * state.
+        */
+restart:
+       /* If the meta cannot receive data, there is no point in pushing data.
+        * Exception: in time-wait we may still be waiting for the final FIN,
+        * so we must proceed with the processing.
+        */
+       if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
+               skb_queue_purge(&sk->sk_receive_queue);
+               tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
+               goto exit;
+       }
+
+       /* Iterate over all segments, detect their mapping (if we don't have
+        * one yet), validate them and push everything one level higher.
+        */
+       skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
+               int ret;
+               /* Pre-validation - e.g., early fallback */
+               ret = mptcp_prevalidate_skb(sk, skb);
+               if (ret < 0)
+                       goto restart;
+               else if (ret > 0)
+                       break;
+
+               /* Set the current mapping */
+               ret = mptcp_detect_mapping(sk, skb);
+               if (ret < 0)
+                       goto restart;
+               else if (ret > 0)
+                       break;
+
+               /* Validation */
+               if (mptcp_validate_mapping(sk, skb) < 0)
+                       goto restart;
+
+               /* Push a level higher */
+               ret = mptcp_queue_skb(sk);
+               if (ret < 0) {
+                       if (ret == -1)
+                               queued = ret;
+                       goto restart;
+               } else if (ret == 0) {
+                       continue;
+               } else { /* ret == 1 */
+                       break;
+               }
+       }
+
+exit:
+       if (tcp_sk(sk)->close_it && sk->sk_state == TCP_FIN_WAIT2) {
+               tcp_send_ack(sk);
+               tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
+       }
+
+       if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
+               meta_sk->sk_data_ready(meta_sk);
+}
+
+struct mp_join *mptcp_find_join(const struct sk_buff *skb)
+{
+       const struct tcphdr *th = tcp_hdr(skb);
+       unsigned char *ptr;
+       int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+       /* Jump through the options to check whether JOIN is there */
+       ptr = (unsigned char *)(th + 1);
+       while (length > 0) {
+               int opcode = *ptr++;
+               int opsize;
+
+               switch (opcode) {
+               case TCPOPT_EOL:
+                       return NULL;
+               case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
+                       length--;
+                       continue;
+               default:
+                       opsize = *ptr++;
+                       if (opsize < 2) /* "silly options" */
+                               return NULL;
+                       if (opsize > length)
+                               return NULL;  /* don't parse partial options */
+                       if (opcode == TCPOPT_MPTCP &&
+                           ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
+                               return (struct mp_join *)(ptr - 2);
+                       }
+                       ptr += opsize - 2;
+                       length -= opsize;
+               }
+       }
+       return NULL;
+}
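+
+/* Editorial walk-through of mptcp_find_join() (per the option definitions in
+ * this patch, where TCPOPT_MPTCP is kind 30 and MPTCP_SUB_LEN_JOIN_SYN is 12):
+ * for a SYN carrying NOP, NOP, MP_JOIN the loop consumes the two one-byte
+ * NOPs, then reads kind 30 and length 12, matches the MPTCP_SUB_JOIN subtype
+ * and returns a pointer to the option's first byte (ptr - 2).
+ */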
+
+int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
+{
+       const struct mptcp_cb *mpcb;
+       struct sock *meta_sk;
+       u32 token;
+       bool meta_v4;
+       struct mp_join *join_opt = mptcp_find_join(skb);
+
+       if (!join_opt)
+               return 0;
+
+       /* MPTCP structures were not initialized, so return error */
+       if (mptcp_init_failed)
+               return -1;
+
+       token = join_opt->u.syn.token;
+       meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
+       if (!meta_sk) {
+               MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN);
+               mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
+               return -1;
+       }
+
+       meta_v4 = meta_sk->sk_family == AF_INET;
+       if (meta_v4) {
+               if (skb->protocol == htons(ETH_P_IPV6)) {
+                       mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
+                       sock_put(meta_sk); /* Taken by mptcp_hash_find */
+                       return -1;
+               }
+       } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) {
+               mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
+               sock_put(meta_sk); /* Taken by mptcp_hash_find */
+               return -1;
+       }
+
+       mpcb = tcp_sk(meta_sk)->mpcb;
+       if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
+               /* We are in fallback-mode on the reception-side -
+                * no new subflows!
+                */
+               sock_put(meta_sk); /* Taken by mptcp_hash_find */
+               MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINFALLBACK);
+               return -1;
+       }
+
+       /* Coming from time-wait-sock processing in tcp_v4_rcv.
+        * We have to deschedule it before continuing, because otherwise
+        * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
+        */
+       if (tw)
+               inet_twsk_deschedule_put(tw);
+
+       /* OK, this is a new syn/join, let's create a new open request and
+        * send syn+ack
+        */
+       bh_lock_sock_nested(meta_sk);
+       if (sock_owned_by_user(meta_sk)) {
+               skb->sk = meta_sk;
+               if (unlikely(sk_add_backlog(meta_sk, skb,
+                                           meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
+                       bh_unlock_sock(meta_sk);
+                       __NET_INC_STATS(sock_net(meta_sk),
+                                        LINUX_MIB_TCPBACKLOGDROP);
+                       sock_put(meta_sk); /* Taken by mptcp_hash_find */
+                       kfree_skb(skb);
+                       return 1;
+               }
+       } else if (skb->protocol == htons(ETH_P_IP)) {
+               tcp_v4_do_rcv(meta_sk, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+       } else {
+               tcp_v6_do_rcv(meta_sk, skb);
+#endif /* CONFIG_IPV6 */
+       }
+       bh_unlock_sock(meta_sk);
+       sock_put(meta_sk); /* Taken by mptcp_hash_find */
+       return 1;
+}
+
+int mptcp_do_join_short(struct sk_buff *skb,
+                       const struct mptcp_options_received *mopt,
+                       struct net *net)
+{
+       struct sock *meta_sk;
+       u32 token;
+       bool meta_v4;
+
+       token = mopt->mptcp_rem_token;
+       meta_sk = mptcp_hash_find(net, token);
+       if (!meta_sk) {
+               MPTCP_INC_STATS(dev_net(skb_dst(skb)->dev), MPTCP_MIB_JOINNOTOKEN);
+               mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
+               return -1;
+       }
+
+       meta_v4 = meta_sk->sk_family == AF_INET;
+       if (meta_v4) {
+               if (skb->protocol == htons(ETH_P_IPV6)) {
+                       mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
+                       sock_put(meta_sk); /* Taken by mptcp_hash_find */
+                       return -1;
+               }
+       } else if (skb->protocol == htons(ETH_P_IP) && meta_sk->sk_ipv6only) {
+               mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
+               sock_put(meta_sk); /* Taken by mptcp_hash_find */
+               return -1;
+       }
+
+       /* OK, this is a new syn/join, let's create a new open request and
+        * send syn+ack
+        */
+       bh_lock_sock(meta_sk);
+
+       /* This check is also done in mptcp_vX_do_rcv. But there we cannot
+        * call tcp_vX_send_reset, because we already hold two socket locks
+        * (the listener's and the meta's from above).
+        *
+        * And the send-reset would try to take yet another one (ip_send_reply).
+        * Thus, we propagate the reset up to tcp_rcv_state_process.
+        */
+       if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
+           tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
+           meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
+               MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINFALLBACK);
+               bh_unlock_sock(meta_sk);
+               sock_put(meta_sk); /* Taken by mptcp_hash_find */
+               return -1;
+       }
+
+       if (sock_owned_by_user(meta_sk)) {
+               skb->sk = meta_sk;
+               if (unlikely(sk_add_backlog(meta_sk, skb,
+                                           meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
+                       __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
+               else
+                       /* Must make sure that upper layers won't free the
+                        * skb if it is added to the backlog-queue.
+                        */
+                       skb_get(skb);
+       } else {
+               /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
+                * the skb will finally be freed by tcp_v4_do_rcv (where we are
+                * coming from)
+                */
+               skb_get(skb);
+               if (skb->protocol == htons(ETH_P_IP)) {
+                       tcp_v4_do_rcv(meta_sk, skb);
+#if IS_ENABLED(CONFIG_IPV6)
+               } else { /* IPv6 */
+                       tcp_v6_do_rcv(meta_sk, skb);
+#endif /* CONFIG_IPV6 */
+               }
+       }
+
+       bh_unlock_sock(meta_sk);
+       sock_put(meta_sk); /* Taken by mptcp_hash_find */
+       return 0;
+}
+
+/**
+ * Equivalent of tcp_fin() for MPTCP.
+ * It may only be called when the FIN is validly part of the data
+ * sequence-number space - not earlier, while there are still holes.
+ */
+void mptcp_fin(struct sock *meta_sk)
+{
+       struct sock *sk = NULL, *sk_it;
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       unsigned char state;
+
+       mptcp_for_each_sk(mpcb, sk_it) {
+               if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
+                       sk = sk_it;
+                       break;
+               }
+       }
+
+       if (!sk || sk->sk_state == TCP_CLOSE)
+               sk = mptcp_select_ack_sock(meta_sk);
+
+       inet_csk_schedule_ack(sk);
+
+       if (!mpcb->in_time_wait) {
+               meta_sk->sk_shutdown |= RCV_SHUTDOWN;
+               sock_set_flag(meta_sk, SOCK_DONE);
+               state = meta_sk->sk_state;
+       } else {
+               state = mpcb->mptw_state;
+       }
+
+       switch (state) {
+       case TCP_SYN_RECV:
+       case TCP_ESTABLISHED:
+               /* Move to CLOSE_WAIT */
+               tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
+               inet_csk(sk)->icsk_ack.pingpong = 1;
+               break;
+
+       case TCP_CLOSE_WAIT:
+       case TCP_CLOSING:
+               /* Received a retransmission of the FIN, do
+                * nothing.
+                */
+               break;
+       case TCP_LAST_ACK:
+               /* RFC793: Remain in the LAST-ACK state. */
+               break;
+
+       case TCP_FIN_WAIT1:
+               /* This case occurs when a simultaneous close
+                * happens, we must ack the received FIN and
+                * enter the CLOSING state.
+                */
+               tcp_send_ack(sk);
+               tcp_set_state(meta_sk, TCP_CLOSING);
+               break;
+       case TCP_FIN_WAIT2:
+               /* Received a FIN -- send ACK and enter TIME_WAIT. */
+               tcp_send_ack(sk);
+               meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
+               break;
+       default:
+               /* Only TCP_LISTEN and TCP_CLOSE are left, in these
+                * cases we should never reach this piece of code.
+                */
+               pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
+                      meta_sk->sk_state);
+               break;
+       }
+
+       /* It _is_ possible, that we have something out-of-order _after_ FIN.
+        * Probably, we should reset in this case. For now drop them.
+        */
+       skb_rbtree_purge(&meta_tp->out_of_order_queue);
+       sk_mem_reclaim(meta_sk);
+
+       if (!sock_flag(meta_sk, SOCK_DEAD)) {
+               meta_sk->sk_state_change(meta_sk);
+
+               /* Do not send POLL_HUP for half duplex close. */
+               if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
+                   meta_sk->sk_state == TCP_CLOSE)
+                       sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
+               else
+                       sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
+       }
+}
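+
+/* Editorial example for the state machine above: on a simultaneous close,
+ * both ends send a DATA_FIN from TCP_FIN_WAIT1; when the peer's DATA_FIN
+ * arrives, the meta acks it on the subflow selected above and moves to
+ * TCP_CLOSING, mirroring the plain-TCP behaviour of tcp_fin().
+ */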
+
+static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sk_buff *skb;
+
+       if (!meta_tp->packets_out)
+               return;
+
+       tcp_for_write_queue(skb, meta_sk) {
+               if (skb == tcp_send_head(meta_sk))
+                       break;
+
+               if (mptcp_retransmit_skb(meta_sk, skb))
+                       return;
+
+               if (skb == tcp_write_queue_head(meta_sk))
+                       inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
+                                                 inet_csk(meta_sk)->icsk_rto,
+                                                 TCP_RTO_MAX);
+       }
+}
+
+static void mptcp_snd_una_update(struct tcp_sock *meta_tp, u32 data_ack)
+{
+       u32 delta = data_ack - meta_tp->snd_una;
+
+       sock_owned_by_me((struct sock *)meta_tp);
+       meta_tp->bytes_acked += delta;
+       meta_tp->snd_una = data_ack;
+}
+
+/* Handle the DATA_ACK */
+static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
+{
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       u32 prior_snd_una = meta_tp->snd_una;
+       int prior_packets;
+       u32 nwin, data_ack, data_seq;
+       u16 data_len = 0;
+
+       if (meta_sk->sk_state == TCP_CLOSE)
+               return;
+
+       /* A valid packet came in - subflow is operational again */
+       tp->pf = 0;
+
+       /* Even if there is no data-ack, we stop retransmitting -
+        * except if this is a SYN/ACK, which is then just a retransmission.
+        */
+       if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
+               tp->mptcp->pre_established = 0;
+               sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
+       }
+
+       /* If we are in infinite mapping mode, rx_opt.data_ack has been
+        * set by mptcp_clean_rtx_infinite.
+        */
+       if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
+               goto exit;
+
+       data_ack = tp->mptcp->rx_opt.data_ack;
+
+       if (unlikely(!tp->mptcp->fully_established) &&
+           tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
+               /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1)
+                * includes a data-ack, we are fully established
+                */
+               mptcp_become_fully_estab(sk);
+
+       /* Get the data_seq */
+       if (mptcp_is_data_seq(skb)) {
+               data_seq = tp->mptcp->rx_opt.data_seq;
+               data_len = tp->mptcp->rx_opt.data_len;
+       } else {
+               data_seq = meta_tp->snd_wl1;
+       }
+
+       /* If the ack is older than previous acks
+        * then we can probably ignore it.
+        */
+       if (before(data_ack, prior_snd_una))
+               goto exit;
+
+       /* If the ack includes data we haven't sent yet, discard
+        * this segment (RFC793 Section 3.9).
+        */
+       if (after(data_ack, meta_tp->snd_nxt))
+               goto exit;
+
+       /*** Now, update the window  - inspired by tcp_ack_update_window ***/
+       nwin = ntohs(tcp_hdr(skb)->window);
+
+       if (likely(!tcp_hdr(skb)->syn))
+               nwin <<= tp->rx_opt.snd_wscale;
+
+       if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
+               tcp_update_wl(meta_tp, data_seq);
+
+               /* Draft v09, Section 3.3.5:
+                * [...] It should only update its local receive window values
+                * when the largest sequence number allowed (i.e.  DATA_ACK +
+                * receive window) increases. [...]
+                */
+               if (meta_tp->snd_wnd != nwin &&
+                   !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
+                       meta_tp->snd_wnd = nwin;
+
+                       if (nwin > meta_tp->max_window)
+                               meta_tp->max_window = nwin;
+               }
+       }
+       /*** Done, update the window ***/
+
+       /* We passed data and got it acked, remove any soft error
+        * log. Something worked...
+        */
+       sk->sk_err_soft = 0;
+       inet_csk(meta_sk)->icsk_probes_out = 0;
+       meta_tp->rcv_tstamp = tcp_jiffies32;
+       prior_packets = meta_tp->packets_out;
+       if (!prior_packets)
+               goto no_queue;
+
+       mptcp_snd_una_update(meta_tp, data_ack);
+
+       mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
+
+       /* We are in loss-state, and something got acked, retransmit the whole
+        * queue now!
+        */
+       if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
+           after(data_ack, prior_snd_una)) {
+               mptcp_xmit_retransmit_queue(meta_sk);
+               inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
+       }
+
+       /* Simplified version of tcp_new_space, because the snd-buffer
+        * is handled by all the subflows.
+        */
+       if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
+               sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
+               if (meta_sk->sk_socket &&
+                   test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
+                       meta_sk->sk_write_space(meta_sk);
+       }
+
+       if (meta_sk->sk_state != TCP_ESTABLISHED &&
+           mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
+               return;
+
+exit:
+       mptcp_push_pending_frames(meta_sk);
+
+       return;
+
+no_queue:
+       if (tcp_send_head(meta_sk))
+               tcp_ack_probe(meta_sk);
+
+       mptcp_push_pending_frames(meta_sk);
+}
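+
+/* Editorial example of the window update in mptcp_data_ack() (hypothetical
+ * numbers): with data_ack = 1000 and the current window edge
+ * tcp_wnd_end(meta_tp) = 3000, an advertised nwin of 2500 is accepted
+ * (1000 + 2500 >= 3000) and becomes the new meta snd_wnd, whereas
+ * nwin = 1500 would lower the highest allowed data sequence and is ignored.
+ */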
+
+void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
+
+       if (!tp->mpcb->infinite_mapping_snd)
+               return;
+
+       /* The difference between both write_seq's represents the offset between
+        * data-sequence and subflow-sequence. As we are infinite, this must
+        * match.
+        *
+        * Thus, from this difference we can infer the meta snd_una.
+        */
+       tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
+                                    tp->snd_una;
+
+       mptcp_data_ack(sk, skb);
+}
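+
+/* Editorial example for mptcp_clean_rtx_infinite() (hypothetical numbers):
+ * in infinite-mapping mode with meta snd_nxt = 10000 and subflow
+ * snd_nxt = 4000, the constant offset between the two spaces is 6000; a
+ * subflow snd_una of 3500 therefore implies a meta-level data_ack of 9500,
+ * which is what gets fed into mptcp_data_ack() above.
+ */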
+
+/**** static functions used by mptcp_parse_options */
+
+static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
+{
+       struct sock *sk_it, *tmpsk;
+
+       mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
+               if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
+                       mptcp_reinject_data(sk_it, 0);
+                       mptcp_send_reset(sk_it);
+               }
+       }
+}
+
+static inline bool is_valid_addropt_opsize(u8 mptcp_ver,
+                                          struct mp_add_addr *mpadd,
+                                          int opsize)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       if (mptcp_ver < MPTCP_VERSION_1 && mpadd->ipver == 6) {
+               return opsize == MPTCP_SUB_LEN_ADD_ADDR6 ||
+                      opsize == MPTCP_SUB_LEN_ADD_ADDR6 + 2;
+       }
+       if (mptcp_ver >= MPTCP_VERSION_1 && mpadd->ipver == 6)
+               return opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 ||
+                      opsize == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2;
+#endif
+       if (mptcp_ver < MPTCP_VERSION_1 && mpadd->ipver == 4) {
+               return opsize == MPTCP_SUB_LEN_ADD_ADDR4 ||
+                      opsize == MPTCP_SUB_LEN_ADD_ADDR4 + 2;
+       }
+       if (mptcp_ver >= MPTCP_VERSION_1 && mpadd->ipver == 4) {
+               return opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 ||
+                      opsize == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2;
+       }
+       return false;
+}
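+
+/* Editorial note: e.g. a v0 IPv4 ADD_ADDR is valid at
+ * MPTCP_SUB_LEN_ADD_ADDR4 bytes (no port) or two bytes more (with port),
+ * while the v1 variants additionally carry the truncated HMAC that
+ * mptcp_handle_add_addr() verifies below.
+ */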
+
+void mptcp_parse_options(const uint8_t *ptr, int opsize,
+                        struct mptcp_options_received *mopt,
+                        const struct sk_buff *skb,
+                        struct tcp_sock *tp)
+{
+       const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
+
+       /* Only an mp-capable socket provides a mopt - nothing to do otherwise. */
+       if (!mopt)
+               return;
+
+       switch (mp_opt->sub) {
+       case MPTCP_SUB_CAPABLE:
+       {
+               const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
+
+               if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
+                   opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
+                       mptcp_debug("%s: mp_capable: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               /* MPTCP-RFC 6824:
+                * "If receiving a message with the 'B' flag set to 1, and this
+                * is not understood, then this SYN MUST be silently ignored;"
+                */
+               if (mpcapable->b) {
+                       mopt->drop_me = 1;
+                       break;
+               }
+
+               /* MPTCP-RFC 6824:
+                * "An implementation that only supports this method MUST set
+                *  bit "H" to 1, and bits "C" through "G" to 0."
+                */
+               if (!mpcapable->h)
+                       break;
+
+               mopt->saw_mpc = 1;
+               mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
+
+               if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
+                       mopt->mptcp_sender_key = mpcapable->sender_key;
+               if (opsize == MPTCP_SUB_LEN_CAPABLE_ACK)
+                       mopt->mptcp_receiver_key = mpcapable->receiver_key;
+
+               mopt->mptcp_ver = mpcapable->ver;
+               break;
+       }
+       case MPTCP_SUB_JOIN:
+       {
+               const struct mp_join *mpjoin = (struct mp_join *)ptr;
+
+               if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
+                   opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
+                   opsize != MPTCP_SUB_LEN_JOIN_ACK) {
+                       mptcp_debug("%s: mp_join: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               /* saw_mpc must be set, because in tcp_check_req we assume that
+                * it is set to support falling back to reg. TCP if a rexmitted
+                * SYN has no MP_CAPABLE or MP_JOIN
+                */
+               switch (opsize) {
+               case MPTCP_SUB_LEN_JOIN_SYN:
+                       mopt->is_mp_join = 1;
+                       mopt->saw_mpc = 1;
+                       mopt->low_prio = mpjoin->b;
+                       mopt->rem_id = mpjoin->addr_id;
+                       mopt->mptcp_rem_token = mpjoin->u.syn.token;
+                       mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
+                       break;
+               case MPTCP_SUB_LEN_JOIN_SYNACK:
+                       mopt->saw_mpc = 1;
+                       mopt->low_prio = mpjoin->b;
+                       mopt->rem_id = mpjoin->addr_id;
+                       mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
+                       mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
+                       break;
+               case MPTCP_SUB_LEN_JOIN_ACK:
+                       mopt->saw_mpc = 1;
+                       mopt->join_ack = 1;
+                       memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
+                       break;
+               }
+               break;
+       }
+       case MPTCP_SUB_DSS:
+       {
+               const struct mp_dss *mdss = (struct mp_dss *)ptr;
+               struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+               /* We check opsize for the csum and non-csum case. We do this,
+                * because the draft says that the csum SHOULD be ignored if
+                * it has not been negotiated in the MP_CAPABLE but still is
+                * present in the data.
+                *
+                * It will get ignored later in mptcp_queue_skb.
+                */
+               if (opsize != mptcp_sub_len_dss(mdss, 0) &&
+                   opsize != mptcp_sub_len_dss(mdss, 1)) {
+                       mptcp_debug("%s: mp_dss: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               ptr += 4;
+
+               if (mdss->A) {
+                       tcb->mptcp_flags |= MPTCPHDR_ACK;
+
+                       if (mdss->a) {
+                               mopt->data_ack = (u32) get_unaligned_be64(ptr);
+                               ptr += MPTCP_SUB_LEN_ACK_64;
+                       } else {
+                               mopt->data_ack = get_unaligned_be32(ptr);
+                               ptr += MPTCP_SUB_LEN_ACK;
+                       }
+               }
+
+               tcb->dss_off = (ptr - skb_transport_header(skb));
+
+               if (mdss->M) {
+                       if (mdss->m) {
+                               u64 data_seq64 = get_unaligned_be64(ptr);
+
+                               tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
+                               mopt->data_seq = (u32) data_seq64;
+
+                               ptr += 12; /* 64-bit dseq + subseq */
+                       } else {
+                               mopt->data_seq = get_unaligned_be32(ptr);
+                               ptr += 8; /* 32-bit dseq + subseq */
+                       }
+                       mopt->data_len = get_unaligned_be16(ptr);
+
+                       tcb->mptcp_flags |= MPTCPHDR_SEQ;
+
+                       /* Is a check-sum present? */
+                       if (opsize == mptcp_sub_len_dss(mdss, 1))
+                               tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
+
+                       /* DATA_FIN only possible with DSS-mapping */
+                       if (mdss->F)
+                               tcb->mptcp_flags |= MPTCPHDR_FIN;
+               }
+
+               break;
+       }
+       case MPTCP_SUB_ADD_ADDR:
+       {
+               struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
+
+               /* If tcp_sock is not available, MPTCP version can't be
+                * retrieved and ADD_ADDR opsize validation is not possible.
+                */
+               if (!tp || !tp->mpcb)
+                       break;
+
+               if (!is_valid_addropt_opsize(tp->mpcb->mptcp_ver,
+                                            mpadd, opsize)) {
+                       mptcp_debug("%s: mp_add_addr: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               /* We have to manually parse the options if we got two of them. */
+               if (mopt->saw_add_addr) {
+                       mopt->more_add_addr = 1;
+                       break;
+               }
+               mopt->saw_add_addr = 1;
+               mopt->add_addr_ptr = ptr;
+               break;
+       }
+       case MPTCP_SUB_REMOVE_ADDR:
+               if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
+                       mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               if (mopt->saw_rem_addr) {
+                       mopt->more_rem_addr = 1;
+                       break;
+               }
+               mopt->saw_rem_addr = 1;
+               mopt->rem_addr_ptr = ptr;
+               break;
+       case MPTCP_SUB_PRIO:
+       {
+               const struct mp_prio *mpprio = (struct mp_prio *)ptr;
+
+               if (opsize != MPTCP_SUB_LEN_PRIO &&
+                   opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
+                       mptcp_debug("%s: mp_prio: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               mopt->saw_low_prio = 1;
+               mopt->low_prio = mpprio->b;
+
+               if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
+                       mopt->saw_low_prio = 2;
+                       mopt->prio_addr_id = mpprio->addr_id;
+               }
+               break;
+       }
+       case MPTCP_SUB_FAIL:
+               if (opsize != MPTCP_SUB_LEN_FAIL) {
+                       mptcp_debug("%s: mp_fail: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+               mopt->mp_fail = 1;
+               break;
+       case MPTCP_SUB_FCLOSE:
+               if (opsize != MPTCP_SUB_LEN_FCLOSE) {
+                       mptcp_debug("%s: mp_fclose: bad option size %d\n",
+                                   __func__, opsize);
+                       break;
+               }
+
+               mopt->mp_fclose = 1;
+               mopt->mptcp_sender_key = ((struct mp_fclose *)ptr)->key;
+
+               break;
+       default:
+               mptcp_debug("%s: Received unknown subtype: %d\n",
+                           __func__, mp_opt->sub);
+               break;
+       }
+}
+
+/** Parse only MPTCP options */
+void tcp_parse_mptcp_options(const struct sk_buff *skb,
+                            struct mptcp_options_received *mopt)
+{
+       const struct tcphdr *th = tcp_hdr(skb);
+       int length = (th->doff * 4) - sizeof(struct tcphdr);
+       const unsigned char *ptr = (const unsigned char *)(th + 1);
+
+       while (length > 0) {
+               int opcode = *ptr++;
+               int opsize;
+
+               switch (opcode) {
+               case TCPOPT_EOL:
+                       return;
+               case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
+                       length--;
+                       continue;
+               default:
+                       opsize = *ptr++;
+                       if (opsize < 2) /* "silly options" */
+                               return;
+                       if (opsize > length)
+                               return; /* don't parse partial options */
+                       if (opcode == TCPOPT_MPTCP)
+                               mptcp_parse_options(ptr - 2, opsize, mopt, skb, NULL);
+               }
+               ptr += opsize - 2;
+               length -= opsize;
+       }
+}
+
+bool mptcp_check_rtt(const struct tcp_sock *tp, int time)
+{
+       struct mptcp_cb *mpcb = tp->mpcb;
+       struct sock *sk;
+       u32 rtt_max = 0;
+
+       /* In MPTCP, we take the max delay across all flows,
+        * in order to take into account meta-reordering buffers.
+        */
+       mptcp_for_each_sk(mpcb, sk) {
+               if (!mptcp_sk_can_recv(sk))
+                       continue;
+
+               if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt_us)
+                       rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt_us;
+       }
+       if (time < (rtt_max >> 3) || !rtt_max)
+               return true;
+
+       return false;
+}
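+
+/* Editorial example for mptcp_check_rtt() (hypothetical numbers; recall that
+ * rcv_rtt_est.rtt_us keeps the estimate left-shifted by 3): with two subflows
+ * whose receive RTTs are 40 ms and 120 ms, rtt_max >> 3 is 120 ms, so any
+ * measurement interval shorter than 120 ms is treated as too short for a
+ * receive-space update at the meta level.
+ */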
+
+static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
+{
+       struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
+       struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+       __be16 port = 0;
+       union inet_addr addr;
+       sa_family_t family;
+
+       if (mpadd->ipver == 4) {
+               char *recv_hmac;
+               u8 hash_mac_check[20];
+               u8 no_key[8];
+               int msg_parts = 0;
+
+               if (mpcb->mptcp_ver < MPTCP_VERSION_1)
+                       goto skip_hmac_v4;
+
+               *(u64 *)no_key = 0;
+               recv_hmac = (char *)mpadd->u.v4.mac;
+               if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1) {
+                       recv_hmac -= sizeof(mpadd->u.v4.port);
+                       msg_parts = 2;
+               } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2) {
+                       msg_parts = 3;
+               }
+               mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
+                               (u8 *)no_key,
+                               (u32 *)hash_mac_check, msg_parts,
+                               1, (u8 *)&mpadd->addr_id,
+                               4, (u8 *)&mpadd->u.v4.addr.s_addr,
+                               2, (u8 *)&mpadd->u.v4.port);
+               if (memcmp(hash_mac_check, recv_hmac, 8) != 0)
+                       /* ADD_ADDR2 discarded */
+                       return;
+skip_hmac_v4:
+               if ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
+                    mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
+                    (mpcb->mptcp_ver == MPTCP_VERSION_1 &&
+                    mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4_VER1 + 2))
+                       port  = mpadd->u.v4.port;
+               family = AF_INET;
+               addr.in = mpadd->u.v4.addr;
+#if IS_ENABLED(CONFIG_IPV6)
+       } else if (mpadd->ipver == 6) {
+               char *recv_hmac;
+               u8 hash_mac_check[20];
+               u8 no_key[8];
+               int msg_parts = 0;
+
+               if (mpcb->mptcp_ver < MPTCP_VERSION_1)
+                       goto skip_hmac_v6;
+
+               *(u64 *)no_key = 0;
+               recv_hmac = (char *)mpadd->u.v6.mac;
+               if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1) {
+                       recv_hmac -= sizeof(mpadd->u.v6.port);
+                       msg_parts = 2;
+               } else if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2) {
+                       msg_parts = 3;
+               }
+               mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
+                               (u8 *)no_key,
+                               (u32 *)hash_mac_check, msg_parts,
+                               1, (u8 *)&mpadd->addr_id,
+                               16, (u8 *)&mpadd->u.v6.addr.s6_addr,
+                               2, (u8 *)&mpadd->u.v6.port);
+               if (memcmp(hash_mac_check, recv_hmac, 8) != 0)
+                       /* ADD_ADDR2 discarded */
+                       return;
+skip_hmac_v6:
+               if ((mpcb->mptcp_ver == MPTCP_VERSION_0 &&
+                    mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2) ||
+                    (mpcb->mptcp_ver == MPTCP_VERSION_1 &&
+                    mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6_VER1 + 2))
+                       port  = mpadd->u.v6.port;
+               family = AF_INET6;
+               addr.in6 = mpadd->u.v6.addr;
+#endif /* CONFIG_IPV6 */
+       } else {
+               return;
+       }
+
+       if (mpcb->pm_ops->add_raddr)
+               mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRRX);
+}
+
+static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
+{
+       struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
+       int i;
+       u8 rem_id;
+       struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+
+       for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
+               rem_id = (&mprem->addrs_id)[i];
+
+               if (mpcb->pm_ops->rem_raddr)
+                       mpcb->pm_ops->rem_raddr(mpcb, rem_id);
+               mptcp_send_reset_rem_id(mpcb, rem_id);
+
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRSUB);
+       }
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_REMADDRRX);
+}
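+
+/* Editorial example for mptcp_handle_rem_addr(): a REMOVE_ADDR option with
+ * len = MPTCP_SUB_LEN_REMOVE_ADDR + 2 carries three address ids, so the loop
+ * above notifies the path manager and resets matching subflows three times,
+ * counting three MPTCP_MIB_REMADDRSUB events against one MPTCP_MIB_REMADDRRX.
+ */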
+
+static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
+{
+       struct tcphdr *th = tcp_hdr(skb);
+       unsigned char *ptr;
+       int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+       /* Jump through the options to check whether ADD_ADDR is there */
+       ptr = (unsigned char *)(th + 1);
+       while (length > 0) {
+               int opcode = *ptr++;
+               int opsize;
+
+               switch (opcode) {
+               case TCPOPT_EOL:
+                       return;
+               case TCPOPT_NOP:
+                       length--;
+                       continue;
+               default:
+                       opsize = *ptr++;
+                       if (opsize < 2)
+                               return;
+                       if (opsize > length)
+                               return;  /* don't parse partial options */
+                       if (opcode == TCPOPT_MPTCP &&
+                           ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
+                               u8 mptcp_ver = tcp_sk(sk)->mpcb->mptcp_ver;
+                               struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
+
+                               if (!is_valid_addropt_opsize(mptcp_ver, mpadd,
+                                                            opsize))
+                                       goto cont;
+
+                               mptcp_handle_add_addr(ptr, sk);
+                       }
+                       if (opcode == TCPOPT_MPTCP &&
+                           ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
+                               if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
+                                       goto cont;
+
+                               mptcp_handle_rem_addr(ptr, sk);
+                       }
+cont:
+                       ptr += opsize - 2;
+                       length -= opsize;
+               }
+       }
+}
+
+static bool mptcp_mp_fastclose_rcvd(struct sock *sk)
+{
+       struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
+       struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+
+       if (likely(!mptcp->rx_opt.mp_fclose))
+               return false;
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_FASTCLOSERX);
+       mptcp->rx_opt.mp_fclose = 0;
+       if (mptcp->rx_opt.mptcp_sender_key != mpcb->mptcp_loc_key)
+               return false;
+
+       mptcp_sub_force_close_all(mpcb, NULL);
+
+       tcp_reset(mptcp_meta_sk(sk));
+
+       return true;
+}
+
+static void mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
+{
+       struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX);
+       mptcp->rx_opt.mp_fail = 0;
+
+       if (!th->rst && !mpcb->infinite_mapping_snd) {
+               mpcb->send_infinite_mapping = 1;
+
+               mptcp_restart_sending(meta_sk);
+
+               mptcp_sub_force_close_all(mpcb, sk);
+       }
+}
+
+static inline void mptcp_path_array_check(struct sock *meta_sk)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+
+       if (unlikely(mpcb->list_rcvd)) {
+               mpcb->list_rcvd = 0;
+               if (mpcb->pm_ops->new_remote_address)
+                       mpcb->pm_ops->new_remote_address(meta_sk);
+       }
+}
+
+bool mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
+                         const struct sk_buff *skb)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
+
+       if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
+               return false;
+
+       if (mptcp_mp_fastclose_rcvd(sk))
+               return true;
+
+       if (sk->sk_state == TCP_RST_WAIT && !th->rst)
+               return true;
+
+       if (unlikely(mopt->mp_fail))
+               mptcp_mp_fail_rcvd(sk, th);
+
+       /* RFC 6824, Section 3.3:
+        * If a checksum is not present when its use has been negotiated, the
+        * receiver MUST close the subflow with a RST as it is considered broken.
+        */
+       if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
+           !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
+               mptcp_send_reset(sk);
+               return true;
+       }
+
+       /* We have to acknowledge retransmissions of the third
+        * ack.
+        */
+       if (mopt->join_ack) {
+               tcp_send_delayed_ack(sk);
+               mopt->join_ack = 0;
+       }
+
+       if (mopt->saw_add_addr || mopt->saw_rem_addr) {
+               if (mopt->more_add_addr || mopt->more_rem_addr) {
+                       mptcp_parse_addropt(skb, sk);
+               } else {
+                       if (mopt->saw_add_addr)
+                               mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
+                       if (mopt->saw_rem_addr)
+                               mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
+               }
+
+               mopt->more_add_addr = 0;
+               mopt->saw_add_addr = 0;
+               mopt->more_rem_addr = 0;
+               mopt->saw_rem_addr = 0;
+       }
+       if (mopt->saw_low_prio) {
+               if (mopt->saw_low_prio == 1) {
+                       tp->mptcp->rcv_low_prio = mopt->low_prio;
+               } else {
+                       struct sock *sk_it;
+
+                       mptcp_for_each_sk(tp->mpcb, sk_it) {
+                               struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
+
+                               if (mptcp->rem_id == mopt->prio_addr_id)
+                                       mptcp->rcv_low_prio = mopt->low_prio;
+                       }
+               }
+               mopt->saw_low_prio = 0;
+       }
+
+       mptcp_data_ack(sk, skb);
+
+       mptcp_path_array_check(mptcp_meta_sk(sk));
+       /* Socket may have been mp_killed by a REMOVE_ADDR */
+       if (tp->mp_killed)
+               return true;
+
+       return false;
+}
+
+/* In case of fastopen, some data can already be in the write queue.
+ * We need to update the segments' sequence numbers, since they initially
+ * carried plain (subflow) TCP sequence numbers.
+ */
+static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
+       struct sk_buff *skb;
+       u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
+
+       /* There should only be one skb in write queue: the data not
+        * acknowledged in the SYN+ACK. In this case, we need to map
+        * this data to data sequence numbers.
+        */
+       skb_queue_walk(&meta_sk->sk_write_queue, skb) {
+               /* If the server only acknowledges partially the data sent in
+                * the SYN, we need to trim the acknowledged part because
+                * we don't want to retransmit this already received data.
+                * When we reach this point, tcp_ack() has already cleaned up
+                * fully acked segments. However, tcp trims partially acked
+                * segments only when retransmitting. Since MPTCP comes into
+                * play only now, we will fake an initial transmit, and
+                * retransmit_skb() will not be called. The following fragment
+                * comes from __tcp_retransmit_skb().
+                */
+               if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
+                       WARN_ON(before(TCP_SKB_CB(skb)->end_seq,
+                                     master_tp->snd_una));
+                       /* tcp_trim_head can only return ENOMEM if the skb is
+                        * cloned, which is not the case here (see
+                        * tcp_send_syn_data).
+                        */
+                       WARN_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
+                                            TCP_SKB_CB(skb)->seq));
+               }
+
+               TCP_SKB_CB(skb)->seq += new_mapping;
+               TCP_SKB_CB(skb)->end_seq += new_mapping;
+       }
+
+       /* We can advance write_seq by the number of unacknowledged bytes
+        * that were mapped in the previous loop.
+        */
+       meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
+
+       /* The packets from the master_sk will be appended to it later.
+        * Until that time, its write queue is empty, and
+        * write_seq must align with snd_una.
+        */
+       master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
+       master_tp->packets_out = 0;
+
+       /* Although this data has already been sent over the subsk,
+        * it has never been sent over the meta_sk, so we rewind
+        * the send_head so that tcp considers it an initial send
+        * (instead of a retransmit).
+        */
+       meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
+}
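For intuition, the remapping above is plain offset arithmetic: every queued skb that still carries subflow sequence numbers is shifted into the meta-level data-sequence space. A minimal userspace sketch with made-up numbers (illustrative only, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Hypothetical state after the SYN+ACK was processed. */
            uint32_t meta_write_seq   = 1000; /* next meta-level DSN      */
            uint32_t master_snd_una   = 2001; /* subflow seq acked so far */
            uint32_t master_write_seq = 2101; /* 100 bytes still unacked  */

            uint32_t new_mapping = meta_write_seq - master_snd_una;

            /* An skb covering subflow bytes [2001, 2101) is remapped to
             * meta-level sequence numbers [1000, 1100).
             */
            printf("skb [%u, %u) -> [%u, %u)\n",
                   master_snd_una, master_write_seq,
                   master_snd_una + new_mapping,
                   master_write_seq + new_mapping);

            /* The meta-level write_seq then advances by the 100 unacked
             * bytes, while the master's write_seq is rewound to snd_una.
             */
            printf("meta write_seq += %u\n",
                   master_write_seq - master_snd_una);
            return 0;
    }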
+
+/* The skptr is needed, because if we become MPTCP-capable, we have to switch
+ * from meta-socket to master-socket.
+ *
+ * @return: 1 - we want to reset this connection
+ *         2 - we want to discard the received syn/ack
+ *         0 - everything is fine - continue
+ */
+int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
+                                   const struct sk_buff *skb,
+                                   const struct mptcp_options_received *mopt)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (mptcp(tp)) {
+               u8 hash_mac_check[20];
+               struct mptcp_cb *mpcb = tp->mpcb;
+
+               mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
+                               (u8 *)&mpcb->mptcp_loc_key,
+                               (u32 *)hash_mac_check, 2,
+                               4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
+                               4, (u8 *)&tp->mptcp->mptcp_loc_nonce);
+               if (memcmp(hash_mac_check,
+                          (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC);
+                       mptcp_sub_force_close(sk);
+                       return 1;
+               }
+
+               /* Set this flag in order to postpone data sending
+                * until the 4th ack arrives.
+                */
+               tp->mptcp->pre_established = 1;
+               tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
+
+               mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
+                               (u8 *)&mpcb->mptcp_rem_key,
+                               (u32 *)&tp->mptcp->sender_mac[0], 2,
+                               4, (u8 *)&tp->mptcp->mptcp_loc_nonce,
+                               4, (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce);
+
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+       } else if (mopt->saw_mpc) {
+               struct sock *meta_sk = sk;
+
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
+               if (mopt->mptcp_ver > tcp_sk(sk)->mptcp_ver)
+                       /* TODO Consider adding new MPTCP_INC_STATS entry */
+                       goto fallback;
+
+               if (mptcp_create_master_sk(sk, mopt->mptcp_sender_key,
+                                          mopt->mptcp_ver,
+                                          ntohs(tcp_hdr(skb)->window)))
+                       return 2;
+
+               sk = tcp_sk(sk)->mpcb->master_sk;
+               *skptr = sk;
+               tp = tcp_sk(sk);
+
+               sk->sk_bound_dev_if = inet_iif(skb);
+
+               /* If fastopen was used, data might be in the send queue. We
+                * need to update its sequence numbers to MPTCP-level seqnos.
+                * Note that it can happen in rare cases that fastopen_req is
+                * NULL and syn_data is 0 but fastopen indeed occurred and
+                * data has been queued in the write queue (but not sent).
+                * Example of such rare cases: connect is non-blocking and
+                * TFO is configured to work without cookies.
+                */
+               if (!skb_queue_empty(&meta_sk->sk_write_queue))
+                       mptcp_rcv_synsent_fastopen(meta_sk);
+
+               /* -1, because the SYN consumed 1 byte. In case of TFO, we
+                * start the subflow-sequence number as if the data of the SYN
+                * is not part of any mapping.
+                */
+               tp->mptcp->snt_isn = tp->snd_una - 1;
+               tp->mpcb->dss_csum = mopt->dss_csum;
+               if (tp->mpcb->dss_csum)
+                       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CSUMENABLED);
+
+               tp->mptcp->include_mpc = 1;
+
+               /* Ensure that fastopen is handled at the meta-level. */
+               tp->fastopen_req = NULL;
+
+               sk_set_socket(sk, meta_sk->sk_socket);
+               sk->sk_wq = meta_sk->sk_wq;
+
+               /* Drop the extra reference taken in sk_clone_lock, where
+                * the refcount is initialized to 2.
+                */
+               sock_put(sk);
+       } else {
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+fallback:
+               tp->request_mptcp = 0;
+
+               if (tp->inside_tk_table)
+                       mptcp_hash_remove_bh(tp);
+       }
+
+       if (mptcp(tp))
+               tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
+
+       return 0;
+}
+
+/* Similar to tcp_should_expand_sndbuf */
+bool mptcp_should_expand_sndbuf(const struct sock *sk)
+{
+       const struct sock *sk_it;
+       const struct sock *meta_sk = mptcp_meta_sk(sk);
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       int cnt_backups = 0;
+       int backup_available = 0;
+
+       /* We circumvent this check in tcp_check_space, because we want to
+        * always call sk_write_space. So, we reproduce the check here.
+        */
+       if (!meta_sk->sk_socket ||
+           !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
+               return false;
+
+       /* If the user specified a specific send buffer setting, do
+        * not modify it.
+        */
+       if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+               return false;
+
+       /* If we are under global TCP memory pressure, do not expand.  */
+       if (tcp_under_memory_pressure(meta_sk))
+               return false;
+
+       /* If we are under soft global TCP memory pressure, do not expand.  */
+       if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
+               return false;
+
+       /* For MPTCP we look for a subsocket that could send data.
+        * If we find one, we update the send-buffer.
+        */
+       mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
+               struct tcp_sock *tp_it = tcp_sk(sk_it);
+
+               if (!mptcp_sk_can_send(sk_it))
+                       continue;
+
+               /* Backup-flows have to be counted - if there is no other
+                * subflow we take the backup-flow into account.
+                */
+               if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio)
+                       cnt_backups++;
+
+               if (tcp_packets_in_flight(tp_it) < tp_it->snd_cwnd) {
+                       if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
+                               backup_available = 1;
+                               continue;
+                       }
+                       return true;
+               }
+       }
+
+       /* Backup-flow is available for sending - update send-buffer */
+       if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
+               return true;
+       return false;
+}
+
+void mptcp_init_buffer_space(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       int space;
+
+       tcp_init_buffer_space(sk);
+
+       if (is_master_tp(tp)) {
+               meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
+               tcp_mstamp_refresh(meta_tp);
+               meta_tp->rcvq_space.time = meta_tp->tcp_mstamp;
+               meta_tp->rcvq_space.seq = meta_tp->copied_seq;
+
+               /* If there is only one subflow, we just use regular TCP
+                * autotuning. User-locks are already handled by
+                * tcp_init_buffer_space.
+                */
+               meta_tp->window_clamp = tp->window_clamp;
+               meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
+               meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
+               meta_sk->sk_sndbuf = sk->sk_sndbuf;
+
+               return;
+       }
+
+       if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
+               goto snd_buf;
+
+       /* Adding a new subflow to the rcv-buffer space. We make a simple
+        * addition, to give some space to allow traffic on the new subflow.
+        * Autotuning will increase it further later on.
+        */
+       space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
+       if (space > meta_sk->sk_rcvbuf) {
+               meta_tp->window_clamp += tp->window_clamp;
+               meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
+               meta_sk->sk_rcvbuf = space;
+       }
+
+snd_buf:
+       if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+               return;
+
+       /* Adding a new subflow to the send-buffer space. We make a simple
+        * addition, to give some space to allow traffic on the new subflow.
+        * Autotuning will increase it further later on.
+        */
+       space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
+       if (space > meta_sk->sk_sndbuf) {
+               meta_sk->sk_sndbuf = space;
+               meta_sk->sk_write_space(meta_sk);
+       }
+}
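The growth rule on both paths above is a capped addition: the new subflow's buffer is added to the meta-level buffer, bounded by the tcp_rmem[2]/tcp_wmem[2] maxima, and autotuning grows it further from there. A toy computation (all numbers, including the sysctl maximum, are assumptions for illustration):

    #include <stdio.h>

    static int min_int(int a, int b) { return a < b ? a : b; }

    int main(void)
    {
            int meta_rcvbuf = 262144;  /* hypothetical current meta rcvbuf */
            int sub_rcvbuf  = 131072;  /* the new subflow's rcvbuf         */
            int rmem_max    = 6291456; /* assumed sysctl_tcp_rmem[2]       */

            int space = min_int(meta_rcvbuf + sub_rcvbuf, rmem_max);

            /* The buffer only ever grows; it is never shrunk here. */
            if (space > meta_rcvbuf)
                    printf("meta rcvbuf %d -> %d\n", meta_rcvbuf, space);
            return 0;
    }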
+
+void mptcp_tcp_set_rto(struct sock *sk)
+{
+       tcp_set_rto(sk);
+       mptcp_set_rto(sk);
+}
diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
new file mode 100644 (file)
index 0000000..d8cc68a
--- /dev/null
@@ -0,0 +1,432 @@
+/*
+ *     MPTCP implementation - IPv4-specific functions
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/tcp.h>
+
+#include <net/inet_common.h>
+#include <net/inet_connection_sock.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+#include <net/request_sock.h>
+#include <net/tcp.h>
+
+u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
+{
+       return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+                           (__force u32)sport << 16 | (__force u32)dport,
+                           mptcp_seed++, &mptcp_secret);
+}
+
+u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
+                    u32 seed)
+{
+       return siphash_2u64((__force u64)saddr << 32 | (__force u64)daddr,
+                           (__force u64)seed << 32 | (__force u64)sport << 16 | (__force u64)dport,
+                           &mptcp_secret);
+}
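Both helpers fold the four-tuple (plus a seed) into fixed-width siphash inputs by bit-packing; mptcp_v4_get_key packs the two addresses into one 64-bit word and the seed and ports into another before hashing with the per-boot mptcp_secret. A userspace toy of just the packing (address and port values are examples):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t saddr = 0xc0a80001; /* 192.168.0.1, example value */
            uint32_t daddr = 0x0a000001; /* 10.0.0.1, example value    */
            uint16_t sport = 45678, dport = 80;
            uint32_t seed  = 7;

            uint64_t w1 = (uint64_t)saddr << 32 | daddr;
            uint64_t w2 = (uint64_t)seed << 32 |
                          (uint64_t)sport << 16 | dport;

            /* w1 and w2 correspond to the two words handed to
             * siphash_2u64() above (byte order ignored in this toy).
             */
            printf("%016llx %016llx\n",
                   (unsigned long long)w1, (unsigned long long)w2);
            return 0;
    }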
+
+
+static void mptcp_v4_reqsk_destructor(struct request_sock *req)
+{
+       mptcp_reqsk_destructor(req);
+
+       tcp_v4_reqsk_destructor(req);
+}
+
+static int mptcp_v4_init_req(struct request_sock *req, const struct sock *sk,
+                            struct sk_buff *skb, bool want_cookie)
+{
+       tcp_request_sock_ipv4_ops.init_req(req, sk, skb, want_cookie);
+
+       mptcp_rsk(req)->hash_entry.pprev = NULL;
+       mptcp_rsk(req)->is_sub = 0;
+       inet_rsk(req)->mptcp_rqsk = 1;
+
+       /* In case of SYN-cookies, we wait for the isn to be generated - it is
+        * input to the key-generation.
+        */
+       if (!want_cookie)
+               mptcp_reqsk_init(req, sk, skb, false);
+
+       return 0;
+}
+
+#ifdef CONFIG_SYN_COOKIES
+static u32 mptcp_v4_cookie_init_seq(struct request_sock *req, const struct sock *sk,
+                                   const struct sk_buff *skb, __u16 *mssp)
+{
+       __u32 isn = cookie_v4_init_sequence(req, sk, skb, mssp);
+
+       tcp_rsk(req)->snt_isn = isn;
+
+       mptcp_reqsk_init(req, sk, skb, true);
+
+       return isn;
+}
+#endif
+
+static int mptcp_v4_join_init_req(struct request_sock *req, const struct sock *sk,
+                                 struct sk_buff *skb, bool want_cookie)
+{
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+       const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+       union inet_addr addr;
+       int loc_id;
+       bool low_prio = false;
+
+       /* We need to do this as early as possible, because if we fail later
+        * (e.g., in get_local_id), reqsk_free tries to remove the
+        * request-socket from the hashtable in mptcp_hash_request_remove,
+        * as pprev may be different from NULL.
+        */
+       mtreq->hash_entry.pprev = NULL;
+
+       tcp_request_sock_ipv4_ops.init_req(req, sk, skb, want_cookie);
+
+       mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
+                                                   ip_hdr(skb)->daddr,
+                                                   tcp_hdr(skb)->source,
+                                                   tcp_hdr(skb)->dest);
+       addr.ip = inet_rsk(req)->ir_loc_addr;
+       loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
+       if (loc_id == -1)
+               return -1;
+       mtreq->loc_id = loc_id;
+       mtreq->low_prio = low_prio;
+
+       mptcp_join_reqsk_init(mpcb, req, skb);
+
+       return 0;
+}
+
+/* Similar to tcp_request_sock_ops */
+struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
+       .family         =       PF_INET,
+       .obj_size       =       sizeof(struct mptcp_request_sock),
+       .rtx_syn_ack    =       tcp_rtx_synack,
+       .send_ack       =       tcp_v4_reqsk_send_ack,
+       .destructor     =       mptcp_v4_reqsk_destructor,
+       .send_reset     =       tcp_v4_send_reset,
+       .syn_ack_timeout =      tcp_syn_ack_timeout,
+};
+
+/* Similar to: tcp_v4_conn_request */
+static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
+{
+       return tcp_conn_request(&mptcp_request_sock_ops,
+                               &mptcp_join_request_sock_ipv4_ops,
+                               meta_sk, skb);
+}
+
+int mptcp_finish_handshake(struct sock *child, struct sk_buff *skb)
+       __releases(&child->sk_lock.slock)
+{
+       int ret;
+
+       /* We don't call tcp_child_process here, because we already hold
+        * the meta-sk-lock and are sure that it is not owned
+        * by the user.
+        */
+       tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+       ret = tcp_rcv_state_process(child, skb);
+       bh_unlock_sock(child);
+       sock_put(child);
+
+       return ret;
+}
+
+
+/* Similar to: tcp_v4_do_rcv
+ * We only process join requests here (either the SYN or the final ACK).
+ */
+int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
+{
+       const struct tcphdr *th = tcp_hdr(skb);
+       const struct iphdr *iph = ip_hdr(skb);
+       struct sock *child, *rsk = NULL, *sk;
+       int ret;
+
+       sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
+                                    iph->saddr, th->source, iph->daddr,
+                                    th->dest, inet_iif(skb));
+
+       if (!sk)
+               goto new_subflow;
+
+       if (is_meta_sk(sk)) {
+               WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
+               sock_put(sk);
+               goto discard;
+       }
+
+       if (sk->sk_state == TCP_TIME_WAIT) {
+               inet_twsk_put(inet_twsk(sk));
+               goto discard;
+       }
+
+       if (sk->sk_state == TCP_NEW_SYN_RECV) {
+               struct request_sock *req = inet_reqsk(sk);
+
+               if (!mptcp_can_new_subflow(meta_sk))
+                       goto reset_and_discard;
+
+               local_bh_disable();
+
+               child = tcp_check_req(meta_sk, skb, req, false);
+               if (!child) {
+                       reqsk_put(req);
+                       local_bh_enable();
+                       goto discard;
+               }
+
+               if (child != meta_sk) {
+                       ret = mptcp_finish_handshake(child, skb);
+                       if (ret) {
+                               rsk = child;
+                               local_bh_enable();
+                               goto reset_and_discard;
+                       }
+
+                       local_bh_enable();
+                       return 0;
+               }
+
+               /* tcp_check_req failed */
+               reqsk_put(req);
+
+               local_bh_enable();
+               goto discard;
+       }
+
+       ret = tcp_v4_do_rcv(sk, skb);
+       sock_put(sk);
+
+       return ret;
+
+new_subflow:
+       if (!mptcp_can_new_subflow(meta_sk))
+               goto reset_and_discard;
+
+       child = tcp_v4_cookie_check(meta_sk, skb);
+       if (!child)
+               goto discard;
+
+       if (child != meta_sk) {
+               ret = mptcp_finish_handshake(child, skb);
+               if (ret) {
+                       rsk = child;
+                       goto reset_and_discard;
+               }
+       }
+
+       if (tcp_hdr(skb)->syn) {
+               local_bh_disable();
+               mptcp_v4_join_request(meta_sk, skb);
+               local_bh_enable();
+       }
+
+discard:
+       kfree_skb(skb);
+       return 0;
+
+reset_and_discard:
+       tcp_v4_send_reset(rsk, skb);
+       goto discard;
+}
+
+/* Create a new IPv4 subflow.
+ *
+ * We are in user-context and the meta-sock lock is held.
+ */
+int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
+                          struct mptcp_rem4 *rem)
+{
+       struct tcp_sock *tp;
+       struct sock *sk;
+       struct sockaddr_in loc_in, rem_in;
+       struct socket_alloc sock_full;
+       struct socket *sock = (struct socket *)&sock_full;
+       int ret;
+
+       /* First, create and prepare the new socket */
+       memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full));
+       sock->state = SS_UNCONNECTED;
+       sock->ops = NULL;
+
+       ret = inet_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1);
+       if (unlikely(ret < 0)) {
+               mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
+               return ret;
+       }
+
+       sk = sock->sk;
+       tp = tcp_sk(sk);
+
+       /* All subsockets need the MPTCP-lock-class */
+       lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name);
+       lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0);
+
+       if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
+               goto error;
+
+       tp->mptcp->slave_sk = 1;
+       tp->mptcp->low_prio = loc->low_prio;
+
+       /* Initializing the timer for an MPTCP subflow */
+       setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
+
+       /* Then, connect the socket to the peer */
+       loc_in.sin_family = AF_INET;
+       rem_in.sin_family = AF_INET;
+       loc_in.sin_port = 0;
+       if (rem->port)
+               rem_in.sin_port = rem->port;
+       else
+               rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
+       loc_in.sin_addr = loc->addr;
+       rem_in.sin_addr = rem->addr;
+
+       if (loc->if_idx)
+               sk->sk_bound_dev_if = loc->if_idx;
+
+       ret = kernel_bind(sock, (struct sockaddr *)&loc_in,
+                         sizeof(struct sockaddr_in));
+       if (ret < 0) {
+               mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
+                           __func__, ret);
+               goto error;
+       }
+
+       mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d ifidx: %d\n",
+                   __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
+                   tp->mptcp->path_index, &loc_in.sin_addr,
+                   ntohs(loc_in.sin_port), &rem_in.sin_addr,
+                   ntohs(rem_in.sin_port), loc->if_idx);
+
+       if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
+               tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
+
+       ret = kernel_connect(sock, (struct sockaddr *)&rem_in,
+                            sizeof(struct sockaddr_in), O_NONBLOCK);
+       if (ret < 0 && ret != -EINPROGRESS) {
+               mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
+                           __func__, ret);
+               goto error;
+       }
+
+       MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX);
+
+       sk_set_socket(sk, meta_sk->sk_socket);
+       sk->sk_wq = meta_sk->sk_wq;
+
+       return 0;
+
+error:
+       /* May happen if mptcp_add_sock fails first */
+       if (!mptcp(tp)) {
+               tcp_close(sk, 0);
+       } else {
+               local_bh_disable();
+               mptcp_sub_force_close(sk);
+               local_bh_enable();
+       }
+       return ret;
+}
+EXPORT_SYMBOL(mptcp_init4_subsockets);
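Path managers are the intended callers of this entry point; the ndiffports module later in this patch does exactly this. A condensed, hedged sketch of such a caller (the helper name is hypothetical, and, as the comment above says, it must run in user context with the meta-sock lock held):

    /* Hypothetical helper: open one extra IPv4 subflow over the same
     * address pair, mirroring what ndiffports does further down.
     */
    static int example_open_extra_subflow(struct sock *meta_sk)
    {
            struct mptcp_loc4 loc = {
                    .addr.s_addr = inet_sk(meta_sk)->inet_saddr,
                    .loc4_id     = 0,
                    .low_prio    = 0,
                    .if_idx      = 0,
            };
            struct mptcp_rem4 rem = {
                    .addr.s_addr = inet_sk(meta_sk)->inet_daddr,
                    .port        = inet_sk(meta_sk)->inet_dport,
                    .rem4_id     = 0,
            };

            return mptcp_init4_subsockets(meta_sk, &loc, &rem);
    }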
+
+const struct inet_connection_sock_af_ops mptcp_v4_specific = {
+       .queue_xmit        = ip_queue_xmit,
+       .send_check        = tcp_v4_send_check,
+       .rebuild_header    = inet_sk_rebuild_header,
+       .sk_rx_dst_set     = inet_sk_rx_dst_set,
+       .conn_request      = mptcp_conn_request,
+       .syn_recv_sock     = tcp_v4_syn_recv_sock,
+       .net_header_len    = sizeof(struct iphdr),
+       .setsockopt        = ip_setsockopt,
+       .getsockopt        = ip_getsockopt,
+       .addr2sockaddr     = inet_csk_addr2sockaddr,
+       .sockaddr_len      = sizeof(struct sockaddr_in),
+#ifdef CONFIG_COMPAT
+       .compat_setsockopt = compat_ip_setsockopt,
+       .compat_getsockopt = compat_ip_getsockopt,
+#endif
+       .mtu_reduced       = tcp_v4_mtu_reduced,
+};
+
+struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
+struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
+
+/* General initialization of IPv4 for MPTCP */
+int mptcp_pm_v4_init(void)
+{
+       int ret = 0;
+       struct request_sock_ops *ops = &mptcp_request_sock_ops;
+
+       mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+       mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
+#ifdef CONFIG_SYN_COOKIES
+       mptcp_request_sock_ipv4_ops.cookie_init_seq = mptcp_v4_cookie_init_seq;
+#endif
+       mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
+       mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
+
+       ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
+       if (ops->slab_name == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
+                                     SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
+                                     NULL);
+
+       if (ops->slab == NULL) {
+               ret =  -ENOMEM;
+               goto err_reqsk_create;
+       }
+
+out:
+       return ret;
+
+err_reqsk_create:
+       kfree(ops->slab_name);
+       ops->slab_name = NULL;
+       goto out;
+}
+
+void mptcp_pm_v4_undo(void)
+{
+       kmem_cache_destroy(mptcp_request_sock_ops.slab);
+       kfree(mptcp_request_sock_ops.slab_name);
+}
diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
new file mode 100644 (file)
index 0000000..07240b1
--- /dev/null
@@ -0,0 +1,460 @@
+/*
+ *     MPTCP implementation - IPv6-specific functions
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/export.h>
+#include <linux/in6.h>
+#include <linux/kernel.h>
+
+#include <net/addrconf.h>
+#include <net/flow.h>
+#include <net/inet6_connection_sock.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet_common.h>
+#include <net/ipv6.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
+#include <net/mptcp.h>
+#include <net/mptcp_v6.h>
+#include <net/tcp.h>
+#include <net/transp_v6.h>
+
+__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
+                        __be16 sport, __be16 dport)
+{
+       const struct {
+               struct in6_addr saddr;
+               struct in6_addr daddr;
+               u32 seed;
+               __be16 sport;
+               __be16 dport;
+       } __aligned(SIPHASH_ALIGNMENT) combined = {
+               .saddr = *(struct in6_addr *)saddr,
+               .daddr = *(struct in6_addr *)daddr,
+               .seed = mptcp_seed++,
+               .sport = sport,
+               .dport = dport
+       };
+
+       return siphash(&combined, offsetofend(typeof(combined), dport),
+                      &mptcp_secret);
+}
+
+u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
+                    __be16 sport, __be16 dport, u32 seed)
+{
+       const struct {
+               struct in6_addr saddr;
+               struct in6_addr daddr;
+               u32 seed;
+               __be16 sport;
+               __be16 dport;
+       } __aligned(SIPHASH_ALIGNMENT) combined = {
+               .saddr = *(struct in6_addr *)saddr,
+               .daddr = *(struct in6_addr *)daddr,
+               .seed = seed,
+               .sport = sport,
+               .dport = dport
+       };
+
+       return siphash(&combined, offsetofend(typeof(combined), dport),
+                      &mptcp_secret);
+}
+
+static void mptcp_v6_reqsk_destructor(struct request_sock *req)
+{
+       mptcp_reqsk_destructor(req);
+
+       tcp_v6_reqsk_destructor(req);
+}
+
+static int mptcp_v6_init_req(struct request_sock *req, const struct sock *sk,
+                            struct sk_buff *skb, bool want_cookie)
+{
+       tcp_request_sock_ipv6_ops.init_req(req, sk, skb, want_cookie);
+
+       mptcp_rsk(req)->hash_entry.pprev = NULL;
+       mptcp_rsk(req)->is_sub = 0;
+       inet_rsk(req)->mptcp_rqsk = 1;
+
+       /* In case of SYN-cookies, we wait for the isn to be generated - it is
+        * input to the key-generation.
+        */
+       if (!want_cookie)
+               mptcp_reqsk_init(req, sk, skb, false);
+
+       return 0;
+}
+
+#ifdef CONFIG_SYN_COOKIES
+static u32 mptcp_v6_cookie_init_seq(struct request_sock *req, const struct sock *sk,
+                                   const struct sk_buff *skb, __u16 *mssp)
+{
+       __u32 isn = cookie_v6_init_sequence(req, sk, skb, mssp);
+
+       tcp_rsk(req)->snt_isn = isn;
+
+       mptcp_reqsk_init(req, sk, skb, true);
+
+       return isn;
+}
+#endif
+
+static int mptcp_v6_join_init_req(struct request_sock *req, const struct sock *sk,
+                                 struct sk_buff *skb, bool want_cookie)
+{
+       struct mptcp_request_sock *mtreq = mptcp_rsk(req);
+       const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+       union inet_addr addr;
+       int loc_id;
+       bool low_prio = false;
+
+       /* We need to do this as early as possible, because if we fail later
+        * (e.g., in get_local_id), reqsk_free tries to remove the
+        * request-socket from the hashtable in mptcp_hash_request_remove,
+        * as pprev may be different from NULL.
+        */
+       mtreq->hash_entry.pprev = NULL;
+
+       tcp_request_sock_ipv6_ops.init_req(req, sk, skb, want_cookie);
+
+       mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
+                                                   ipv6_hdr(skb)->daddr.s6_addr32,
+                                                   tcp_hdr(skb)->source,
+                                                   tcp_hdr(skb)->dest);
+       addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
+       loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
+       if (loc_id == -1)
+               return -1;
+       mtreq->loc_id = loc_id;
+       mtreq->low_prio = low_prio;
+
+       mptcp_join_reqsk_init(mpcb, req, skb);
+
+       return 0;
+}
+
+/* Similar to tcp6_request_sock_ops */
+struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
+       .family         =       AF_INET6,
+       .obj_size       =       sizeof(struct mptcp_request_sock),
+       .rtx_syn_ack    =       tcp_rtx_synack,
+       .send_ack       =       tcp_v6_reqsk_send_ack,
+       .destructor     =       mptcp_v6_reqsk_destructor,
+       .send_reset     =       tcp_v6_send_reset,
+       .syn_ack_timeout =      tcp_syn_ack_timeout,
+};
+
+static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
+{
+       return tcp_conn_request(&mptcp6_request_sock_ops,
+                               &mptcp_join_request_sock_ipv6_ops,
+                               meta_sk, skb);
+}
+
+int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
+{
+       const struct tcphdr *th = tcp_hdr(skb);
+       const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+       struct sock *child, *rsk = NULL, *sk;
+       int ret;
+
+       sk = __inet6_lookup_established(sock_net(meta_sk),
+                                       &tcp_hashinfo,
+                                       &ip6h->saddr, th->source,
+                                       &ip6h->daddr, ntohs(th->dest),
+                                       tcp_v6_iif(skb), tcp_v6_sdif(skb));
+
+       if (!sk)
+               goto new_subflow;
+
+       if (is_meta_sk(sk)) {
+               WARN("%s Did not find a sub-sk - did found the meta!\n", __func__);
+               sock_put(sk);
+               goto discard;
+       }
+
+       if (sk->sk_state == TCP_TIME_WAIT) {
+               inet_twsk_put(inet_twsk(sk));
+               goto discard;
+       }
+
+       if (sk->sk_state == TCP_NEW_SYN_RECV) {
+               struct request_sock *req = inet_reqsk(sk);
+
+               if (!mptcp_can_new_subflow(meta_sk))
+                       goto reset_and_discard;
+
+               local_bh_disable();
+               child = tcp_check_req(meta_sk, skb, req, false);
+               if (!child) {
+                       reqsk_put(req);
+                       local_bh_enable();
+                       goto discard;
+               }
+
+               if (child != meta_sk) {
+                       ret = mptcp_finish_handshake(child, skb);
+                       if (ret) {
+                               rsk = child;
+                               local_bh_enable();
+                               goto reset_and_discard;
+                       }
+
+                       local_bh_enable();
+                       return 0;
+               }
+
+               /* tcp_check_req failed */
+               reqsk_put(req);
+
+               local_bh_enable();
+               goto discard;
+       }
+
+       ret = tcp_v6_do_rcv(sk, skb);
+       sock_put(sk);
+
+       return ret;
+
+new_subflow:
+       if (!mptcp_can_new_subflow(meta_sk))
+               goto reset_and_discard;
+
+       child = tcp_v6_cookie_check(meta_sk, skb);
+       if (!child)
+               goto discard;
+
+       if (child != meta_sk) {
+               ret = mptcp_finish_handshake(child, skb);
+               if (ret) {
+                       rsk = child;
+                       goto reset_and_discard;
+               }
+       }
+
+       if (tcp_hdr(skb)->syn) {
+               local_bh_disable();
+               mptcp_v6_join_request(meta_sk, skb);
+               local_bh_enable();
+       }
+
+discard:
+       kfree_skb(skb);
+       return 0;
+
+reset_and_discard:
+       tcp_v6_send_reset(rsk, skb);
+       goto discard;
+}
+
+/* Create a new IPv6 subflow.
+ *
+ * We are in user-context and the meta-sock lock is held.
+ */
+int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
+                          struct mptcp_rem6 *rem)
+{
+       struct tcp_sock *tp;
+       struct sock *sk;
+       struct sockaddr_in6 loc_in, rem_in;
+       struct socket_alloc sock_full;
+       struct socket *sock = (struct socket *)&sock_full;
+       int ret;
+
+       /* First, create and prepare the new socket */
+       memcpy(&sock_full, meta_sk->sk_socket, sizeof(sock_full));
+       sock->state = SS_UNCONNECTED;
+       sock->ops = NULL;
+
+       ret = inet6_create(sock_net(meta_sk), sock, IPPROTO_TCP, 1);
+       if (unlikely(ret < 0)) {
+               mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
+               return ret;
+       }
+
+       sk = sock->sk;
+       tp = tcp_sk(sk);
+
+       /* All subsockets need the MPTCP-lock-class */
+       lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, meta_slock_key_name);
+       lockdep_init_map(&(sk)->sk_lock.dep_map, meta_key_name, &meta_key, 0);
+
+       if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
+               goto error;
+
+       tp->mptcp->slave_sk = 1;
+       tp->mptcp->low_prio = loc->low_prio;
+
+       /* Initializing the timer for an MPTCP subflow */
+       setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
+
+       /* Then, connect the socket to the peer */
+       loc_in.sin6_family = AF_INET6;
+       rem_in.sin6_family = AF_INET6;
+       loc_in.sin6_port = 0;
+       if (rem->port)
+               rem_in.sin6_port = rem->port;
+       else
+               rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
+       loc_in.sin6_addr = loc->addr;
+       rem_in.sin6_addr = rem->addr;
+
+       if (loc->if_idx)
+               sk->sk_bound_dev_if = loc->if_idx;
+
+       ret = kernel_bind(sock, (struct sockaddr *)&loc_in,
+                         sizeof(struct sockaddr_in6));
+       if (ret < 0) {
+               mptcp_debug("%s: MPTCP subsocket bind()failed, error %d\n",
+                           __func__, ret);
+               goto error;
+       }
+
+       mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d ifidx: %u\n",
+                   __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
+                   tp->mptcp->path_index, &loc_in.sin6_addr,
+                   ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
+                   ntohs(rem_in.sin6_port), loc->if_idx);
+
+       if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
+               tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
+
+       ret = kernel_connect(sock, (struct sockaddr *)&rem_in,
+                            sizeof(struct sockaddr_in6), O_NONBLOCK);
+       if (ret < 0 && ret != -EINPROGRESS) {
+               mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
+                           __func__, ret);
+               goto error;
+       }
+
+       MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_JOINSYNTX);
+
+       sk_set_socket(sk, meta_sk->sk_socket);
+       sk->sk_wq = meta_sk->sk_wq;
+
+       return 0;
+
+error:
+       /* May happen if mptcp_add_sock fails first */
+       if (!mptcp(tp)) {
+               tcp_close(sk, 0);
+       } else {
+               local_bh_disable();
+               mptcp_sub_force_close(sk);
+               local_bh_enable();
+       }
+       return ret;
+}
+EXPORT_SYMBOL(mptcp_init6_subsockets);
+
+const struct inet_connection_sock_af_ops mptcp_v6_specific = {
+       .queue_xmit        = inet6_csk_xmit,
+       .send_check        = tcp_v6_send_check,
+       .rebuild_header    = inet6_sk_rebuild_header,
+       .sk_rx_dst_set     = inet6_sk_rx_dst_set,
+       .conn_request      = mptcp_conn_request,
+       .syn_recv_sock     = tcp_v6_syn_recv_sock,
+       .net_header_len    = sizeof(struct ipv6hdr),
+       .net_frag_header_len = sizeof(struct frag_hdr),
+       .setsockopt        = ipv6_setsockopt,
+       .getsockopt        = ipv6_getsockopt,
+       .addr2sockaddr     = inet6_csk_addr2sockaddr,
+       .sockaddr_len      = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+       .compat_setsockopt = compat_ipv6_setsockopt,
+       .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+       .mtu_reduced       = tcp_v6_mtu_reduced,
+};
+
+const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
+       .queue_xmit        = ip_queue_xmit,
+       .send_check        = tcp_v4_send_check,
+       .rebuild_header    = inet_sk_rebuild_header,
+       .sk_rx_dst_set     = inet_sk_rx_dst_set,
+       .conn_request      = mptcp_conn_request,
+       .syn_recv_sock     = tcp_v6_syn_recv_sock,
+       .net_header_len    = sizeof(struct iphdr),
+       .setsockopt        = ipv6_setsockopt,
+       .getsockopt        = ipv6_getsockopt,
+       .addr2sockaddr     = inet6_csk_addr2sockaddr,
+       .sockaddr_len      = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+       .compat_setsockopt = compat_ipv6_setsockopt,
+       .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
+       .mtu_reduced       = tcp_v4_mtu_reduced,
+};
+
+struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
+struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
+
+int mptcp_pm_v6_init(void)
+{
+       int ret = 0;
+       struct request_sock_ops *ops = &mptcp6_request_sock_ops;
+
+       mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+       mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
+#ifdef CONFIG_SYN_COOKIES
+       mptcp_request_sock_ipv6_ops.cookie_init_seq = mptcp_v6_cookie_init_seq;
+#endif
+
+       mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
+       mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
+
+       ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
+       if (ops->slab_name == NULL) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
+                                     SLAB_TYPESAFE_BY_RCU|SLAB_HWCACHE_ALIGN,
+                                     NULL);
+
+       if (ops->slab == NULL) {
+               ret =  -ENOMEM;
+               goto err_reqsk_create;
+       }
+
+out:
+       return ret;
+
+err_reqsk_create:
+       kfree(ops->slab_name);
+       ops->slab_name = NULL;
+       goto out;
+}
+
+void mptcp_pm_v6_undo(void)
+{
+       kmem_cache_destroy(mptcp6_request_sock_ops.slab);
+       kfree(mptcp6_request_sock_ops.slab_name);
+}
diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
new file mode 100644 (file)
index 0000000..10147d5
--- /dev/null
@@ -0,0 +1,169 @@
+#include <linux/module.h>
+
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/mptcp_v6.h>
+#endif
+
+struct ndiffports_priv {
+       /* Worker struct for subflow establishment */
+       struct work_struct subflow_work;
+
+       struct mptcp_cb *mpcb;
+};
+
+static int num_subflows __read_mostly = 2;
+module_param(num_subflows, int, 0644);
+MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
+
+/* Create all new subflows by calling mptcp_initX_subsockets.
+ *
+ * This function uses a 'goto next_subflow' to allow releasing the lock between
+ * new subflows, giving other processes a chance to do some work on the
+ * socket and potentially finish the communication.
+ */
+static void create_subflow_worker(struct work_struct *work)
+{
+       const struct ndiffports_priv *pm_priv = container_of(work,
+                                                    struct ndiffports_priv,
+                                                    subflow_work);
+       struct mptcp_cb *mpcb = pm_priv->mpcb;
+       struct sock *meta_sk = mpcb->meta_sk;
+       int iter = 0;
+
+next_subflow:
+       if (iter) {
+               release_sock(meta_sk);
+               mutex_unlock(&mpcb->mpcb_mutex);
+
+               cond_resched();
+       }
+       mutex_lock(&mpcb->mpcb_mutex);
+       lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
+
+       iter++;
+
+       if (sock_flag(meta_sk, SOCK_DEAD))
+               goto exit;
+
+       if (mpcb->master_sk &&
+           !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
+               goto exit;
+
+       if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
+               if (meta_sk->sk_family == AF_INET ||
+                   mptcp_v6_is_v4_mapped(meta_sk)) {
+                       struct mptcp_loc4 loc;
+                       struct mptcp_rem4 rem;
+
+                       loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
+                       loc.loc4_id = 0;
+                       loc.low_prio = 0;
+                       if (mpcb->master_sk)
+                               loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
+                       else
+                               loc.if_idx = 0;
+
+                       rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
+                       rem.port = inet_sk(meta_sk)->inet_dport;
+                       rem.rem4_id = 0; /* Default 0 */
+
+                       mptcp_init4_subsockets(meta_sk, &loc, &rem);
+               } else {
+#if IS_ENABLED(CONFIG_IPV6)
+                       struct mptcp_loc6 loc;
+                       struct mptcp_rem6 rem;
+
+                       loc.addr = inet6_sk(meta_sk)->saddr;
+                       loc.loc6_id = 0;
+                       loc.low_prio = 0;
+                       if (mpcb->master_sk)
+                               loc.if_idx = mpcb->master_sk->sk_bound_dev_if;
+                       else
+                               loc.if_idx = 0;
+
+                       rem.addr = meta_sk->sk_v6_daddr;
+                       rem.port = inet_sk(meta_sk)->inet_dport;
+                       rem.rem6_id = 0; /* Default 0 */
+
+                       mptcp_init6_subsockets(meta_sk, &loc, &rem);
+#endif
+               }
+               goto next_subflow;
+       }
+
+exit:
+       release_sock(meta_sk);
+       mutex_unlock(&mpcb->mpcb_mutex);
+       sock_put(meta_sk);
+}
+
+static void ndiffports_new_session(const struct sock *meta_sk)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
+
+       /* Initialize workqueue-struct */
+       INIT_WORK(&fmp->subflow_work, create_subflow_worker);
+       fmp->mpcb = mpcb;
+}
+
+static void ndiffports_create_subflows(struct sock *meta_sk)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
+
+       if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
+           mpcb->send_infinite_mapping ||
+           mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
+               return;
+
+       if (!work_pending(&pm_priv->subflow_work)) {
+               sock_hold(meta_sk);
+               queue_work(mptcp_wq, &pm_priv->subflow_work);
+       }
+}
+
+static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
+                                  struct net *net, bool *low_prio)
+{
+       return 0;
+}
+
+static struct mptcp_pm_ops ndiffports __read_mostly = {
+       .new_session = ndiffports_new_session,
+       .fully_established = ndiffports_create_subflows,
+       .get_local_id = ndiffports_get_local_id,
+       .name = "ndiffports",
+       .owner = THIS_MODULE,
+};
+
+/* General initialization of MPTCP_PM */
+static int __init ndiffports_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
+
+       if (mptcp_register_path_manager(&ndiffports))
+               goto exit;
+
+       return 0;
+
+exit:
+       return -1;
+}
+
+static void ndiffports_unregister(void)
+{
+       mptcp_unregister_path_manager(&ndiffports);
+}
+
+module_init(ndiffports_register);
+module_exit(ndiffports_unregister);
+
+MODULE_AUTHOR("Christoph Paasch");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
+MODULE_VERSION("0.88");
diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
new file mode 100644 (file)
index 0000000..0441066
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
+ *
+ * Algorithm design:
+ * Ramin Khalili <ramin.khalili@epfl.ch>
+ * Nicolas Gast <nicolas.gast@epfl.ch>
+ * Jean-Yves Le Boudec <jean-yves.leboudec@epfl.ch>
+ *
+ * Implementation:
+ * Ramin Khalili <ramin.khalili@epfl.ch>
+ *
+ * Ported to the official MPTCP-kernel:
+ * Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <net/tcp.h>
+#include <net/mptcp.h>
+
+#include <linux/module.h>
+
+static int scale = 10;
+
+struct mptcp_olia {
+       u32     mptcp_loss1;
+       u32     mptcp_loss2;
+       u32     mptcp_loss3;
+       int     epsilon_num;
+       u32     epsilon_den;
+       int     mptcp_snd_cwnd_cnt;
+};
+
+static inline int mptcp_olia_sk_can_send(const struct sock *sk)
+{
+       return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
+}
+
+static inline u64 mptcp_olia_scale(u64 val, int scale)
+{
+       return (u64) val << scale;
+}
+
+/* Take care of the artificial inflation of cwnd (see RFC 5681)
+ * during the fast-retransmit phase.
+ */
+static u32 mptcp_get_crt_cwnd(struct sock *sk)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+
+       if (icsk->icsk_ca_state == TCP_CA_Recovery)
+               return tcp_sk(sk)->snd_ssthresh;
+       else
+               return tcp_sk(sk)->snd_cwnd;
+}
+
+/* Return the denominator of the first term of the increase formula */
+static u64 mptcp_get_rate(const struct mptcp_cb *mpcb, u32 path_rtt)
+{
+       struct sock *sk;
+       u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
+
+       mptcp_for_each_sk(mpcb, sk) {
+               struct tcp_sock *tp = tcp_sk(sk);
+               u64 scaled_num;
+               u32 tmp_cwnd;
+
+               if (!mptcp_olia_sk_can_send(sk))
+                       continue;
+
+               tmp_cwnd = mptcp_get_crt_cwnd(sk);
+               scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
+               rate += div_u64(scaled_num, tp->srtt_us);
+       }
+       rate *= rate;
+       return rate;
+}
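Up to the 2^(2*scale) fixed-point factor, the value returned here is the squared denominator of OLIA's first increase term, expressed relative to this path's RTT. In the paper's notation (a hedged reading; w_p is the window and rtt_p the smoothed RTT of subflow p, r the calling path):

\[
\mathrm{rate} \approx \Bigl(\sum_{p} w_p \,\frac{\mathrm{rtt}_r}{\mathrm{rtt}_p}\Bigr)^{2}
 = \mathrm{rtt}_r^{2} \Bigl(\sum_{p} \frac{w_p}{\mathrm{rtt}_p}\Bigr)^{2}
\]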
+
+/* find the maximum cwnd, used to find set M */
+static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
+{
+       struct sock *sk;
+       u32 best_cwnd = 0;
+
+       mptcp_for_each_sk(mpcb, sk) {
+               u32 tmp_cwnd;
+
+               if (!mptcp_olia_sk_can_send(sk))
+                       continue;
+
+               tmp_cwnd = mptcp_get_crt_cwnd(sk);
+               if (tmp_cwnd > best_cwnd)
+                       best_cwnd = tmp_cwnd;
+       }
+       return best_cwnd;
+}
+
+static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
+{
+       struct mptcp_olia *ca;
+       struct tcp_sock *tp;
+       struct sock *sk;
+       u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
+       u32 max_cwnd, tmp_cwnd;
+       u8 M = 0, B_not_M = 0;
+
+       /* TODO - integrate this in the following loop - we just want to iterate once */
+
+       max_cwnd = mptcp_get_max_cwnd(mpcb);
+
+       /* find the best path */
+       mptcp_for_each_sk(mpcb, sk) {
+               tp = tcp_sk(sk);
+               ca = inet_csk_ca(sk);
+
+               if (!mptcp_olia_sk_can_send(sk))
+                       continue;
+
+               tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
+               /* TODO - check here and rename variables */
+               tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
+                             ca->mptcp_loss2 - ca->mptcp_loss1);
+
+               if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
+                       best_rtt = tmp_rtt;
+                       best_int = tmp_int;
+               }
+       }
+
+       /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
+       /* find the size of M and B_not_M */
+       mptcp_for_each_sk(mpcb, sk) {
+               tp = tcp_sk(sk);
+               ca = inet_csk_ca(sk);
+
+               if (!mptcp_olia_sk_can_send(sk))
+                       continue;
+
+               tmp_cwnd = mptcp_get_crt_cwnd(sk);
+               if (tmp_cwnd == max_cwnd) {
+                       M++;
+               } else {
+                       tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
+                       tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
+                                     ca->mptcp_loss2 - ca->mptcp_loss1);
+
+                       if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
+                               B_not_M++;
+               }
+       }
+
+       /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
+       mptcp_for_each_sk(mpcb, sk) {
+               tp = tcp_sk(sk);
+               ca = inet_csk_ca(sk);
+
+               if (!mptcp_olia_sk_can_send(sk))
+                       continue;
+
+               if (B_not_M == 0) {
+                       ca->epsilon_num = 0;
+                       ca->epsilon_den = 1;
+               } else {
+                       tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
+                       tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
+                                     ca->mptcp_loss2 - ca->mptcp_loss1);
+                       tmp_cwnd = mptcp_get_crt_cwnd(sk);
+
+                       if (tmp_cwnd < max_cwnd &&
+                           (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
+                               ca->epsilon_num = 1;
+                               ca->epsilon_den = mpcb->cnt_established * B_not_M;
+                       } else if (tmp_cwnd == max_cwnd) {
+                               ca->epsilon_num = -1;
+                               ca->epsilon_den = mpcb->cnt_established  * M;
+                       } else {
+                               ca->epsilon_num = 0;
+                               ca->epsilon_den = 1;
+                       }
+               }
+       }
+}
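In the OLIA paper's notation this encodes the epsilon term as the ratio epsilon_num/epsilon_den (a hedged reconstruction): with n = cnt_established subflows, M the set of paths with maximal window, and B the set of presumed-best paths (maximal ratio of squared inter-loss gap to rtt^2),

\[
\epsilon_r =
\begin{cases}
\dfrac{1}{n\,\lvert B \setminus M \rvert} & \text{if } r \in B \setminus M \text{ and } w_r < \max_p w_p,\\[6pt]
-\dfrac{1}{n\,\lvert M \rvert} & \text{if } r \in M \text{ and } B \setminus M \neq \emptyset,\\[6pt]
0 & \text{otherwise.}
\end{cases}
\]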
+
+/* setting the initial values */
+static void mptcp_olia_init(struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_olia *ca = inet_csk_ca(sk);
+
+       if (mptcp(tp)) {
+               ca->mptcp_loss1 = tp->snd_una;
+               ca->mptcp_loss2 = tp->snd_una;
+               ca->mptcp_loss3 = tp->snd_una;
+               ca->mptcp_snd_cwnd_cnt = 0;
+               ca->epsilon_num = 0;
+               ca->epsilon_den = 1;
+       }
+}
+
+/* updating inter-loss distance and ssthresh */
+static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
+{
+       if (!mptcp(tcp_sk(sk)))
+               return;
+
+       if (new_state == TCP_CA_Loss ||
+           new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
+               struct mptcp_olia *ca = inet_csk_ca(sk);
+
+               if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
+                   !inet_csk(sk)->icsk_retransmits) {
+                       ca->mptcp_loss1 = ca->mptcp_loss2;
+                       ca->mptcp_loss2 = ca->mptcp_loss3;
+               }
+       }
+}
+
+/* main algorithm */
+static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_olia *ca = inet_csk_ca(sk);
+       const struct mptcp_cb *mpcb = tp->mpcb;
+
+       u64 inc_num, inc_den, rate, cwnd_scaled;
+
+       if (!mptcp(tp)) {
+               tcp_reno_cong_avoid(sk, ack, acked);
+               return;
+       }
+
+       ca->mptcp_loss3 = tp->snd_una;
+
+       if (!tcp_is_cwnd_limited(sk))
+               return;
+
+       /* slow start if it is in the safe area */
+       if (tcp_in_slow_start(tp)) {
+               tcp_slow_start(tp, acked);
+               return;
+       }
+
+       mptcp_get_epsilon(mpcb);
+       rate = mptcp_get_rate(mpcb, tp->srtt_us);
+       cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
+       inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
+
+       /* Calculate the increase term; scaling is used to reduce the rounding effect */
+       if (ca->epsilon_num == -1) {
+               if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
+                       inc_num = rate - ca->epsilon_den *
+                               cwnd_scaled * cwnd_scaled;
+                       ca->mptcp_snd_cwnd_cnt -= div64_u64(
+                           mptcp_olia_scale(inc_num, scale), inc_den);
+               } else {
+                       inc_num = ca->epsilon_den *
+                           cwnd_scaled * cwnd_scaled - rate;
+                       ca->mptcp_snd_cwnd_cnt += div64_u64(
+                           mptcp_olia_scale(inc_num, scale), inc_den);
+               }
+       } else {
+               inc_num = ca->epsilon_num * rate +
+                   ca->epsilon_den * cwnd_scaled * cwnd_scaled;
+               ca->mptcp_snd_cwnd_cnt += div64_u64(
+                   mptcp_olia_scale(inc_num, scale), inc_den);
+       }
+
+
+       if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
+               if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+                       tp->snd_cwnd++;
+               ca->mptcp_snd_cwnd_cnt = 0;
+       } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
+               tp->snd_cwnd = max((int) 1, (int) tp->snd_cwnd - 1);
+               ca->mptcp_snd_cwnd_cnt = 0;
+       }
+}
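Dividing inc_num by inc_den and dropping the 2^scale factors, the adjustment accumulated in mptcp_snd_cwnd_cnt corresponds to OLIA's per-ACK window update (hedged reconstruction; the window moves by one packet once the counter crosses +/-1 in fixed point):

\[
w_r \leftarrow w_r + \frac{w_r/\mathrm{rtt}_r^{2}}{\bigl(\sum_{p} w_p/\mathrm{rtt}_p\bigr)^{2}} + \frac{\epsilon_r}{w_r}
\]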
+
+static struct tcp_congestion_ops mptcp_olia = {
+       .init           = mptcp_olia_init,
+       .ssthresh       = tcp_reno_ssthresh,
+       .cong_avoid     = mptcp_olia_cong_avoid,
+       .set_state      = mptcp_olia_set_state,
+       .owner          = THIS_MODULE,
+       .name           = "olia",
+};
+
+static int __init mptcp_olia_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
+       return tcp_register_congestion_control(&mptcp_olia);
+}
+
+static void __exit mptcp_olia_unregister(void)
+{
+       tcp_unregister_congestion_control(&mptcp_olia);
+}
+
+module_init(mptcp_olia_register);
+module_exit(mptcp_olia_unregister);
+
+MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
+MODULE_VERSION("0.1");
diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
new file mode 100644 (file)
index 0000000..e42e2e2
--- /dev/null
@@ -0,0 +1,1806 @@
+/*
+ *     MPTCP implementation - Sending side
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kconfig.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <net/mptcp.h>
+#include <net/mptcp_v4.h>
+#include <net/mptcp_v6.h>
+#include <net/sock.h>
+
+static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
+                                MPTCP_SUB_LEN_ACK_ALIGN +
+                                MPTCP_SUB_LEN_SEQ_ALIGN;
+
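+/* Sizing note: this is the aligned DSS header plus a data-ack word plus
+ * the seq/subseq/len mapping - i.e. the portion of the dss[] scratch
+ * space in tcp_skb_cb that mptcp_save_dss_data_seq() and
+ * mptcp_write_dss_data_seq() below write and read.
+ */
+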
+static inline int mptcp_sub_len_remove_addr(u16 bitfield)
+{
+       unsigned int c;
+
+       for (c = 0; bitfield; c++)
+               bitfield &= bitfield - 1;
+       return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
+}
+
+int mptcp_sub_len_remove_addr_align(u16 bitfield)
+{
+       return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
+}
+EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
+
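+/* Illustrative example: for a hypothetical bitfield of 0x13 (bits 0, 1
+ * and 4 set, i.e. three address-ids to remove), the loop above clears
+ * one set bit per iteration, so c = 3 and the option length becomes
+ * MPTCP_SUB_LEN_REMOVE_ADDR + 2. The _align variant then rounds this
+ * up to the next multiple of 4, as required for TCP-option padding.
+ */
+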
+/* get the data-seq and end-data-seq and store them again in the
+ * tcp_skb_cb
+ */
+static bool mptcp_reconstruct_mapping(struct sk_buff *skb)
+{
+       const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
+       u32 *p32;
+       u16 *p16;
+
+       if (!mptcp_is_data_seq(skb))
+               return false;
+
+       if (!mpdss->M)
+               return false;
+
+       /* Move the pointer to the data-seq */
+       p32 = (u32 *)mpdss;
+       p32++;
+       if (mpdss->A) {
+               p32++;
+               if (mpdss->a)
+                       p32++;
+       }
+
+       TCP_SKB_CB(skb)->seq = ntohl(*p32);
+
+       /* Get the data_len to calculate the end_data_seq */
+       p32++;
+       p32++;
+       p16 = (u16 *)p32;
+       TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
+
+       return true;
+}
+
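+/* Illustrative walk of the pointer arithmetic above for a DSS option
+ * carrying a 32-bit data-ack (A = 1, a = 0): p32 first skips the 4-byte
+ * option header, then the 4-byte data-ack, and reads the 4-byte
+ * data-seq. It then steps over data-seq and subflow-seq to land on the
+ * 16-bit data-len, from which end_seq = seq + data_len is rebuilt.
+ */
+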
+static bool mptcp_is_reinjected(const struct sk_buff *skb)
+{
+       return TCP_SKB_CB(skb)->mptcp_flags & MPTCP_REINJECT;
+}
+
+static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb)
+{
+       struct sk_buff *skb_it;
+
+       skb_it = tcp_write_queue_head(meta_sk);
+
+       tcp_for_write_queue_from(skb_it, meta_sk) {
+               if (skb_it == tcp_send_head(meta_sk))
+                       break;
+
+               if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
+                       TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
+                       break;
+               }
+       }
+}
+
+/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
+ * coming from the meta-retransmit-timer
+ */
+static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
+                                 struct sock *sk, int clone_it)
+{
+       struct sk_buff *skb, *skb1;
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       u32 seq, end_seq;
+
+       if (clone_it) {
+               /* pskb_copy is necessary here, because the TCP/IP-headers
+                * will be changed when it's going to be reinjected on another
+                * subflow.
+                */
+               skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
+       } else {
+               __skb_unlink(orig_skb, &sk->sk_write_queue);
+               sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+               sk->sk_wmem_queued -= orig_skb->truesize;
+               sk_mem_uncharge(sk, orig_skb->truesize);
+               skb = orig_skb;
+       }
+       if (unlikely(!skb))
+               return;
+
+       if (sk && !mptcp_reconstruct_mapping(skb)) {
+               __kfree_skb(skb);
+               return;
+       }
+
+       skb->sk = meta_sk;
+
+       /* Reset subflow-specific TCP control-data */
+       TCP_SKB_CB(skb)->sacked = 0;
+       TCP_SKB_CB(skb)->tcp_flags &= (TCPHDR_ACK | TCPHDR_PSH);
+
+       /* If it has already reached the destination, we don't have to reinject it */
+       if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
+               __kfree_skb(skb);
+               return;
+       }
+
+       /* Only reinject segments that are fully covered by the mapping */
+       if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
+           TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
+               u32 seq = TCP_SKB_CB(skb)->seq;
+               u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+               __kfree_skb(skb);
+
+               /* Ok, now we have to look for the full mapping in the meta
+                * send-queue :S
+                */
+               tcp_for_write_queue(skb, meta_sk) {
+                       /* Not yet at the mapping? */
+                       if (before(TCP_SKB_CB(skb)->seq, seq))
+                               continue;
+                       /* We have passed by the mapping */
+                       if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
+                               return;
+
+                       __mptcp_reinject_data(skb, meta_sk, NULL, 1);
+               }
+               return;
+       }
+
+       /* Segment goes back to the MPTCP-layer. So, we need to zero the
+        * path_mask/dss.
+        */
+       memset(TCP_SKB_CB(skb)->dss, 0, mptcp_dss_len);
+
+       /* We need to find out the path-mask from the meta-write-queue
+        * to properly select a subflow.
+        */
+       mptcp_find_and_set_pathmask(meta_sk, skb);
+
+       /* If it's empty, just add */
+       if (skb_queue_empty(&mpcb->reinject_queue)) {
+               skb_queue_head(&mpcb->reinject_queue, skb);
+               return;
+       }
+
+       /* Find the place to insert the skb - or even 'drop' it, if the
+        * data is already covered by other skbs in the reinject-queue.
+        *
+        * This is inspired by code from tcp_data_queue.
+        */
+
+       skb1 = skb_peek_tail(&mpcb->reinject_queue);
+       seq = TCP_SKB_CB(skb)->seq;
+       while (1) {
+               if (!after(TCP_SKB_CB(skb1)->seq, seq))
+                       break;
+               if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
+                       skb1 = NULL;
+                       break;
+               }
+               skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
+       }
+
+       /* Does the skb overlap the previous one? */
+       end_seq = TCP_SKB_CB(skb)->end_seq;
+       if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+               if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+                       /* All the bits are present. Don't reinject */
+                       __kfree_skb(skb);
+                       return;
+               }
+               if (seq == TCP_SKB_CB(skb1)->seq) {
+                       if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
+                               skb1 = NULL;
+                       else
+                               skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
+               }
+       }
+       if (!skb1)
+               __skb_queue_head(&mpcb->reinject_queue, skb);
+       else
+               __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
+
+       /* And clean segments covered by new one as whole. */
+       while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
+               skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
+
+               if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
+                       break;
+
+               __skb_unlink(skb1, &mpcb->reinject_queue);
+               __kfree_skb(skb1);
+       }
+}
+
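+/* Illustrative run of the ordered insert above: with mappings [100,200)
+ * and [200,300) already queued, reinjecting [100,300) walks back to the
+ * first segment, queues the new skb at the head (it shares the same
+ * start-seq), and the trailing clean-up loop then frees both old
+ * segments, since the new one covers each of them as a whole.
+ */
+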
+/* Inserts data into the reinject queue */
+void mptcp_reinject_data(struct sock *sk, int clone_it)
+{
+       struct sk_buff *skb_it, *tmp;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sock *meta_sk = tp->meta_sk;
+
+       /* It has already been closed - there is really no point in reinjecting */
+       if (meta_sk->sk_state == TCP_CLOSE)
+               return;
+
+       skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
+               struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
+               /* Subflow SYNs and FINs are not reinjected.
+                *
+                * Neither are empty subflow-FINs that carry a data-fin;
+                * those are reinjected below (without the subflow-FIN flag).
+                */
+               if (tcb->tcp_flags & TCPHDR_SYN ||
+                   (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
+                   (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
+                       continue;
+
+               if (mptcp_is_reinjected(skb_it))
+                       continue;
+
+               tcb->mptcp_flags |= MPTCP_REINJECT;
+               __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
+       }
+
+       skb_it = tcp_write_queue_tail(meta_sk);
+       /* If sk has sent the empty data-fin, we have to reinject it too. */
+       if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
+           TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
+               __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
+       }
+
+       tp->pf = 1;
+
+       mptcp_push_pending_frames(meta_sk);
+}
+EXPORT_SYMBOL(mptcp_reinject_data);
+
+static void mptcp_combine_dfin(const struct sk_buff *skb,
+                              const struct sock *meta_sk,
+                              struct sock *subsk)
+{
+       const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       const struct mptcp_cb *mpcb = meta_tp->mpcb;
+
+       /* In infinite mapping we always try to combine */
+       if (mpcb->infinite_mapping_snd)
+               goto combine;
+
+       /* Don't combine if the peer didn't combine when closing - otherwise
+        * we end up in TIME_WAIT, even if our app is smart enough to avoid it.
+        */
+       if (!mptcp_sk_can_recv(meta_sk) && !mpcb->dfin_combined)
+               return;
+
+       /* Don't combine if there is still outstanding data that remains to be
+        * DATA_ACKed, because otherwise we may never be able to deliver this.
+        */
+       if (meta_tp->snd_una != TCP_SKB_CB(skb)->seq)
+               return;
+
+combine:
+       if (tcp_close_state(subsk)) {
+               subsk->sk_shutdown |= SEND_SHUTDOWN;
+               TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
+       }
+}
+
+static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
+                                  __be32 *ptr)
+{
+       const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       __be32 *start = ptr;
+       __u16 data_len;
+
+       *ptr++ = htonl(tcb->seq); /* data_seq */
+
+       /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
+       if (mptcp_is_data_fin(skb) && skb->len == 0)
+               *ptr++ = 0; /* subseq */
+       else
+               *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
+
+       if (tcb->mptcp_flags & MPTCPHDR_INF)
+               data_len = 0;
+       else
+               data_len = tcb->end_seq - tcb->seq;
+
+       if (tp->mpcb->dss_csum && data_len) {
+               __be16 *p16 = (__be16 *)ptr;
+               __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
+               __wsum csum;
+
+               *ptr = htonl(((data_len) << 16) |
+                            (TCPOPT_EOL << 8) |
+                            (TCPOPT_EOL));
+               csum = csum_partial(ptr - 2, 12, skb->csum);
+               p16++;
+               *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
+       } else {
+               *ptr++ = htonl(((data_len) << 16) |
+                              (TCPOPT_NOP << 8) |
+                              (TCPOPT_NOP));
+       }
+
+       return ptr - start;
+}
+
+static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
+                                   __be32 *ptr)
+{
+       struct mp_dss *mdss = (struct mp_dss *)ptr;
+       __be32 *start = ptr;
+
+       mdss->kind = TCPOPT_MPTCP;
+       mdss->sub = MPTCP_SUB_DSS;
+       mdss->rsv1 = 0;
+       mdss->rsv2 = 0;
+       mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
+       mdss->m = 0;
+       mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
+       mdss->a = 0;
+       mdss->A = 1;
+       mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
+       ptr++;
+
+       *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
+
+       return ptr - start;
+}
+
+/* RFC6824 states that once a particular subflow mapping has been sent
+ * out it must never be changed. However, packets may be split while
+ * they are in the retransmission queue (due to SACK or ACKs) and that
+ * arguably means that we would change the mapping (e.g. it gets split,
+ * or only a subset of the initial mapping is sent out).
+ *
+ * Furthermore, the skb checksum is not always preserved across splits
+ * (e.g. mptcp_fragment) which would mean that we need to recompute
+ * the DSS checksum in this case.
+ *
+ * To avoid this we save the initial DSS mapping which allows us to
+ * send the same DSS mapping even for fragmented retransmits.
+ */
+static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
+{
+       struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+       __be32 *ptr = (__be32 *)tcb->dss;
+
+       tcb->mptcp_flags |= MPTCPHDR_SEQ;
+
+       ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
+       ptr += mptcp_write_dss_mapping(tp, skb, ptr);
+}
+
+/* Write the saved DSS mapping to the header */
+static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
+                                   __be32 *ptr)
+{
+       __be32 *start = ptr;
+
+       memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
+
+       /* update the data_ack */
+       start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
+
+       /* dss is in a union with inet_skb_parm and
+        * the IP layer expects zeroed IPCB fields.
+        */
+       memset(TCP_SKB_CB(skb)->dss, 0, mptcp_dss_len);
+
+       return mptcp_dss_len/sizeof(*ptr);
+}
+
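+/* Layout note, derived from the two writers above: the saved dss[]
+ * block is [DSS header | data-ack | data-seq | subflow-seq | len/csum],
+ * so the start[1] update in mptcp_write_dss_data_seq() hits exactly the
+ * data-ack slot - the cached mapping stays unchanged while the current
+ * rcv_nxt of the meta-socket is still advertised.
+ */
+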
+static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       const struct sock *meta_sk = mptcp_meta_sk(sk);
+       const struct mptcp_cb *mpcb = tp->mpcb;
+       struct tcp_skb_cb *tcb;
+       struct sk_buff *subskb = NULL;
+
+       if (!reinject)
+               TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
+                                                 MPTCPHDR_SEQ64_INDEX : 0);
+
+       subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
+       if (!subskb)
+               return false;
+
+       /* At the subflow-level we need to call tcp_init_tso_segs again. We
+        * force this by setting pcount to 0; it has been set to 1 prior to
+        * the call to mptcp_skb_entail.
+        */
+       tcp_skb_pcount_set(subskb, 0);
+
+       TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
+
+       /* Compute checksum, if:
+        * 1. The current route does not support csum offloading but it was
+        *    assumed that it does (ip_summed is CHECKSUM_PARTIAL)
+        * 2. We need the DSS-checksum but ended up not pre-computing it
+        *    (e.g., in the case of TFO retransmissions).
+        */
+       if (skb->ip_summed == CHECKSUM_PARTIAL &&
+           (!sk_check_csum_caps(sk) || tp->mpcb->dss_csum)) {
+               subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
+               subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
+       }
+
+       tcb = TCP_SKB_CB(subskb);
+
+       if (tp->mpcb->send_infinite_mapping &&
+           !tp->mpcb->infinite_mapping_snd &&
+           !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
+               tp->mptcp->fully_established = 1;
+               tp->mpcb->infinite_mapping_snd = 1;
+               tp->mptcp->infinite_cutoff_seq = tp->write_seq;
+               tcb->mptcp_flags |= MPTCPHDR_INF;
+       }
+
+       if (mptcp_is_data_fin(subskb))
+               mptcp_combine_dfin(subskb, meta_sk, sk);
+
+       mptcp_save_dss_data_seq(tp, subskb);
+
+       tcb->seq = tp->write_seq;
+
+       /* Take into account seg len */
+       tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
+       tcb->end_seq = tp->write_seq;
+
+       /* If it's a non-payload DATA_FIN (also no subflow-fin), the
+        * segment is not part of the subflow but on a meta-only-level.
+        */
+       if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
+               tcp_add_write_queue_tail(sk, subskb);
+               sk->sk_wmem_queued += subskb->truesize;
+               sk_mem_charge(sk, subskb->truesize);
+       } else {
+               int err;
+
+               /* Necessary initialization for tcp_transmit_skb: an mss of 1,
+                * as skb->len = 0 will force tso_segs to 1.
+                */
+               tcp_init_tso_segs(subskb, 1);
+               /* Empty data-fins are sent immediately on the subflow */
+               err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC);
+
+               /* It has not been queued, we can free it now. */
+               kfree_skb(subskb);
+
+               if (err)
+                       return false;
+       }
+
+       if (!tp->mptcp->fully_established) {
+               tp->mptcp->second_packet = 1;
+               tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
+       }
+
+       return true;
+}
+
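+/* Note on the clone in mptcp_skb_entail(): the meta-level skb stays
+ * queued on the meta-socket for end-to-end retransmission, while the
+ * pskb_copy'd subskb gets remapped into the subflow's sequence space
+ * (tcb->seq = tp->write_seq) and queued on the subflow. One data
+ * segment is thus accounted once per layer instead of being shared.
+ */
+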
+/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
+ * might need to undo some operations done by tcp_fragment.
+ */
+static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len,
+                         gfp_t gfp, int reinject)
+{
+       int ret, diff, old_factor;
+       struct sk_buff *buff;
+       u8 flags;
+
+       if (skb_headlen(skb) < len)
+               diff = skb->len - len;
+       else
+               diff = skb->data_len;
+       old_factor = tcp_skb_pcount(skb);
+
+       /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
+        * At the MPTCP-level we do not care about the absolute value. All we
+        * care about is that it is set to 1 for accurate packets_out
+        * accounting.
+        */
+       ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp);
+       if (ret)
+               return ret;
+
+       buff = skb->next;
+
+       flags = TCP_SKB_CB(skb)->mptcp_flags;
+       TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
+       TCP_SKB_CB(buff)->mptcp_flags = flags;
+       TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
+
+       /* If reinject == 1, the buff will be added to the reinject
+        * queue, which is currently not part of memory accounting. So
+        * undo the changes done by tcp_fragment and update the
+        * reinject queue. Also, undo changes to the packet counters.
+        */
+       if (reinject == 1) {
+               int undo = buff->truesize - diff;
+
+               meta_sk->sk_wmem_queued -= undo;
+               sk_mem_uncharge(meta_sk, undo);
+
+               tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
+               meta_sk->sk_write_queue.qlen--;
+
+               if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+                       undo = old_factor - tcp_skb_pcount(skb) -
+                               tcp_skb_pcount(buff);
+                       if (undo)
+                               tcp_adjust_pcount(meta_sk, skb, -undo);
+               }
+       }
+
+       return 0;
+}
+
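+/* Accounting sketch for the reinject case above: tcp_fragment() charges
+ * the new 'buff' (minus the bytes that merely moved out of the original
+ * skb) to the meta-socket's write memory, but the reinject queue is not
+ * part of memory accounting - hence the uncharge of buff->truesize -
+ * diff and the manual qlen transfer from sk_write_queue to
+ * reinject_queue.
+ */
+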
+/* Inspired by tcp_write_wakeup */
+int mptcp_write_wakeup(struct sock *meta_sk, int mib)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sk_buff *skb;
+       struct sock *sk_it;
+       int ans = 0;
+
+       if (meta_sk->sk_state == TCP_CLOSE)
+               return -1;
+
+       skb = tcp_send_head(meta_sk);
+       if (skb &&
+           before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
+               unsigned int mss;
+               unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
+               struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
+               struct tcp_sock *subtp;
+
+               WARN_ON(TCP_SKB_CB(skb)->sacked);
+
+               if (!subsk)
+                       goto window_probe;
+               subtp = tcp_sk(subsk);
+               mss = tcp_current_mss(subsk);
+
+               seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
+                              tcp_wnd_end(subtp) - subtp->write_seq);
+
+               if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+                       meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+               /* We are probing the opening of a window but the window
+                * size is != 0 - this must have been the result of SWS
+                * avoidance (sender side).
+                */
+               if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+                   skb->len > mss) {
+                       seg_size = min(seg_size, mss);
+                       TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+                       if (mptcp_fragment(meta_sk, skb, seg_size,
+                                          GFP_ATOMIC, 0))
+                               return -1;
+               } else if (!tcp_skb_pcount(skb)) {
+                       /* see mptcp_write_xmit on why we use UINT_MAX */
+                       tcp_set_skb_tso_segs(skb, UINT_MAX);
+               }
+
+               TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
+               if (!mptcp_skb_entail(subsk, skb, 0))
+                       return -1;
+
+               mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
+                                                TCP_SKB_CB(skb)->seq);
+               tcp_event_new_data_sent(meta_sk, skb);
+
+               __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
+               meta_tp->lsndtime = tcp_jiffies32;
+
+               return 0;
+       }
+window_probe:
+       if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
+                   meta_tp->snd_una + 0xFFFF)) {
+               mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
+                       if (mptcp_sk_can_send_ack(sk_it))
+                               tcp_xmit_probe_skb(sk_it, 1, mib);
+               }
+       }
+
+       /* At least one of the tcp_xmit_probe_skb's has to succeed */
+       mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
+               int ret;
+
+               if (!mptcp_sk_can_send_ack(sk_it))
+                       continue;
+
+               ret = tcp_xmit_probe_skb(sk_it, 0, mib);
+               if (unlikely(ret > 0))
+                       ans = ret;
+       }
+       return ans;
+}
+
+bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
+                    int push_one, gfp_t gfp)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
+       struct sock *subsk = NULL;
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct sk_buff *skb;
+       int reinject = 0;
+       unsigned int sublimit;
+       __u32 path_mask = 0;
+
+       while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
+                                                   &sublimit))) {
+               unsigned int limit;
+
+               WARN(TCP_SKB_CB(skb)->sacked, "sacked: %u reinject: %u",
+                    TCP_SKB_CB(skb)->sacked, reinject);
+
+               subtp = tcp_sk(subsk);
+               mss_now = tcp_current_mss(subsk);
+
+               if (reinject == 1) {
+                       if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
+                               /* Segment already reached the peer, take the next one */
+                               __skb_unlink(skb, &mpcb->reinject_queue);
+                               __kfree_skb(skb);
+                               continue;
+                       }
+               }
+
+               /* If the segment was cloned (e.g. a meta retransmission),
+                * the header must be expanded/copied so that there is no
+                * corruption of TSO information.
+                */
+               if (skb_unclone(skb, GFP_ATOMIC))
+                       break;
+
+               if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now)))
+                       break;
+
+               /* Force tso_segs to 1 by using UINT_MAX.
+                * We actually don't care about the exact number of segments
+                * emitted on the subflow. We need just to set tso_segs, because
+                * we still need an accurate packets_out count in
+                * tcp_event_new_data_sent.
+                */
+               tcp_set_skb_tso_segs(skb, UINT_MAX);
+
+               /* Check for Nagle, regardless of tso_segs. If the segment is
+                * actually larger than mss_now (TSO segment), then
+                * tcp_nagle_check will have partial == false and always trigger
+                * the transmission.
+                * tcp_write_xmit has a TSO-level nagle check which is not
+                * subject to the MPTCP-level. It is based on the properties of
+                * the subflow, not the MPTCP-level.
+                */
+               if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
+                                            (tcp_skb_is_last(meta_sk, skb) ?
+                                             nonagle : TCP_NAGLE_PUSH))))
+                       break;
+
+               limit = mss_now;
+               /* skb->len > mss_now is the equivalent of tso_segs > 1 in
+                * tcp_write_xmit. Otherwise split-point would return 0.
+                */
+               if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
+                       /* We limit the size of the skb so that it fits into the
+                        * window. Call tcp_mss_split_point to avoid duplicating
+                        * code.
+                        * We really only care about fitting the skb into the
+                        * window. That's why we use UINT_MAX. If the skb does
+                        * not fit into the cwnd_quota or the NIC's max-segs
+                        * limitation, it will be split by the subflow's
+                        * tcp_write_xmit which does the appropriate call to
+                        * tcp_mss_split_point.
+                        */
+                       limit = tcp_mss_split_point(meta_sk, skb, mss_now,
+                                                   UINT_MAX / mss_now,
+                                                   nonagle);
+
+               if (sublimit)
+                       limit = min(limit, sublimit);
+
+               if (skb->len > limit &&
+                   unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject)))
+                       break;
+
+               if (!mptcp_skb_entail(subsk, skb, reinject))
+                       break;
+               /* Nagle is handled at the MPTCP-layer, so
+                * always push on the subflow
+                */
+               __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
+               meta_tp->lsndtime = tcp_jiffies32;
+
+               path_mask |= mptcp_pi_to_flag(subtp->mptcp->path_index);
+
+               if (!reinject) {
+                       mptcp_check_sndseq_wrap(meta_tp,
+                                               TCP_SKB_CB(skb)->end_seq -
+                                               TCP_SKB_CB(skb)->seq);
+                       tcp_event_new_data_sent(meta_sk, skb);
+               }
+
+               tcp_minshall_update(meta_tp, mss_now, skb);
+
+               if (reinject > 0) {
+                       __skb_unlink(skb, &mpcb->reinject_queue);
+                       kfree_skb(skb);
+               }
+
+               if (push_one)
+                       break;
+       }
+
+       mptcp_for_each_sk(mpcb, subsk) {
+               subtp = tcp_sk(subsk);
+
+               if (!(path_mask & mptcp_pi_to_flag(subtp->mptcp->path_index)))
+                       continue;
+
+               /* We have pushed data on this subflow. We ignore the call to
+                * cwnd_validate in tcp_write_xmit as is_cwnd_limited will never
+                * be true (we never push more than what the cwnd can accept).
+                * We need to ensure that we call tcp_cwnd_validate with
+                * is_cwnd_limited set to true if we have filled the cwnd.
+                */
+               tcp_cwnd_validate(subsk, tcp_packets_in_flight(subtp) >=
+                                 subtp->snd_cwnd);
+       }
+
+       return !meta_tp->packets_out && tcp_send_head(meta_sk);
+}
+
+void mptcp_write_space(struct sock *sk)
+{
+       mptcp_push_pending_frames(mptcp_meta_sk(sk));
+}
+
+u32 __mptcp_select_window(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+       int mss, free_space, full_space, window;
+
+       /* MSS for the peer's data.  Previous versions used mss_clamp
+        * here.  I don't know if the value based on our guesses
+        * of peer's MSS is better for the performance.  It's more correct
+        * but may be worse for the performance because of rcv_mss
+        * fluctuations.  --SAW  1998/11/1
+        */
+       mss = icsk->icsk_ack.rcv_mss;
+       free_space = tcp_space(meta_sk);
+       full_space = min_t(int, meta_tp->window_clamp,
+                       tcp_full_space(meta_sk));
+
+       if (mss > full_space)
+               mss = full_space;
+
+       if (free_space < (full_space >> 1)) {
+               /* If free_space is decreasing due to mostly meta-level
+                * out-of-order packets, don't turn off the quick-ack mode.
+                */
+               if (meta_tp->rcv_nxt - meta_tp->copied_seq > ((full_space - free_space) >> 1))
+                       icsk->icsk_ack.quick = 0;
+
+               if (tcp_memory_pressure)
+                       /* TODO this has to be adapted when we support different
+                        * MSS's among the subflows.
+                        */
+                       meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
+                                                   4U * meta_tp->advmss);
+
+               if (free_space < mss)
+                       return 0;
+       }
+
+       if (free_space > meta_tp->rcv_ssthresh)
+               free_space = meta_tp->rcv_ssthresh;
+
+       /* Don't do rounding if we are using window scaling, since the
+        * scaled window will not line up with the MSS boundary anyway.
+        */
+       window = meta_tp->rcv_wnd;
+       if (tp->rx_opt.rcv_wscale) {
+               window = free_space;
+
+               /* Advertise enough space so that it won't get scaled away.
+                * Important case: prevent zero window announcement if
+                * 1<<rcv_wscale > mss.
+                */
+               if (((window >> tp->rx_opt.rcv_wscale) <<
+                    tp->rx_opt.rcv_wscale) != window)
+                       window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+                                 << tp->rx_opt.rcv_wscale);
+       } else {
+               /* Get the largest window that is a nice multiple of mss.
+                * Window clamp already applied above.
+                * If our current window offering is within 1 mss of the
+                * free space we just keep it. This prevents the divide
+                * and multiply from happening most of the time.
+                * We also don't do any window rounding when the free space
+                * is too small.
+                */
+               if (window <= free_space - mss || window > free_space)
+                       window = (free_space / mss) * mss;
+               else if (mss == full_space &&
+                        free_space > window + (full_space >> 1))
+                       window = free_space;
+       }
+
+       return window;
+}
+
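+/* Worked examples for the two rounding branches above (illustrative
+ * numbers): without window scaling, free_space = 10000 and mss = 1460
+ * yield window = (10000 / 1460) * 1460 = 8760, a clean multiple of the
+ * MSS. With rcv_wscale = 7 and window = 1000, (1000 >> 7) << 7 = 896
+ * != 1000, so the window is rounded up to ((1000 >> 7) + 1) << 7 = 1024
+ * - otherwise the advertisement could be scaled down to a smaller (or
+ * even zero) window.
+ */
+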
+void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
+                      unsigned int *remaining)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       opts->options |= OPTION_MPTCP;
+       if (is_master_tp(tp)) {
+               opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
+               opts->mptcp_ver = tcp_sk(sk)->mptcp_ver;
+               *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
+               opts->mp_capable.sender_key = tp->mptcp_loc_key;
+               opts->dss_csum = !!sysctl_mptcp_checksum;
+       } else {
+               const struct mptcp_cb *mpcb = tp->mpcb;
+
+               opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
+               *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
+               opts->mp_join_syns.token = mpcb->mptcp_rem_token;
+               opts->mp_join_syns.low_prio  = tp->mptcp->low_prio;
+               opts->addr_id = tp->mptcp->loc_id;
+               opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
+       }
+}
+
+void mptcp_synack_options(struct request_sock *req,
+                         struct tcp_out_options *opts, unsigned int *remaining)
+{
+       struct mptcp_request_sock *mtreq;
+
+       mtreq = mptcp_rsk(req);
+
+       opts->options |= OPTION_MPTCP;
+       /* MPCB not yet set - thus it's a new MPTCP-session */
+       if (!mtreq->is_sub) {
+               opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
+               opts->mptcp_ver = mtreq->mptcp_ver;
+               opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
+               opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
+               *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
+       } else {
+               opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
+               opts->mp_join_syns.sender_truncated_mac =
+                               mtreq->mptcp_hash_tmac;
+               opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
+               opts->mp_join_syns.low_prio = mtreq->low_prio;
+               opts->addr_id = mtreq->loc_id;
+               *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
+       }
+}
+
+void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
+                              struct tcp_out_options *opts, unsigned int *size)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct mptcp_cb *mpcb = tp->mpcb;
+       const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
+
+       /* We are coming from tcp_current_mss with the meta_sk as an argument.
+        * It does not make sense to check for the options, because when the
+        * segment gets sent, another subflow will be chosen.
+        */
+       if (!skb && is_meta_sk(sk))
+               return;
+
+       if (unlikely(tp->send_mp_fclose)) {
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_MP_FCLOSE;
+               opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
+               *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
+               return;
+       }
+
+       /* 1. If we are the sender of the infinite-mapping, we need the
+        *    MPTCPHDR_INF-flag, because a retransmission of the
+        *    infinite-announcement still needs the mptcp-option.
+        *
+        *    We need infinite_cutoff_seq, because retransmissions from before
+        *    the infinite-cutoff-moment still need the MPTCP-signalling to stay
+        *    consistent.
+        *
+        * 2. If we are the receiver of the infinite-mapping, we always skip
+        *    mptcp-options, because acknowledgments from before the
+        *    infinite-mapping point have already been sent out.
+        *
+        * I know, the whole infinite-mapping stuff is ugly...
+        *
+        * TODO: Handle wrapped data-sequence numbers
+        *       (even if it's very unlikely)
+        */
+       if (unlikely(mpcb->infinite_mapping_snd) &&
+           ((mpcb->send_infinite_mapping && tcb &&
+             mptcp_is_data_seq(skb) &&
+             !(tcb->mptcp_flags & MPTCPHDR_INF) &&
+             !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
+            !mpcb->send_infinite_mapping))
+               return;
+
+       if (unlikely(tp->mptcp->include_mpc)) {
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_MP_CAPABLE |
+                                      OPTION_TYPE_ACK;
+               *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
+               opts->mptcp_ver = mpcb->mptcp_ver;
+               opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
+               opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
+               opts->dss_csum = mpcb->dss_csum;
+
+               if (skb)
+                       tp->mptcp->include_mpc = 0;
+       }
+       if (unlikely(tp->mptcp->pre_established) &&
+           (!skb || !(tcb->tcp_flags & (TCPHDR_FIN | TCPHDR_RST)))) {
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
+               *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
+       }
+
+       if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal &&
+           mpcb->mptcp_ver >= MPTCP_VERSION_1 && skb && !mptcp_is_data_seq(skb)) {
+               mpcb->pm_ops->addr_signal(sk, size, opts, skb);
+
+               if (opts->add_addr_v6)
+                       /* Skip subsequent options */
+                       return;
+       }
+
+       if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_DATA_ACK;
+               /* If !skb, we come from tcp_current_mss and thus we always
+                * assume that the DSS-option will be set for the data-packet.
+                */
+               if (skb && !mptcp_is_data_seq(skb)) {
+                       *size += MPTCP_SUB_LEN_ACK_ALIGN;
+               } else {
+                       /* Doesn't matter, if csum included or not. It will be
+                        * either 10 or 12, and thus aligned = 12
+                        */
+                       *size += MPTCP_SUB_LEN_ACK_ALIGN +
+                                MPTCP_SUB_LEN_SEQ_ALIGN;
+               }
+
+               *size += MPTCP_SUB_LEN_DSS_ALIGN;
+       }
+
+       /* In fallback mp_fail-mode, we have to repeat the MP_FAIL option
+        * until the fallback has been done by the sender
+        */
+       if (unlikely(tp->mptcp->send_mp_fail) && skb &&
+           MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_FAIL) {
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_MP_FAIL;
+               *size += MPTCP_SUB_LEN_FAIL;
+       }
+
+       if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal &&
+           mpcb->mptcp_ver < MPTCP_VERSION_1)
+               mpcb->pm_ops->addr_signal(sk, size, opts, skb);
+
+       if (unlikely(tp->mptcp->send_mp_prio) &&
+           MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
+               opts->options |= OPTION_MPTCP;
+               opts->mptcp_options |= OPTION_MP_PRIO;
+               if (skb)
+                       tp->mptcp->send_mp_prio = 0;
+               *size += MPTCP_SUB_LEN_PRIO_ALIGN;
+       }
+}
+
+u16 mptcp_select_window(struct sock *sk)
+{
+       u16 new_win             = tcp_select_window(sk);
+       struct tcp_sock *tp     = tcp_sk(sk);
+       struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
+
+       meta_tp->rcv_wnd        = tp->rcv_wnd;
+       meta_tp->rcv_wup        = meta_tp->rcv_nxt;
+
+       return new_win;
+}
+
+void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
+                        const struct tcp_out_options *opts,
+                        struct sk_buff *skb)
+{
+       if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
+               struct mp_capable *mpc = (struct mp_capable *)ptr;
+
+               mpc->kind = TCPOPT_MPTCP;
+
+               if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
+                   (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
+                       mpc->sender_key = opts->mp_capable.sender_key;
+                       mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
+                       mpc->ver = opts->mptcp_ver;
+                       ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
+               } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
+                       mpc->sender_key = opts->mp_capable.sender_key;
+                       mpc->receiver_key = opts->mp_capable.receiver_key;
+                       mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
+                       mpc->ver = opts->mptcp_ver;
+                       ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
+               }
+
+               mpc->sub = MPTCP_SUB_CAPABLE;
+               mpc->a = opts->dss_csum;
+               mpc->b = 0;
+               mpc->rsv = 0;
+               mpc->h = 1;
+       }
+       if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
+               struct mp_join *mpj = (struct mp_join *)ptr;
+
+               mpj->kind = TCPOPT_MPTCP;
+               mpj->sub = MPTCP_SUB_JOIN;
+               mpj->rsv = 0;
+
+               if (OPTION_TYPE_SYN & opts->mptcp_options) {
+                       mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
+                       mpj->u.syn.token = opts->mp_join_syns.token;
+                       mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
+                       mpj->b = opts->mp_join_syns.low_prio;
+                       mpj->addr_id = opts->addr_id;
+                       ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
+               } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
+                       mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
+                       mpj->u.synack.mac =
+                               opts->mp_join_syns.sender_truncated_mac;
+                       mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
+                       mpj->b = opts->mp_join_syns.low_prio;
+                       mpj->addr_id = opts->addr_id;
+                       ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
+               } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
+                       mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
+                       mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
+                       memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
+                       ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
+               }
+       }
+       if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
+               struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
+               struct mptcp_cb *mpcb = tp->mpcb;
+
+               mpadd->kind = TCPOPT_MPTCP;
+               if (opts->add_addr_v4) {
+                       mpadd->sub = MPTCP_SUB_ADD_ADDR;
+                       mpadd->ipver = 4;
+                       mpadd->addr_id = opts->add_addr4.addr_id;
+                       mpadd->u.v4.addr = opts->add_addr4.addr;
+                       if (mpcb->mptcp_ver < MPTCP_VERSION_1) {
+                               mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
+                               ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
+                       } else {
+                               memcpy((char *)mpadd->u.v4.mac - 2,
+                                      (char *)&opts->add_addr4.trunc_mac, 8);
+                               mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4_VER1;
+                               ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN_VER1 >> 2;
+                       }
+               } else if (opts->add_addr_v6) {
+                       mpadd->sub = MPTCP_SUB_ADD_ADDR;
+                       mpadd->ipver = 6;
+                       mpadd->addr_id = opts->add_addr6.addr_id;
+                       memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
+                              sizeof(mpadd->u.v6.addr));
+                       if (mpcb->mptcp_ver < MPTCP_VERSION_1) {
+                               mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
+                               ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
+                       } else {
+                               memcpy((char *)mpadd->u.v6.mac - 2,
+                                      (char *)&opts->add_addr6.trunc_mac, 8);
+                               mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6_VER1;
+                               ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN_VER1 >> 2;
+                       }
+               }
+
+               MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_ADDADDRTX);
+       }
+       if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
+               struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
+               u8 *addrs_id;
+               int id, len, len_align;
+
+               len = mptcp_sub_len_remove_addr(opts->remove_addrs);
+               len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
+
+               mprem->kind = TCPOPT_MPTCP;
+               mprem->len = len;
+               mprem->sub = MPTCP_SUB_REMOVE_ADDR;
+               mprem->rsv = 0;
+               addrs_id = &mprem->addrs_id;
+
+               mptcp_for_each_bit_set(opts->remove_addrs, id)
+                       *(addrs_id++) = id;
+
+               /* Fill the rest with NOP's */
+               if (len_align > len) {
+                       int i;
+
+                       for (i = 0; i < len_align - len; i++)
+                               *(addrs_id++) = TCPOPT_NOP;
+               }
+
+               ptr += len_align >> 2;
+
+               MPTCP_INC_STATS(sock_net((struct sock *)tp), MPTCP_MIB_REMADDRTX);
+       }
+       if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
+               struct mp_fail *mpfail = (struct mp_fail *)ptr;
+
+               mpfail->kind = TCPOPT_MPTCP;
+               mpfail->len = MPTCP_SUB_LEN_FAIL;
+               mpfail->sub = MPTCP_SUB_FAIL;
+               mpfail->rsv1 = 0;
+               mpfail->rsv2 = 0;
+               mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
+
+               ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
+       }
+       if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
+               struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
+
+               mpfclose->kind = TCPOPT_MPTCP;
+               mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
+               mpfclose->sub = MPTCP_SUB_FCLOSE;
+               mpfclose->rsv1 = 0;
+               mpfclose->rsv2 = 0;
+               mpfclose->key = opts->mp_capable.receiver_key;
+
+               ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
+       }
+
+       if (OPTION_DATA_ACK & opts->mptcp_options) {
+               if (!mptcp_is_data_seq(skb))
+                       ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
+               else
+                       ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
+       }
+       if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
+               struct mp_prio *mpprio = (struct mp_prio *)ptr;
+
+               mpprio->kind = TCPOPT_MPTCP;
+               mpprio->len = MPTCP_SUB_LEN_PRIO;
+               mpprio->sub = MPTCP_SUB_PRIO;
+               mpprio->rsv = 0;
+               mpprio->b = tp->mptcp->low_prio;
+               mpprio->addr_id = TCPOPT_NOP;
+
+               ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
+       }
+}
+
+/* Sends the DATA_FIN */
+void mptcp_send_fin(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
+       int mss_now;
+
+       if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
+               meta_tp->mpcb->passive_close = 1;
+
+       /* Optimization, tack on the FIN if we have a queue of
+        * unsent frames.  But be careful about outgoing SACKS
+        * and IP options.
+        */
+       mss_now = mptcp_current_mss(meta_sk);
+
+       if (tcp_send_head(meta_sk) != NULL) {
+               TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
+               TCP_SKB_CB(skb)->end_seq++;
+               meta_tp->write_seq++;
+       } else {
+               /* Socket is locked, keep trying until memory is available. */
+               for (;;) {
+                       skb = alloc_skb_fclone(MAX_TCP_HEADER,
+                                              meta_sk->sk_allocation);
+                       if (skb)
+                               break;
+                       yield();
+               }
+               /* Reserve space for headers and prepare control bits. */
+               skb_reserve(skb, MAX_TCP_HEADER);
+
+               tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
+               TCP_SKB_CB(skb)->end_seq++;
+               TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
+               tcp_queue_skb(meta_sk, skb);
+       }
+       __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
+}
+
+void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct sock *sk;
+
+       if (!mpcb->cnt_subflows)
+               return;
+
+       WARN_ON(meta_tp->send_mp_fclose);
+
+       /* First - select a socket */
+       sk = mptcp_select_ack_sock(meta_sk);
+
+       /* This may happen if no subflow is in an appropriate state, or if
+        * we are in infinite-mapping mode or about to enter it - just send
+        * a reset.
+        */
+       if (!sk || mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping ||
+           mpcb->infinite_mapping_rcv) {
+
+               /* tcp_done must be handled with bh disabled */
+               if (!in_serving_softirq())
+                       local_bh_disable();
+
+               mptcp_sub_force_close_all(mpcb, NULL);
+
+               if (!in_serving_softirq())
+                       local_bh_enable();
+               return;
+       }
+
+       tcp_sk(sk)->send_mp_fclose = 1;
+       /* Reset all other subflows */
+
+       /* tcp_done must be handled with bh disabled */
+       if (!in_serving_softirq())
+               local_bh_disable();
+
+       mptcp_sub_force_close_all(mpcb, sk);
+
+       tcp_set_state(sk, TCP_RST_WAIT);
+
+       if (!in_serving_softirq())
+               local_bh_enable();
+
+       tcp_send_ack(sk);
+       tcp_clear_xmit_timers(sk);
+       inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
+
+       meta_tp->send_mp_fclose = 1;
+       inet_csk(sk)->icsk_retransmits = 0;
+
+       /* Prevent exp backoff reverting on ICMP dest unreachable */
+       inet_csk(sk)->icsk_backoff = 0;
+
+       MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_FASTCLOSETX);
+}
+
+static void mptcp_ack_retransmit_timer(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct net *net = sock_net(sk);
+       struct sk_buff *skb;
+
+       if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
+               goto out; /* Routing failure or similar */
+
+       tcp_mstamp_refresh(tp);
+
+       if (tcp_write_timeout(sk)) {
+               MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRTO);
+               tp->mptcp->pre_established = 0;
+               sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
+               tp->ops->send_active_reset(sk, GFP_ATOMIC);
+               goto out;
+       }
+
+       skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+       if (skb == NULL) {
+               sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
+                              jiffies + icsk->icsk_rto);
+               return;
+       }
+
+       /* Reserve space for headers and prepare control bits */
+       skb_reserve(skb, MAX_TCP_HEADER);
+       tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
+
+       MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKRXMIT);
+
+       if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
+               /* Retransmission failed because of local congestion,
+                * do not backoff.
+                */
+               if (!icsk->icsk_retransmits)
+                       icsk->icsk_retransmits = 1;
+               sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
+                              jiffies + icsk->icsk_rto);
+               return;
+       }
+
+       if (!tp->retrans_stamp)
+               tp->retrans_stamp = tcp_time_stamp(tp) ? : 1;
+
+       icsk->icsk_retransmits++;
+       icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+       sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
+                      jiffies + icsk->icsk_rto);
+       if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
+               __sk_dst_reset(sk);
+
+out:;
+}
+
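+/* Backoff sketch for the timer above (illustrative numbers): with an
+ * initial RTO of 200ms, the retransmitted MP_JOIN ACK is rearmed at
+ * 400ms, 800ms, 1.6s, ... doubling per attempt until clamped at
+ * TCP_RTO_MAX, while tcp_write_timeout() eventually gives up and
+ * resets the subflow.
+ */
+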
+void mptcp_ack_handler(unsigned long data)
+{
+       struct sock *sk = (struct sock *)data;
+       struct sock *meta_sk = mptcp_meta_sk(sk);
+
+       bh_lock_sock(meta_sk);
+       if (sock_owned_by_user(meta_sk)) {
+               /* Try again later */
+               sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
+                              jiffies + (HZ / 20));
+               goto out_unlock;
+       }
+
+       if (sk->sk_state == TCP_CLOSE)
+               goto out_unlock;
+       if (!tcp_sk(sk)->mptcp->pre_established)
+               goto out_unlock;
+
+       mptcp_ack_retransmit_timer(sk);
+
+       sk_mem_reclaim(sk);
+
+out_unlock:
+       bh_unlock_sock(meta_sk);
+       sock_put(sk);
+}
+
+/* Similar to tcp_retransmit_skb
+ *
+ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
+ * meta-level.
+ */
+int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct sock *subsk;
+       unsigned int limit, mss_now;
+       int err = -1;
+
+       WARN_ON(TCP_SKB_CB(skb)->sacked);
+
+       /* Do not send more than we have queued. 1/4 is reserved for possible
+        * copying overhead: fragmentation, tunneling, mangling etc.
+        *
+        * This is a meta-retransmission thus we check on the meta-socket.
+        */
+       if (refcount_read(&meta_sk->sk_wmem_alloc) >
+           min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
+               return -EAGAIN;
+       }
+
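+       /* Illustrative numbers for the check above: with 64 KB queued,
+        * the threshold is 64 KB + 16 KB = 80 KB (further capped by
+        * sk_sndbuf), i.e. up to 25% extra allocation is tolerated for
+        * fragmentation/tunneling/mangling overhead before the
+        * retransmit is deferred with -EAGAIN.
+        */
+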
+       /* We need to make sure that the retransmitted segment can be sent on a
+        * subflow right now. If it is too big, it needs to be fragmented.
+        */
+       subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
+       if (!subsk) {
+               /* We want to increase icsk_retransmits, thus return 0, so that
+                * mptcp_meta_retransmit_timer enters the desired branch.
+                */
+               err = 0;
+               goto failed;
+       }
+       mss_now = tcp_current_mss(subsk);
+
+       /* If the segment was cloned (e.g. a meta retransmission), the header
+        * must be expanded/copied so that there is no corruption of TSO
+        * information.
+        */
+       if (skb_unclone(skb, GFP_ATOMIC)) {
+               err = -ENOMEM;
+               goto failed;
+       }
+
+       /* Must have been set by mptcp_write_xmit before */
+       WARN_ON(!tcp_skb_pcount(skb));
+
+       limit = mss_now;
+       /* skb->len > mss_now is the equivalent of tso_segs > 1 in
+        * tcp_write_xmit. Otherwise split-point would return 0.
+        */
+       if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
+               limit = tcp_mss_split_point(meta_sk, skb, mss_now,
+                                           UINT_MAX / mss_now,
+                                           TCP_NAGLE_OFF);
+
+       if (skb->len > limit &&
+           unlikely(mptcp_fragment(meta_sk, skb, limit,
+                                   GFP_ATOMIC, 0)))
+               goto failed;
+
+       if (!mptcp_skb_entail(subsk, skb, -1))
+               goto failed;
+
+       /* Update global TCP statistics. */
+       MPTCP_INC_STATS(sock_net(meta_sk), MPTCP_MIB_RETRANSSEGS);
+
+       /* Diff to tcp_retransmit_skb */
+
+       /* Save stamp of the first retransmit. */
+       if (!meta_tp->retrans_stamp) {
+               tcp_mstamp_refresh(meta_tp);
+               meta_tp->retrans_stamp = tcp_time_stamp(meta_tp);
+       }
+
+       __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
+       meta_tp->lsndtime = tcp_jiffies32;
+
+       return 0;
+
+failed:
+       __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
+       return err;
+}
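+
+/* The queue-limit test at the top of mptcp_retransmit_skb() in numbers (a
+ * hypothetical example): with sk_wmem_queued at 64 KB the threshold is
+ * min(64 KB + 16 KB, sk_sndbuf), i.e. the queued bytes plus the 1/4
+ * copying-overhead reserve, capped by the send-buffer size. Above that we
+ * return -EAGAIN rather than allocate even more write memory.
+ */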
+
+/* Similar to tcp_retransmit_timer
+ *
+ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
+ * and that we don't have an srtt estimation at the meta-level.
+ */
+void mptcp_meta_retransmit_timer(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
+       int err;
+
+       /* In fallback, retransmission is handled at the subflow-level */
+       if (!meta_tp->packets_out || mpcb->infinite_mapping_snd)
+               return;
+
+       WARN_ON(tcp_write_queue_empty(meta_sk));
+
+       if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
+           !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+               /* Receiver dastardly shrinks window. Our retransmits
+                * become zero probes, but we should not timeout this
+                * connection. If the socket is an orphan, time it out,
+                * we cannot allow such beasts to hang infinitely.
+                */
+               struct inet_sock *meta_inet = inet_sk(meta_sk);
+
+               if (meta_sk->sk_family == AF_INET) {
+                       net_dbg_ratelimited("MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+                                           &meta_inet->inet_daddr,
+                                           ntohs(meta_inet->inet_dport),
+                                           meta_inet->inet_num, meta_tp->snd_una,
+                                           meta_tp->snd_nxt);
+               }
+#if IS_ENABLED(CONFIG_IPV6)
+               else if (meta_sk->sk_family == AF_INET6) {
+                       net_dbg_ratelimited("MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
+                                           &meta_sk->sk_v6_daddr,
+                                           ntohs(meta_inet->inet_dport),
+                                           meta_inet->inet_num, meta_tp->snd_una,
+                                           meta_tp->snd_nxt);
+               }
+#endif
+               if (tcp_jiffies32 - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
+                       tcp_write_err(meta_sk);
+                       return;
+               }
+
+               mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
+               goto out_reset_timer;
+       }
+
+       if (tcp_write_timeout(meta_sk))
+               return;
+
+       if (meta_icsk->icsk_retransmits == 0)
+               __NET_INC_STATS(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
+
+       meta_icsk->icsk_ca_state = TCP_CA_Loss;
+
+       err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
+       if (err > 0) {
+               /* Retransmission failed because of local congestion,
+                * do not backoff.
+                */
+               if (!meta_icsk->icsk_retransmits)
+                       meta_icsk->icsk_retransmits = 1;
+               inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
+                                         min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
+                                         TCP_RTO_MAX);
+               return;
+       }
+
+       /* Increase the timeout each time we retransmit.  Note that
+        * we do not increase the rtt estimate.  rto is initialized
+        * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
+        * that doubling rto each time is the least we can get away with.
+        * In KA9Q, Karn uses this for the first few times, and then
+        * goes to quadratic.  netBSD doubles, but only goes up to *64,
+        * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
+        * defined in the protocol as the maximum possible RTT.  I guess
+        * we'll have to use something other than TCP to talk to the
+        * University of Mars.
+        *
+        * PAWS allows us longer timeouts and large windows, so once
+        * implemented ftp to mars will work nicely. We will have to fix
+        * the 120 second clamps though!
+        */
+       meta_icsk->icsk_backoff++;
+       meta_icsk->icsk_retransmits++;
+
+out_reset_timer:
+       /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+        * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+        * might be increased if the stream oscillates between thin and thick,
+        * thus the old value might already be too high compared to the value
+        * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+        * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+        * exponential backoff behaviour, to avoid continuing to hammer
+        * linear-timeout retransmissions into a black hole.
+        */
+       if (meta_sk->sk_state == TCP_ESTABLISHED &&
+           (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+           tcp_stream_is_thin(meta_tp) &&
+           meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+               meta_icsk->icsk_backoff = 0;
+               /* We cannot do the same as in tcp_write_timer because the
+                * srtt is not set here.
+                */
+               mptcp_set_rto(meta_sk);
+       } else {
+               /* Use normal (exponential) backoff */
+               meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
+       }
+       inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
+}
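+
+/* The backoff above in numbers (hypothetical initial rto of 200 ms): every
+ * expiry doubles icsk_rto, giving 200 ms, 400 ms, 800 ms, ..., clamped at
+ * TCP_RTO_MAX. Only for thin established streams (tp->thin_lto or the
+ * tcp_thin_linear_timeouts sysctl) with at most TCP_THIN_LINEAR_RETRIES
+ * retransmits is the backoff kept at 0 and the rto recomputed from the
+ * subflows' RTTs via mptcp_set_rto() instead.
+ */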
+
+void mptcp_sub_retransmit_timer(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       tcp_retransmit_timer(sk);
+
+       if (!tp->fastopen_rsk) {
+               mptcp_reinject_data(sk, 1);
+               mptcp_set_rto(sk);
+       }
+}
+
+/* Modify values to the MPTCP-level for the initial window of new subflows */
+void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
+                               __u32 *window_clamp, int wscale_ok,
+                               __u8 *rcv_wscale, __u32 init_rcv_wnd,
+                                const struct sock *sk)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
+
+       *window_clamp = mpcb->orig_window_clamp;
+       __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
+
+       tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
+                                 wscale_ok, rcv_wscale, init_rcv_wnd, sk);
+}
+
+static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss,
+                                 unsigned int (*mss_cb)(struct sock *sk))
+{
+       struct sock *sk;
+       u64 rate = 0;
+
+       mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
+               struct tcp_sock *tp = tcp_sk(sk);
+               int this_mss;
+               u64 this_rate;
+
+               if (!mptcp_sk_can_send(sk))
+                       continue;
+
+               /* Do not consider subflows without a RTT estimation yet
+                * otherwise this_rate >>> rate.
+                */
+               if (unlikely(!tp->srtt_us))
+                       continue;
+
+               this_mss = mss_cb(sk);
+
+               /* If this_mss is smaller than mss, it means that a segment will
+                * be split in two (or more) when pushed on this subflow. If
+                * you consider that mss = 1428 and this_mss = 1420 then two
+                * segments will be generated: a 1420-byte and an 8-byte segment.
+                * The latter introduces a large overhead, as for a single
+                * data segment 2 slots will be used in the congestion window,
+                * roughly halving the potential throughput of this subflow.
+                * Indeed, 1428 bytes will be sent while 2840 could have been
+                * sent if mss == 1420, reducing the throughput by a factor of
+                * 2840 / 1428.
+                *
+                * The following algorithm takes this overhead into account
+                * when computing the potential throughput that MPTCP can
+                * achieve when generating mss-byte segments.
+                *
+                * The formula is the following:
+                *  \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
+                * Where ratio is computed as follows:
+                *  \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
+                *
+                * ratio gives the reduction factor of the theoretical
+                * throughput a subflow can achieve if MPTCP uses a specific
+                * MSS value.
+                */
+               this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
+                                     max(tp->snd_cwnd, tp->packets_out),
+                                     (u64)tp->srtt_us *
+                                     DIV_ROUND_UP(mss, this_mss) * this_mss);
+               rate += this_rate;
+       }
+
+       return rate;
+}
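+
+/* A worked example of the ratio above (illustration only, no extra logic):
+ * with mss = 1428 and mss_sub = 1420, \ceil{1428 / 1420} = 2, so
+ * ratio = 1428 / (2 * 1420) ~= 0.50 and the subflow contributes only about
+ * half of its nominal mss * cwnd / rtt. Note that srtt_us stores srtt << 3
+ * in microseconds, which is why the code scales by (USEC_PER_SEC << 3) to
+ * obtain a rate in bytes per second.
+ */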
+
+static unsigned int __mptcp_current_mss(const struct sock *meta_sk,
+                                       unsigned int (*mss_cb)(struct sock *sk))
+{
+       unsigned int mss = 0;
+       u64 rate = 0;
+       struct sock *sk;
+
+       mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
+               int this_mss;
+               u64 this_rate;
+
+               if (!mptcp_sk_can_send(sk))
+                       continue;
+
+               this_mss = mss_cb(sk);
+
+               /* Same mss values will produce the same throughput. */
+               if (this_mss == mss)
+                       continue;
+
+               /* See whether using this mss value can theoretically improve
+                * performance.
+                */
+               this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb);
+               if (this_rate >= rate) {
+                       mss = this_mss;
+                       rate = this_rate;
+               }
+       }
+
+       return mss;
+}
+
+unsigned int mptcp_current_mss(struct sock *meta_sk)
+{
+       unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss);
+
+       /* If no subflow is available, we take a default-mss from the
+        * meta-socket.
+        */
+       return !mss ? tcp_current_mss(meta_sk) : mss;
+}
+
+static unsigned int mptcp_select_size_mss(struct sock *sk)
+{
+       return tcp_sk(sk)->mss_cache;
+}
+
+int mptcp_select_size(const struct sock *meta_sk, bool sg, bool first_skb)
+{
+       unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss);
+
+       if (sg) {
+               if (mptcp_sk_can_gso(meta_sk)) {
+                       mss = linear_payload_sz(first_skb);
+               } else {
+                       int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+
+                       if (mss >= pgbreak &&
+                           mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+                               mss = pgbreak;
+               }
+       }
+
+       return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
+}
+
+int mptcp_check_snd_buf(const struct tcp_sock *tp)
+{
+       const struct sock *sk;
+       u32 rtt_max = tp->srtt_us;
+       u64 bw_est;
+
+       if (!tp->srtt_us)
+               return tp->reordering + 1;
+
+       mptcp_for_each_sk(tp->mpcb, sk) {
+               if (!mptcp_sk_can_send(sk))
+                       continue;
+
+               if (rtt_max < tcp_sk(sk)->srtt_us)
+                       rtt_max = tcp_sk(sk)->srtt_us;
+       }
+
+       bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
+                               (u64)tp->srtt_us);
+
+       return max_t(unsigned int, (u32)(bw_est >> 16),
+                       tp->reordering + 1);
+}
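+
+/* mptcp_check_snd_buf() in numbers (a hypothetical example): with
+ * snd_cwnd = 10, srtt_us equivalent to 50 ms and the slowest subflow at
+ * rtt_max = 200 ms, bw_est >> 16 = 10 * 200 / 50 = 40 segments: the send
+ * buffer must cover the slowest subflow's RTT. The << 16 and >> 16 form a
+ * fixed-point pair that preserves precision across the 64-bit division.
+ */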
+
+unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
+                                 int large_allowed)
+{
+       struct sock *sk;
+       u32 xmit_size_goal = 0;
+
+       if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
+               mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
+                       int this_size_goal;
+
+                       if (!mptcp_sk_can_send(sk))
+                               continue;
+
+                       this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
+                       if (this_size_goal > xmit_size_goal)
+                               xmit_size_goal = this_size_goal;
+               }
+       }
+
+       return max(xmit_size_goal, mss_now);
+}
+
diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
new file mode 100644 (file)
index 0000000..cbf05ea
--- /dev/null
@@ -0,0 +1,226 @@
+/*
+ *     MPTCP implementation - MPTCP-subflow-management
+ *
+ *     Initial Design & Implementation:
+ *     Sébastien Barré <sebastien.barre@uclouvain.be>
+ *
+ *     Current Maintainer & Author:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     Additional authors:
+ *     Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi>
+ *     Gregory Detal <gregory.detal@uclouvain.be>
+ *     Fabien Duchêne <fabien.duchene@uclouvain.be>
+ *     Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de>
+ *     Lavkesh Lahngir <lavkesh51@gmail.com>
+ *     Andreas Ripke <ripke@neclab.eu>
+ *     Vlad Dogaru <vlad.dogaru@intel.com>
+ *     Octavian Purdila <octavian.purdila@intel.com>
+ *     John Ronan <jronan@tssg.org>
+ *     Catalin Nicutar <catalin.nicutar@gmail.com>
+ *     Brandon Heller <brandonh@stanford.edu>
+ *
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/module.h>
+#include <net/mptcp.h>
+
+static DEFINE_SPINLOCK(mptcp_pm_list_lock);
+static LIST_HEAD(mptcp_pm_list);
+
+static int mptcp_default_id(sa_family_t family, union inet_addr *addr,
+                           struct net *net, bool *low_prio)
+{
+       return 0;
+}
+
+struct mptcp_pm_ops mptcp_pm_default = {
+       .get_local_id = mptcp_default_id, /* We do not care */
+       .name = "default",
+       .owner = THIS_MODULE,
+};
+
+static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
+{
+       struct mptcp_pm_ops *e;
+
+       list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
+               if (strcmp(e->name, name) == 0)
+                       return e;
+       }
+
+       return NULL;
+}
+
+int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
+{
+       int ret = 0;
+
+       if (!pm->get_local_id)
+               return -EINVAL;
+
+       spin_lock(&mptcp_pm_list_lock);
+       if (mptcp_pm_find(pm->name)) {
+               pr_notice("%s already registered\n", pm->name);
+               ret = -EEXIST;
+       } else {
+               list_add_tail_rcu(&pm->list, &mptcp_pm_list);
+               pr_info("%s registered\n", pm->name);
+       }
+       spin_unlock(&mptcp_pm_list_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
+
+void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
+{
+       spin_lock(&mptcp_pm_list_lock);
+       list_del_rcu(&pm->list);
+       spin_unlock(&mptcp_pm_list_lock);
+
+       /* Wait for outstanding readers to complete before the
+        * module gets removed entirely.
+        *
+        * A try_module_get() should fail by now as our module is
+        * in "going" state since no refs are held anymore and
+        * module_exit() handler being called.
+        */
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
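+
+/* A minimal registration sketch for an out-of-tree path manager (the
+ * "my_pm" name and the my_get_local_id() helper are hypothetical):
+ *
+ *     static struct mptcp_pm_ops my_pm = {
+ *             .get_local_id = my_get_local_id,        <- mandatory
+ *             .name         = "my_pm",
+ *             .owner        = THIS_MODULE,
+ *     };
+ *
+ * module_init() then calls mptcp_register_path_manager(&my_pm) and
+ * module_exit() calls mptcp_unregister_path_manager(&my_pm). Naming the
+ * module mptcp_my_pm lets the request_module("mptcp_%s", name) calls in
+ * this file autoload it on demand.
+ */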
+
+void mptcp_get_default_path_manager(char *name)
+{
+       struct mptcp_pm_ops *pm;
+
+       WARN_ON(list_empty(&mptcp_pm_list));
+
+       rcu_read_lock();
+       pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
+       strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
+       rcu_read_unlock();
+}
+
+int mptcp_set_default_path_manager(const char *name)
+{
+       struct mptcp_pm_ops *pm;
+       int ret = -ENOENT;
+
+       spin_lock(&mptcp_pm_list_lock);
+       pm = mptcp_pm_find(name);
+#ifdef CONFIG_MODULES
+       if (!pm && capable(CAP_NET_ADMIN)) {
+               spin_unlock(&mptcp_pm_list_lock);
+
+               request_module("mptcp_%s", name);
+               spin_lock(&mptcp_pm_list_lock);
+               pm = mptcp_pm_find(name);
+       }
+#endif
+
+       if (pm) {
+               list_move(&pm->list, &mptcp_pm_list);
+               ret = 0;
+       } else {
+               pr_info("%s is not available\n", name);
+       }
+       spin_unlock(&mptcp_pm_list_lock);
+
+       return ret;
+}
+
+static struct mptcp_pm_ops *__mptcp_pm_find_autoload(const char *name)
+{
+       struct mptcp_pm_ops *pm = mptcp_pm_find(name);
+#ifdef CONFIG_MODULES
+       if (!pm && capable(CAP_NET_ADMIN)) {
+               rcu_read_unlock();
+               request_module("mptcp_%s", name);
+               rcu_read_lock();
+               pm = mptcp_pm_find(name);
+       }
+#endif
+       return pm;
+}
+
+void mptcp_init_path_manager(struct mptcp_cb *mpcb)
+{
+       struct mptcp_pm_ops *pm;
+       struct sock *meta_sk = mpcb->meta_sk;
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+       rcu_read_lock();
+       /* if path manager was set using socket option */
+       if (meta_tp->mptcp_pm_setsockopt) {
+               pm = __mptcp_pm_find_autoload(meta_tp->mptcp_pm_name);
+               if (pm && try_module_get(pm->owner)) {
+                       mpcb->pm_ops = pm;
+                       goto out;
+               }
+       }
+
+       list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
+               if (try_module_get(pm->owner)) {
+                       mpcb->pm_ops = pm;
+                       break;
+               }
+       }
+out:
+       rcu_read_unlock();
+}
+
+/* Change path manager for socket */
+int mptcp_set_path_manager(struct sock *sk, const char *name)
+{
+       struct mptcp_pm_ops *pm;
+       int err = 0;
+
+       rcu_read_lock();
+       pm = __mptcp_pm_find_autoload(name);
+
+       if (!pm) {
+               err = -ENOENT;
+       } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+               err = -EPERM;
+       } else {
+               strcpy(tcp_sk(sk)->mptcp_pm_name, name);
+               tcp_sk(sk)->mptcp_pm_setsockopt = 1;
+       }
+       rcu_read_unlock();
+
+       return err;
+}
+
+/* Manage refcounts on socket close. */
+void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
+{
+       module_put(mpcb->pm_ops->owner);
+}
+
+/* Fallback to the default path-manager. */
+void mptcp_fallback_default(struct mptcp_cb *mpcb)
+{
+       struct mptcp_pm_ops *pm;
+
+       mptcp_cleanup_path_manager(mpcb);
+       pm = mptcp_pm_find("default");
+
+       /* Cannot fail - it's the default module */
+       try_module_get(pm->owner);
+       mpcb->pm_ops = pm;
+}
+EXPORT_SYMBOL_GPL(mptcp_fallback_default);
+
+/* Set default value from kernel configuration at bootup */
+static int __init mptcp_path_manager_default(void)
+{
+       return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
+}
+late_initcall(mptcp_path_manager_default);
diff --git a/net/mptcp/mptcp_redundant.c b/net/mptcp/mptcp_redundant.c
new file mode 100644 (file)
index 0000000..001a3f8
--- /dev/null
@@ -0,0 +1,301 @@
+/*
+ *     MPTCP Scheduler to reduce latency and jitter.
+ *
+ *     This scheduler sends all packets redundantly on all available subflows.
+ *
+ *     Initial Design & Implementation:
+ *     Tobias Erbshaeusser <erbshauesser@dvs.tu-darmstadt.de>
+ *     Alexander Froemmgen <froemmge@dvs.tu-darmstadt.de>
+ *
+ *     Initial corrections & modifications:
+ *     Christian Pinedo <christian.pinedo@ehu.eus>
+ *     Igor Lopez <igor.lopez@ehu.eus>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <net/mptcp.h>
+
+/* Struct to store the data of a single subflow */
+struct redsched_sock_data {
+       /* The skb or NULL */
+       struct sk_buff *skb;
+       /* End sequence number of the skb. This number should be checked
+        * to be valid before the skb field is used
+        */
+       u32 skb_end_seq;
+};
+
+/* Struct to store the data of the control block */
+struct redsched_cb_data {
+       /* The next subflow where a skb should be sent or NULL */
+       struct tcp_sock *next_subflow;
+};
+
+/* Returns the socket data from a given subflow socket */
+static struct redsched_sock_data *redsched_get_sock_data(struct tcp_sock *tp)
+{
+       return (struct redsched_sock_data *)&tp->mptcp->mptcp_sched[0];
+}
+
+/* Returns the control block data from a given meta socket */
+static struct redsched_cb_data *redsched_get_cb_data(struct tcp_sock *tp)
+{
+       return (struct redsched_cb_data *)&tp->mpcb->mptcp_sched[0];
+}
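+
+/* Both structs above live in scratch space that the MPTCP core reserves for
+ * the active scheduler: the mptcp_sched[] arrays in the per-subflow
+ * mptcp_tcp_sock and in the mptcp_cb. The BUILD_BUG_ON()s in
+ * redundant_register() below verify that they fit within MPTCP_SCHED_SIZE
+ * and MPTCP_SCHED_DATA_SIZE respectively.
+ */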
+
+static int redsched_get_active_valid_sks(struct sock *meta_sk)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct sock *sk;
+       int active_valid_sks = 0;
+
+       mptcp_for_each_sk(mpcb, sk) {
+               if (subflow_is_active((struct tcp_sock *)sk) &&
+                   !mptcp_is_def_unavailable(sk))
+                       active_valid_sks++;
+       }
+
+       return active_valid_sks;
+}
+
+static bool redsched_use_subflow(struct sock *meta_sk,
+                                int active_valid_sks,
+                                struct tcp_sock *tp,
+                                struct sk_buff *skb)
+{
+       if (!skb || !mptcp_is_available((struct sock *)tp, skb, false))
+               return false;
+
+       if (TCP_SKB_CB(skb)->path_mask != 0)
+               return subflow_is_active(tp);
+
+       if (TCP_SKB_CB(skb)->path_mask == 0) {
+               if (active_valid_sks == -1)
+                       active_valid_sks = redsched_get_active_valid_sks(meta_sk);
+
+               if (subflow_is_backup(tp) && active_valid_sks > 0)
+                       return false;
+               else
+                       return true;
+       }
+
+       return false;
+}
+
+static struct sock *redundant_get_subflow(struct sock *meta_sk,
+                                         struct sk_buff *skb,
+                                         bool zero_wnd_test)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct redsched_cb_data *cb_data = redsched_get_cb_data(meta_tp);
+       struct tcp_sock *first_tp = cb_data->next_subflow;
+       struct sock *sk;
+       struct tcp_sock *tp;
+
+       /* Answer data_fin on same subflow */
+       if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
+           skb && mptcp_is_data_fin(skb)) {
+               mptcp_for_each_sk(mpcb, sk) {
+                       if (tcp_sk(sk)->mptcp->path_index ==
+                               mpcb->dfin_path_index &&
+                           mptcp_is_available(sk, skb, zero_wnd_test))
+                               return sk;
+               }
+       }
+
+       if (!first_tp)
+               first_tp = mpcb->connection_list;
+       tp = first_tp;
+
+       /* still NULL (no subflow in connection_list?) */
+       if (!first_tp)
+               return NULL;
+
+       /* Search for any subflow to send it */
+       do {
+               if (mptcp_is_available((struct sock *)tp, skb,
+                                      zero_wnd_test)) {
+                       cb_data->next_subflow = tp->mptcp->next;
+                       return (struct sock *)tp;
+               }
+
+               tp = tp->mptcp->next;
+               if (!tp)
+                       tp = mpcb->connection_list;
+       } while (tp != first_tp);
+
+       /* No space */
+       return NULL;
+}
+
+/* Corrects the stored skb pointers if they are invalid */
+static void redsched_correct_skb_pointers(struct sock *meta_sk,
+                                         struct redsched_sock_data *sk_data)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+       if (sk_data->skb && !after(sk_data->skb_end_seq, meta_tp->snd_una))
+               sk_data->skb = NULL;
+}
+
+/* Returns the next skb from the queue */
+static struct sk_buff *redundant_next_skb_from_queue(struct sk_buff_head *queue,
+                                                    struct sk_buff *previous,
+                                                    struct sock *meta_sk)
+{
+       if (skb_queue_empty(queue))
+               return NULL;
+
+       if (!previous)
+               return skb_peek(queue);
+
+       if (skb_queue_is_last(queue, previous))
+               return NULL;
+
+       /* sk_data->skb stores the last scheduled packet for this subflow.
+        * If sk_data->skb was scheduled but not sent (e.g., due to nagle),
+        * we have to schedule it again.
+        *
+        * For the redundant scheduler, there are two cases:
+        * 1. sk_data->skb was not sent on another subflow:
+        *    we have to schedule it again to ensure that we do not
+        *    skip this packet.
+        * 2. sk_data->skb was already sent on another subflow:
+        *    with regard to the redundant semantic, we have to
+        *    schedule it again. However, we keep it simple and ignore it,
+        *    as it was already sent by another subflow.
+        *    This might be changed in the future.
+        *
+        * For case 1, send_head is equal to previous, as only a single
+        * packet can be skipped.
+        */
+       if (tcp_send_head(meta_sk) == previous)
+               return tcp_send_head(meta_sk);
+
+       return skb_queue_next(queue, previous);
+}
+
+static struct sk_buff *redundant_next_segment(struct sock *meta_sk,
+                                             int *reinject,
+                                             struct sock **subsk,
+                                             unsigned int *limit)
+{
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+       struct mptcp_cb *mpcb = meta_tp->mpcb;
+       struct redsched_cb_data *cb_data = redsched_get_cb_data(meta_tp);
+       struct tcp_sock *first_tp = cb_data->next_subflow;
+       struct tcp_sock *tp;
+       struct sk_buff *skb;
+       int active_valid_sks = -1;
+
+       /* As we set it, we have to reset it as well. */
+       *limit = 0;
+
+       if (skb_queue_empty(&mpcb->reinject_queue) &&
+           skb_queue_empty(&meta_sk->sk_write_queue))
+               /* Nothing to send */
+               return NULL;
+
+       /* First try reinjections */
+       skb = skb_peek(&mpcb->reinject_queue);
+       if (skb) {
+               *subsk = get_available_subflow(meta_sk, skb, false);
+               if (!*subsk)
+                       return NULL;
+               *reinject = 1;
+               return skb;
+       }
+
+       /* Then try redundant and normal skbs without distinction */
+
+       if (!first_tp)
+               first_tp = mpcb->connection_list;
+
+       /* still NULL (no subflow in connection_list?) */
+       if (!first_tp)
+               return NULL;
+
+       tp = first_tp;
+
+       *reinject = 0;
+       active_valid_sks = redsched_get_active_valid_sks(meta_sk);
+       do {
+               struct redsched_sock_data *sk_data;
+
+               /* Correct the skb pointers of the current subflow */
+               sk_data = redsched_get_sock_data(tp);
+               redsched_correct_skb_pointers(meta_sk, sk_data);
+
+               skb = redundant_next_skb_from_queue(&meta_sk->sk_write_queue,
+                                                   sk_data->skb, meta_sk);
+               if (skb && redsched_use_subflow(meta_sk, active_valid_sks, tp,
+                                               skb)) {
+                       sk_data->skb = skb;
+                       sk_data->skb_end_seq = TCP_SKB_CB(skb)->end_seq;
+                       cb_data->next_subflow = tp->mptcp->next;
+                       *subsk = (struct sock *)tp;
+
+                       if (TCP_SKB_CB(skb)->path_mask)
+                               *reinject = -1;
+                       return skb;
+               }
+
+               tp = tp->mptcp->next;
+               if (!tp)
+                       tp = mpcb->connection_list;
+       } while (tp != first_tp);
+
+       /* Nothing to send */
+       return NULL;
+}
+
+static void redundant_release(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct redsched_cb_data *cb_data = redsched_get_cb_data(tp);
+
+       /* Check if the next subflow would be the released one. If yes, correct
+        * the pointer.
+        */
+       if (cb_data->next_subflow == tp)
+               cb_data->next_subflow = tp->mptcp->next;
+}
+
+static struct mptcp_sched_ops mptcp_sched_redundant = {
+       .get_subflow = redundant_get_subflow,
+       .next_segment = redundant_next_segment,
+       .release = redundant_release,
+       .name = "redundant",
+       .owner = THIS_MODULE,
+};
+
+static int __init redundant_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct redsched_sock_data) > MPTCP_SCHED_SIZE);
+       BUILD_BUG_ON(sizeof(struct redsched_cb_data) > MPTCP_SCHED_DATA_SIZE);
+
+       if (mptcp_register_scheduler(&mptcp_sched_redundant))
+               return -1;
+
+       return 0;
+}
+
+static void redundant_unregister(void)
+{
+       mptcp_unregister_scheduler(&mptcp_sched_redundant);
+}
+
+module_init(redundant_register);
+module_exit(redundant_unregister);
+
+MODULE_AUTHOR("Tobias Erbshaeusser, Alexander Froemmgen");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("REDUNDANT MPTCP");
+MODULE_VERSION("0.90");
diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
new file mode 100644 (file)
index 0000000..8910ba9
--- /dev/null
@@ -0,0 +1,301 @@
+/* MPTCP Round-Robin scheduler. Highly inspired by tcp_cong.c */
+
+#include <linux/module.h>
+#include <net/mptcp.h>
+
+static unsigned char num_segments __read_mostly = 1;
+module_param(num_segments, byte, 0644);
+MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
+
+static bool cwnd_limited __read_mostly = true;
+module_param(cwnd_limited, bool, 0644);
+MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
+
+struct rrsched_priv {
+       unsigned char quota;
+};
+
+static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
+{
+       return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
+}
+
+/* Is the sub-socket sk available to send the skb? */
+static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
+                                 bool zero_wnd_test, bool cwnd_test)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       unsigned int space, in_flight;
+
+       /* Set of states for which we are allowed to send data */
+       if (!mptcp_sk_can_send(sk))
+               return false;
+
+       /* We do not send data on this subflow unless it is
+        * fully established, i.e. the 4th ack has been received.
+        */
+       if (tp->mptcp->pre_established)
+               return false;
+
+       if (tp->pf)
+               return false;
+
+       if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
+               /* If SACK is disabled, and we got a loss, TCP does not exit
+                * the loss-state until something above high_seq has been acked.
+                * (see tcp_try_undo_recovery)
+                *
+                * high_seq is the snd_nxt at the moment of the RTO. As soon
+                * as we have an RTO, we won't push data on the subflow.
+                * Thus, snd_una can never go beyond high_seq.
+                */
+               if (!tcp_is_reno(tp))
+                       return false;
+               else if (tp->snd_una != tp->high_seq)
+                       return false;
+       }
+
+       if (!tp->mptcp->fully_established) {
+               /* Make sure that we send in-order data */
+               if (skb && tp->mptcp->second_packet &&
+                   tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
+                       return false;
+       }
+
+       if (!cwnd_test)
+               goto zero_wnd_test;
+
+       in_flight = tcp_packets_in_flight(tp);
+       /* Not even a single spot in the cwnd */
+       if (in_flight >= tp->snd_cwnd)
+               return false;
+
+       /* Now, check if what is queued in the subflow's send-queue
+        * already fills the cwnd.
+        */
+       space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+       if (tp->write_seq - tp->snd_nxt > space)
+               return false;
+
+zero_wnd_test:
+       if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
+               return false;
+
+       return true;
+}
+
+/* Are we not allowed to reinject this skb on tp? */
+static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
+{
+       /* If the skb has already been enqueued in this sk, try to find
+        * another one.
+        */
+       return skb &&
+               /* Has the skb already been enqueued into this subsocket? */
+               mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
+}
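+
+/* path_mask records the subflows an skb has already been scheduled on;
+ * mptcp_pi_to_flag() maps path_index i to bit (i - 1). A sketch with
+ * hypothetical values: path_index 3 yields flag 0x4, so an skb carrying
+ * path_mask == 0x5 already went out on subflows 1 and 3 and will not be
+ * reinjected on either of them.
+ */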
+
+/* We just look for any subflow that is available */
+static struct sock *rr_get_available_subflow(struct sock *meta_sk,
+                                            struct sk_buff *skb,
+                                            bool zero_wnd_test)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct sock *sk, *bestsk = NULL, *backupsk = NULL;
+
+       /* Answer data_fin on same subflow!!! */
+       if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
+           skb && mptcp_is_data_fin(skb)) {
+               mptcp_for_each_sk(mpcb, sk) {
+                       if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
+                           mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
+                               return sk;
+               }
+       }
+
+       /* First, find the best subflow */
+       mptcp_for_each_sk(mpcb, sk) {
+               struct tcp_sock *tp = tcp_sk(sk);
+
+               if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
+                       continue;
+
+               if (mptcp_rr_dont_reinject_skb(tp, skb)) {
+                       backupsk = sk;
+                       continue;
+               }
+
+               bestsk = sk;
+       }
+
+       if (bestsk) {
+               sk = bestsk;
+       } else if (backupsk) {
+               /* It has been sent on all subflows once - let's give it a
+                * chance again by restarting its pathmask.
+                */
+               if (skb)
+                       TCP_SKB_CB(skb)->path_mask = 0;
+               sk = backupsk;
+       }
+
+       return sk;
+}
+
+/* Returns the next segment to be sent from the mptcp meta-queue.
+ * (chooses the reinject queue if any segment is waiting in it, otherwise,
+ * chooses the normal write queue).
+ * Sets *@reinject to 1 if the returned segment comes from the
+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
+ * and sets it to -1 if it is a meta-level retransmission to optimize the
+ * receive-buffer.
+ */
+static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct sk_buff *skb = NULL;
+
+       *reinject = 0;
+
+       /* If we are in fallback-mode, just take from the meta-send-queue */
+       if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
+               return tcp_send_head(meta_sk);
+
+       skb = skb_peek(&mpcb->reinject_queue);
+
+       if (skb)
+               *reinject = 1;
+       else
+               skb = tcp_send_head(meta_sk);
+       return skb;
+}
+
+static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
+                                            int *reinject,
+                                            struct sock **subsk,
+                                            unsigned int *limit)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct sock *sk_it, *choose_sk = NULL;
+       struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
+       unsigned char split = num_segments;
+       unsigned char iter = 0, full_subs = 0;
+
+       /* As we set it, we have to reset it as well. */
+       *limit = 0;
+
+       if (!skb)
+               return NULL;
+
+       if (*reinject) {
+               *subsk = rr_get_available_subflow(meta_sk, skb, false);
+               if (!*subsk)
+                       return NULL;
+
+               return skb;
+       }
+
+retry:
+
+       /* First, we look for a subflow that is currently being used */
+       mptcp_for_each_sk(mpcb, sk_it) {
+               struct tcp_sock *tp_it = tcp_sk(sk_it);
+               struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
+
+               if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
+                       continue;
+
+               iter++;
+
+               /* Is this subflow currently being used? */
+               if (rsp->quota > 0 && rsp->quota < num_segments) {
+                       split = num_segments - rsp->quota;
+                       choose_sk = sk_it;
+                       goto found;
+               }
+
+               /* Or, it's totally unused */
+               if (!rsp->quota) {
+                       split = num_segments;
+                       choose_sk = sk_it;
+               }
+
+               /* Or, it must then be fully used  */
+               if (rsp->quota >= num_segments)
+                       full_subs++;
+       }
+
+       /* All considered subflows have a full quota, and we considered at
+        * least one.
+        */
+       if (iter && iter == full_subs) {
+               /* So, we restart this round by setting quota to 0 and retry
+                * to find a subflow.
+                */
+               mptcp_for_each_sk(mpcb, sk_it) {
+                       struct tcp_sock *tp_it = tcp_sk(sk_it);
+                       struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
+
+                       if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
+                               continue;
+
+                       rsp->quota = 0;
+               }
+
+               goto retry;
+       }
+
+found:
+       if (choose_sk) {
+               unsigned int mss_now;
+               struct tcp_sock *choose_tp = tcp_sk(choose_sk);
+               struct rrsched_priv *rsp = rrsched_get_priv(choose_tp);
+
+               if (!mptcp_rr_is_available(choose_sk, skb, false, true))
+                       return NULL;
+
+               *subsk = choose_sk;
+               mss_now = tcp_current_mss(*subsk);
+               *limit = split * mss_now;
+
+               if (skb->len > mss_now)
+                       rsp->quota += DIV_ROUND_UP(skb->len, mss_now);
+               else
+                       rsp->quota++;
+
+               return skb;
+       }
+
+       return NULL;
+}
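+
+/* A worked example of the quota logic above, assuming num_segments == 2 and
+ * two available subflows A and B: A starts totally unused (quota 0), so it
+ * is chosen with split = 2 and *limit = 2 * mss_now. After one mss-sized
+ * segment its quota is 1; the next call finds it "currently being used" and
+ * hands it the remaining split of 1. Once its quota reaches 2 it counts as
+ * full and B takes over, and when every available subflow is full the retry
+ * path resets all quotas to 0 and a new round begins.
+ */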
+
+static struct mptcp_sched_ops mptcp_sched_rr = {
+       .get_subflow = rr_get_available_subflow,
+       .next_segment = mptcp_rr_next_segment,
+       .name = "roundrobin",
+       .owner = THIS_MODULE,
+};
+
+static int __init rr_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
+
+       if (mptcp_register_scheduler(&mptcp_sched_rr))
+               return -1;
+
+       return 0;
+}
+
+static void rr_unregister(void)
+{
+       mptcp_unregister_scheduler(&mptcp_sched_rr);
+}
+
+module_init(rr_register);
+module_exit(rr_unregister);
+
+MODULE_AUTHOR("Christoph Paasch");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
+MODULE_VERSION("0.89");
diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
new file mode 100644 (file)
index 0000000..50ec2e4
--- /dev/null
@@ -0,0 +1,634 @@
+/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
+
+#include <linux/module.h>
+#include <net/mptcp.h>
+
+static DEFINE_SPINLOCK(mptcp_sched_list_lock);
+static LIST_HEAD(mptcp_sched_list);
+
+struct defsched_priv {
+       u32     last_rbuf_opti;
+};
+
+static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
+{
+       return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
+}
+
+bool mptcp_is_def_unavailable(struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       /* Set of states for which we are allowed to send data */
+       if (!mptcp_sk_can_send(sk))
+               return true;
+
+       /* We do not send data on this subflow unless it is
+        * fully established, i.e. the 4th ack has been received.
+        */
+       if (tp->mptcp->pre_established)
+               return true;
+
+       if (tp->pf)
+               return true;
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable);
+
+static bool mptcp_is_temp_unavailable(struct sock *sk,
+                                     const struct sk_buff *skb,
+                                     bool zero_wnd_test)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       unsigned int mss_now, space, in_flight;
+
+       if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
+               /* If SACK is disabled, and we got a loss, TCP does not exit
+                * the loss-state until something above high_seq has been
+                * acked. (see tcp_try_undo_recovery)
+                *
+                * high_seq is the snd_nxt at the moment of the RTO. As soon
+                * as we have an RTO, we won't push data on the subflow.
+                * Thus, snd_una can never go beyond high_seq.
+                */
+               if (!tcp_is_reno(tp))
+                       return true;
+               else if (tp->snd_una != tp->high_seq)
+                       return true;
+       }
+
+       if (!tp->mptcp->fully_established) {
+               /* Make sure that we send in-order data */
+               if (skb && tp->mptcp->second_packet &&
+                   tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
+                       return true;
+       }
+
+       /* If TSQ is already throttling us, do not send on this subflow. When
+        * TSQ gets cleared the subflow becomes eligible again.
+        */
+       if (test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags))
+               return true;
+
+       in_flight = tcp_packets_in_flight(tp);
+       /* Not even a single spot in the cwnd */
+       if (in_flight >= tp->snd_cwnd)
+               return true;
+
+       /* Now, check if what is queued in the subflow's send-queue
+        * already fills the cwnd.
+        */
+       space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+       if (tp->write_seq - tp->snd_nxt > space)
+               return true;
+
+       if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
+               return true;
+
+       mss_now = tcp_current_mss(sk);
+
+       /* Don't send on this subflow if we bypass the allowed send-window at
+        * the per-subflow level. Similar to tcp_snd_wnd_test, but manually
+        * calculated end_seq (because here at this point end_seq is still at
+        * the meta-level).
+        */
+       if (skb && !zero_wnd_test &&
+           after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
+               return true;
+
+       return false;
+}
+
+/* Is the sub-socket sk available to send the skb? */
+bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
+                       bool zero_wnd_test)
+{
+       return !mptcp_is_def_unavailable(sk) &&
+              !mptcp_is_temp_unavailable(sk, skb, zero_wnd_test);
+}
+EXPORT_SYMBOL_GPL(mptcp_is_available);
+
+/* Are we not allowed to reinject this skb on tp? */
+static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
+{
+       /* If the skb has already been enqueued in this sk, try to find
+        * another one.
+        */
+       return skb &&
+               /* Has the skb already been enqueued into this subsocket? */
+               mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
+}
+
+bool subflow_is_backup(const struct tcp_sock *tp)
+{
+       return tp->mptcp->rcv_low_prio || tp->mptcp->low_prio;
+}
+EXPORT_SYMBOL_GPL(subflow_is_backup);
+
+bool subflow_is_active(const struct tcp_sock *tp)
+{
+       return !tp->mptcp->rcv_low_prio && !tp->mptcp->low_prio;
+}
+EXPORT_SYMBOL_GPL(subflow_is_active);
+
+/* Generic function to iterate over used and unused subflows and to select the
+ * best one
+ */
+static struct sock
+*get_subflow_from_selectors(struct mptcp_cb *mpcb, struct sk_buff *skb,
+                           bool (*selector)(const struct tcp_sock *),
+                           bool zero_wnd_test, bool *force)
+{
+       struct sock *bestsk = NULL;
+       u32 min_srtt = 0xffffffff;
+       bool found_unused = false;
+       bool found_unused_una = false;
+       struct sock *sk;
+
+       mptcp_for_each_sk(mpcb, sk) {
+               struct tcp_sock *tp = tcp_sk(sk);
+               bool unused = false;
+
+               /* First, we choose only the wanted sks */
+               if (!(*selector)(tp))
+                       continue;
+
+               if (!mptcp_dont_reinject_skb(tp, skb))
+                       unused = true;
+               else if (found_unused)
+                       /* If an unused sk was found previously, we continue -
+                        * no need to check used sks anymore.
+                        */
+                       continue;
+
+               if (mptcp_is_def_unavailable(sk))
+                       continue;
+
+               if (mptcp_is_temp_unavailable(sk, skb, zero_wnd_test)) {
+                       if (unused)
+                               found_unused_una = true;
+                       continue;
+               }
+
+               if (unused) {
+                       if (!found_unused) {
+                               /* It's the first time we encounter an unused
+                                * sk - thus we reset the bestsk (which might
+                                * have been set to a used sk).
+                                */
+                               min_srtt = 0xffffffff;
+                               bestsk = NULL;
+                       }
+                       found_unused = true;
+               }
+
+               if (tp->srtt_us < min_srtt) {
+                       min_srtt = tp->srtt_us;
+                       bestsk = sk;
+               }
+       }
+
+       if (bestsk) {
+               /* The force variable is used to mark the returned sk as
+                * previously used or not-used.
+                */
+               if (found_unused)
+                       *force = true;
+               else
+                       *force = false;
+       } else {
+               /* The force variable is used to mark if there are temporarily
+                * unavailable not-used sks.
+                */
+               if (found_unused_una)
+                       *force = true;
+               else
+                       *force = false;
+       }
+
+       return bestsk;
+}
+
+/* This is the scheduler. This function decides on which flow to send
+ * a given MSS. If all subflows are found to be busy, NULL is returned
+ * The flow is selected based on the shortest RTT.
+ * If all paths have full cong windows, we simply return NULL.
+ *
+ * Additionally, this function is aware of the backup-subflows.
+ */
+struct sock *get_available_subflow(struct sock *meta_sk, struct sk_buff *skb,
+                                  bool zero_wnd_test)
+{
+       struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct sock *sk;
+       bool force;
+
+       /* if there is only one subflow, bypass the scheduling function */
+       if (mpcb->cnt_subflows == 1) {
+               sk = (struct sock *)mpcb->connection_list;
+               if (!mptcp_is_available(sk, skb, zero_wnd_test))
+                       sk = NULL;
+               return sk;
+       }
+
+       /* Answer data_fin on same subflow!!! */
+       if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
+           skb && mptcp_is_data_fin(skb)) {
+               mptcp_for_each_sk(mpcb, sk) {
+                       if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
+                           mptcp_is_available(sk, skb, zero_wnd_test))
+                               return sk;
+               }
+       }
+
+       /* Find the best subflow */
+       sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_active,
+                                       zero_wnd_test, &force);
+       if (force)
+               /* one unused active sk or one NULL sk when there is at least
+                * one temporarily unavailable unused active sk
+                */
+               return sk;
+
+       sk = get_subflow_from_selectors(mpcb, skb, &subflow_is_backup,
+                                       zero_wnd_test, &force);
+       if (!force && skb)
+               /* one used backup sk, or one NULL sk when there is no
+                * temporarily unavailable unused backup sk
+                *
+                * the skb passed through all the available active and backups
+                * sks, so clean the path mask
+                */
+               TCP_SKB_CB(skb)->path_mask = 0;
+       return sk;
+}
+EXPORT_SYMBOL_GPL(get_available_subflow);
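+
+/* A sketch of the two-pass selection with hypothetical subflows: given two
+ * active subflows at srtt 10 ms and 40 ms plus one backup, the first pass
+ * returns the 10 ms flow (lowest srtt among the unused active flows). Only
+ * when no active flow is usable does the second pass consider the backup,
+ * and an skb that has already visited every flow gets its path_mask cleared
+ * so that it can be scheduled once more.
+ */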
+
+static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
+{
+       struct sock *meta_sk;
+       const struct tcp_sock *tp = tcp_sk(sk);
+       struct tcp_sock *tp_it;
+       struct sk_buff *skb_head;
+       struct defsched_priv *dsp = defsched_get_priv(tp);
+
+       if (tp->mpcb->cnt_subflows == 1)
+               return NULL;
+
+       meta_sk = mptcp_meta_sk(sk);
+       skb_head = tcp_write_queue_head(meta_sk);
+
+       if (!skb_head || skb_head == tcp_send_head(meta_sk))
+               return NULL;
+
+       /* If penalization is optional (coming from mptcp_next_segment()) and
+        * we are not send-buffer-limited, we do not penalize. The retransmission
+        * is just an optimization to fix the idle-time due to the delay before
+        * we wake up the application.
+        */
+       if (!penal && sk_stream_memory_free(meta_sk))
+               goto retrans;
+
+       /* Only penalize again after an RTT has elapsed */
+       if (tcp_jiffies32 - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
+               goto retrans;
+
+       /* Half the cwnd of the slow flow */
+       mptcp_for_each_tp(tp->mpcb, tp_it) {
+               if (tp_it != tp &&
+                   TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
+                       if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
+                               u32 prior_cwnd = tp_it->snd_cwnd;
+
+                               tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
+
+                               /* If in slow start, do not reduce the ssthresh */
+                               if (prior_cwnd >= tp_it->snd_ssthresh)
+                                       tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
+
+                               dsp->last_rbuf_opti = tcp_jiffies32;
+                       }
+                       break;
+               }
+       }
+
+retrans:
+
+       /* Segment not yet injected into this path? Take it!!! */
+       if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
+               bool do_retrans = false;
+
+               mptcp_for_each_tp(tp->mpcb, tp_it) {
+                       if (tp_it != tp &&
+                           TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
+                               if (tp_it->snd_cwnd <= 4) {
+                                       do_retrans = true;
+                                       break;
+                               }
+
+                               if (4 * tp->srtt_us >= tp_it->srtt_us) {
+                                       do_retrans = false;
+                                       break;
+                               }
+
+                               do_retrans = true;
+                       }
+               }
+
+               if (do_retrans && mptcp_is_available(sk, skb_head, false))
+                       return skb_head;
+       }
+       return NULL;
+}
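+
+/* The penalization above in numbers (hypothetical): if this subflow has
+ * srtt 20 ms while the flow still holding the queue head sits at 100 ms in
+ * TCP_CA_Open, the slow flow's cwnd is halved (e.g. 32 -> 16) and, unless
+ * it was still in slow start, its ssthresh as well (floor of 2). The head
+ * is then reinjected here only if the slow flow's cwnd is <= 4 or its RTT
+ * exceeds four times ours, and only if this subflow is available.
+ */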
+
+/* Returns the next segment to be sent from the mptcp meta-queue.
+ * (chooses the reinject queue if any segment is waiting in it, otherwise,
+ * chooses the normal write queue).
+ * Sets *@reinject to 1 if the returned segment comes from the
+ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
+ * and sets it to -1 if it is a meta-level retransmission to optimize the
+ * receive-buffer.
+ */
+static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
+{
+       const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
+       struct sk_buff *skb = NULL;
+
+       *reinject = 0;
+
+       /* If we are in fallback-mode, just take from the meta-send-queue */
+       if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
+               return tcp_send_head(meta_sk);
+
+       skb = skb_peek(&mpcb->reinject_queue);
+
+       if (skb) {
+               *reinject = 1;
+       } else {
+               skb = tcp_send_head(meta_sk);
+
+               if (!skb && meta_sk->sk_socket &&
+                   test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
+                   sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
+                       struct sock *subsk = get_available_subflow(meta_sk, NULL,
+                                                                  false);
+                       if (!subsk)
+                               return NULL;
+
+                       skb = mptcp_rcv_buf_optimization(subsk, 0);
+                       if (skb)
+                               *reinject = -1;
+               }
+       }
+       return skb;
+}
+
+static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
+                                         int *reinject,
+                                         struct sock **subsk,
+                                         unsigned int *limit)
+{
+       struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
+       unsigned int mss_now;
+       struct tcp_sock *subtp;
+       u16 gso_max_segs;
+       u32 max_len, max_segs, window, needed;
+
+       /* As we set it, we have to reset it as well. */
+       *limit = 0;
+
+       if (!skb)
+               return NULL;
+
+       *subsk = get_available_subflow(meta_sk, skb, false);
+       if (!*subsk)
+               return NULL;
+
+       subtp = tcp_sk(*subsk);
+       mss_now = tcp_current_mss(*subsk);
+
+       if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
+               skb = mptcp_rcv_buf_optimization(*subsk, 1);
+               if (skb)
+                       *reinject = -1;
+               else
+                       return NULL;
+       }
+
+       /* No splitting required, as we will only send one single segment */
+       if (skb->len <= mss_now)
+               return skb;
+
+       /* The following is similar to tcp_mss_split_point, but
+        * we do not care about Nagle, because we will anyway
+        * use TCP_NAGLE_PUSH, which overrides this.
+        *
+        * So, we first limit according to the cwnd/gso-size and then according
+        * to the subflow's window.
+        */
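+       /* Worked example (illustrative, not from the original code):
+        * with mss_now = 1400, tcp_cwnd_test() allowing 4 segments but
+        * gso_max_segs = 2, we get max_segs = 2 and max_len = 2800. For
+        * an skb of 4000 bytes and a subflow window of 2000 bytes,
+        * max_len <= skb->len holds, so *limit = 2800 and the skb is
+        * split at that boundary.
+        */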
+
+       gso_max_segs = (*subsk)->sk_gso_max_segs;
+       if (!gso_max_segs) /* No gso supported on the subflow's NIC */
+               gso_max_segs = 1;
+       max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
+       if (!max_segs)
+               return NULL;
+
+       max_len = mss_now * max_segs;
+       window = tcp_wnd_end(subtp) - subtp->write_seq;
+
+       needed = min(skb->len, window);
+       if (max_len <= skb->len)
+               /* Take max_len, which is limited by the cwnd/gso-size */
+               *limit = max_len;
+       else
+               /* Otherwise, take the subflow's window */
+               *limit = needed;
+
+       return skb;
+}
+
+static void defsched_init(struct sock *sk)
+{
+       struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
+
+       dsp->last_rbuf_opti = tcp_jiffies32;
+}
+
+struct mptcp_sched_ops mptcp_sched_default = {
+       .get_subflow = get_available_subflow,
+       .next_segment = mptcp_next_segment,
+       .init = defsched_init,
+       .name = "default",
+       .owner = THIS_MODULE,
+};
+
+static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
+{
+       struct mptcp_sched_ops *e;
+
+       list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
+               if (strcmp(e->name, name) == 0)
+                       return e;
+       }
+
+       return NULL;
+}
+
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
+{
+       int ret = 0;
+
+       if (!sched->get_subflow || !sched->next_segment)
+               return -EINVAL;
+
+       spin_lock(&mptcp_sched_list_lock);
+       if (mptcp_sched_find(sched->name)) {
+               pr_notice("%s already registered\n", sched->name);
+               ret = -EEXIST;
+       } else {
+               list_add_tail_rcu(&sched->list, &mptcp_sched_list);
+               pr_info("%s registered\n", sched->name);
+       }
+       spin_unlock(&mptcp_sched_list_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
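+
+/* Usage sketch (illustrative only, not part of this patch): a module
+ * providing its own scheduler only needs to supply the two mandatory
+ * callbacks checked above and register itself. Here, my_get_subflow()
+ * and my_next_segment() are hypothetical functions matching the
+ * prototypes used by mptcp_sched_default:
+ *
+ *	static struct mptcp_sched_ops my_sched = {
+ *		.get_subflow	= my_get_subflow,
+ *		.next_segment	= my_next_segment,
+ *		.name		= "mysched",
+ *		.owner		= THIS_MODULE,
+ *	};
+ *
+ *	static int __init my_sched_init(void)
+ *	{
+ *		return mptcp_register_scheduler(&my_sched);
+ *	}
+ *
+ *	static void __exit my_sched_exit(void)
+ *	{
+ *		mptcp_unregister_scheduler(&my_sched);
+ *	}
+ *	module_init(my_sched_init);
+ *	module_exit(my_sched_exit);
+ */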
+
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
+{
+       spin_lock(&mptcp_sched_list_lock);
+       list_del_rcu(&sched->list);
+       spin_unlock(&mptcp_sched_list_lock);
+
+       /* Wait for outstanding readers to complete before the
+        * module gets removed entirely.
+        *
+        * A try_module_get() should fail by now, as our module is
+        * in the "going" state: no refs are held anymore and the
+        * module_exit() handler is being called.
+        */
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
+
+void mptcp_get_default_scheduler(char *name)
+{
+       struct mptcp_sched_ops *sched;
+
+       WARN_ON(list_empty(&mptcp_sched_list));
+
+       rcu_read_lock();
+       sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
+       strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
+       rcu_read_unlock();
+}
+
+int mptcp_set_default_scheduler(const char *name)
+{
+       struct mptcp_sched_ops *sched;
+       int ret = -ENOENT;
+
+       spin_lock(&mptcp_sched_list_lock);
+       sched = mptcp_sched_find(name);
+#ifdef CONFIG_MODULES
+       if (!sched && capable(CAP_NET_ADMIN)) {
+               spin_unlock(&mptcp_sched_list_lock);
+
+               request_module("mptcp_%s", name);
+               spin_lock(&mptcp_sched_list_lock);
+               sched = mptcp_sched_find(name);
+       }
+#endif
+
+       if (sched) {
+               list_move(&sched->list, &mptcp_sched_list);
+               ret = 0;
+       } else {
+               pr_info("%s is not available\n", name);
+       }
+       spin_unlock(&mptcp_sched_list_lock);
+
+       return ret;
+}
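+
+/* Note (illustrative, not from the original code): the list head doubles
+ * as the "default" entry, so mptcp_set_default_scheduler() simply moves
+ * the chosen scheduler to the front, and mptcp_get_default_scheduler()
+ * reads it back. Assuming the sysctl hook wired up elsewhere in this
+ * patch, an administrator would switch defaults with e.g.:
+ *
+ *	sysctl -w net.mptcp.mptcp_scheduler=roundrobin
+ */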
+
+/* Must be called with rcu lock held */
+static struct mptcp_sched_ops *__mptcp_sched_find_autoload(const char *name)
+{
+       struct mptcp_sched_ops *sched = mptcp_sched_find(name);
+#ifdef CONFIG_MODULES
+       if (!sched && capable(CAP_NET_ADMIN)) {
+               rcu_read_unlock();
+               request_module("mptcp_%s", name);
+               rcu_read_lock();
+               sched = mptcp_sched_find(name);
+       }
+#endif
+       return sched;
+}
+
+void mptcp_init_scheduler(struct mptcp_cb *mpcb)
+{
+       struct mptcp_sched_ops *sched;
+       struct sock *meta_sk = mpcb->meta_sk;
+       struct tcp_sock *meta_tp = tcp_sk(meta_sk);
+
+       rcu_read_lock();
+       /* If the scheduler was set using the socket option */
+       if (meta_tp->mptcp_sched_setsockopt) {
+               sched = __mptcp_sched_find_autoload(meta_tp->mptcp_sched_name);
+               if (sched && try_module_get(sched->owner)) {
+                       mpcb->sched_ops = sched;
+                       goto out;
+               }
+       }
+
+       list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+               if (try_module_get(sched->owner)) {
+                       mpcb->sched_ops = sched;
+                       break;
+               }
+       }
+out:
+       rcu_read_unlock();
+}
+
+/* Change scheduler for socket */
+int mptcp_set_scheduler(struct sock *sk, const char *name)
+{
+       struct mptcp_sched_ops *sched;
+       int err = 0;
+
+       rcu_read_lock();
+       sched = __mptcp_sched_find_autoload(name);
+
+       if (!sched) {
+               err = -ENOENT;
+       } else if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+               err = -EPERM;
+       } else {
+               strcpy(tcp_sk(sk)->mptcp_sched_name, name);
+               tcp_sk(sk)->mptcp_sched_setsockopt = 1;
+       }
+       rcu_read_unlock();
+
+       return err;
+}
+
+/* Manage refcounts on socket close. */
+void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
+{
+       module_put(mpcb->sched_ops->owner);
+}
+
+/* Set default value from kernel configuration at bootup */
+static int __init mptcp_scheduler_default(void)
+{
+       BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
+
+       return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
+}
+late_initcall(mptcp_scheduler_default);
diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
new file mode 100644 (file)
index 0000000..ab29137
--- /dev/null
@@ -0,0 +1,269 @@
+/*
+ *     MPTCP implementation - WEIGHTED VEGAS
+ *
+ *     Algorithm design:
+ *     Yu Cao <cyAnalyst@126.com>
+ *     Mingwei Xu <xmw@csnet1.cs.tsinghua.edu.cn>
+ *     Xiaoming Fu <fu@cs.uni-goettinggen.de>
+ *
+ *     Implementation:
+ *     Yu Cao <cyAnalyst@126.com>
+ *     Enhuan Dong <deh13@mails.tsinghua.edu.cn>
+ *
+ *     Ported to the official MPTCP-kernel:
+ *     Christoph Paasch <christoph.paasch@uclouvain.be>
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include <linux/module.h>
+#include <linux/tcp.h>
+
+static int initial_alpha = 2;
+static int total_alpha = 10;
+static int gamma = 1;
+
+module_param(initial_alpha, int, 0644);
+MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
+module_param(total_alpha, int, 0644);
+MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
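+
+/* Illustrative usage (not part of the original code): these parameters
+ * can be given at load time, e.g. "modprobe mptcp_wvegas total_alpha=12",
+ * or changed later via /sys/module/mptcp_wvegas/parameters/ since they
+ * are registered with mode 0644.
+ */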
+
+#define MPTCP_WVEGAS_SCALE 16
+
+/* wVegas variables */
+struct wvegas {
+       u32     beg_snd_nxt;    /* right edge during last RTT */
+       u8      doing_wvegas_now; /* if true, do wvegas for this RTT */
+
+       u16     cnt_rtt;        /* # of RTTs measured within last RTT */
+       u32     sampled_rtt;    /* cumulative RTTs measured within last RTT (in usec) */
+       u32     base_rtt;       /* the min of all wVegas RTT measurements seen (in usec) */
+
+       u64     instant_rate;   /* cwnd / srtt_us, unit: pkts/us * 2^16 */
+       u64     weight;         /* the ratio of this subflow's rate to the total rate, * 2^16 */
+       int     alpha;          /* alpha for each subflow */
+
+       u32     queue_delay;    /* queue delay */
+};
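+
+/* Note (illustrative): weight and instant_rate use 16.16 fixed point, so
+ * a subflow carrying a quarter of the total rate has weight
+ * 0.25 * 2^16 = 16384.
+ */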
+
+static inline u64 mptcp_wvegas_scale(u32 val, int scale)
+{
+       return (u64) val << scale;
+}
+
+static void wvegas_enable(const struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       struct wvegas *wvegas = inet_csk_ca(sk);
+
+       wvegas->doing_wvegas_now = 1;
+
+       wvegas->beg_snd_nxt = tp->snd_nxt;
+
+       wvegas->cnt_rtt = 0;
+       wvegas->sampled_rtt = 0;
+
+       wvegas->instant_rate = 0;
+       wvegas->alpha = initial_alpha;
+       wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
+
+       wvegas->queue_delay = 0;
+}
+
+static inline void wvegas_disable(const struct sock *sk)
+{
+       struct wvegas *wvegas = inet_csk_ca(sk);
+
+       wvegas->doing_wvegas_now = 0;
+}
+
+static void mptcp_wvegas_init(struct sock *sk)
+{
+       struct wvegas *wvegas = inet_csk_ca(sk);
+
+       wvegas->base_rtt = 0x7fffffff;
+       wvegas_enable(sk);
+}
+
+static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
+{
+       return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
+}
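+
+/* E.g. (illustrative): cwnd = 100 packets and rtt_us = 10000 give
+ * (100 << 16) / 10000 = 655, i.e. ~0.01 pkts/us in 16.16 fixed point.
+ */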
+
+static void mptcp_wvegas_pkts_acked(struct sock *sk,
+                                   const struct ack_sample *sample)
+{
+       struct wvegas *wvegas = inet_csk_ca(sk);
+       u32 vrtt;
+
+       if (sample->rtt_us < 0)
+               return;
+
+       vrtt = sample->rtt_us + 1;
+
+       if (vrtt < wvegas->base_rtt)
+               wvegas->base_rtt = vrtt;
+
+       wvegas->sampled_rtt += vrtt;
+       wvegas->cnt_rtt++;
+}
+
+static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
+{
+       if (ca_state == TCP_CA_Open)
+               wvegas_enable(sk);
+       else
+               wvegas_disable(sk);
+}
+
+static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+{
+       if (event == CA_EVENT_CWND_RESTART) {
+               mptcp_wvegas_init(sk);
+       } else if (event == CA_EVENT_LOSS) {
+               struct wvegas *wvegas = inet_csk_ca(sk);
+
+               wvegas->instant_rate = 0;
+       }
+}
+
+static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
+{
+       return min(tp->snd_ssthresh, tp->snd_cwnd);
+}
+
+static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
+{
+       u64 total_rate = 0;
+       struct sock *sub_sk;
+       const struct wvegas *wvegas = inet_csk_ca(sk);
+
+       if (!mpcb)
+               return wvegas->weight;
+
+       mptcp_for_each_sk(mpcb, sub_sk) {
+               struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
+
+               /* sampled_rtt is initialized to 0 */
+               if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
+                       total_rate += sub_wvegas->instant_rate;
+       }
+
+       if (total_rate && wvegas->instant_rate)
+               return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
+       else
+               return wvegas->weight;
+}
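+
+/* E.g. (illustrative): two active subflows with instant_rate 300 and 100
+ * (both in pkts/us * 2^16) give total_rate = 400; the first subflow's
+ * weight is (300 << 16) / 400 = 49152, i.e. 0.75 in 16.16 fixed point.
+ */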
+
+static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct wvegas *wvegas = inet_csk_ca(sk);
+
+       if (!wvegas->doing_wvegas_now) {
+               tcp_reno_cong_avoid(sk, ack, acked);
+               return;
+       }
+
+       if (after(ack, wvegas->beg_snd_nxt)) {
+               wvegas->beg_snd_nxt = tp->snd_nxt;
+
+               if (wvegas->cnt_rtt <= 2) {
+                       tcp_reno_cong_avoid(sk, ack, acked);
+               } else {
+                       u32 rtt, diff, q_delay;
+                       u64 target_cwnd;
+
+                       rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
+                       target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
+
+                       diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
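+                       /* Illustrative numbers (not from the original
+                        * code): snd_cwnd = 10, base_rtt = 10000us and
+                        * rtt = 12500us give diff = 10 * 2500 / 12500 = 2,
+                        * i.e. roughly two packets queued in the network.
+                        */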
+
+                       if (diff > gamma && tcp_in_slow_start(tp)) {
+                               tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd + 1);
+                               tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
+                       } else if (tcp_in_slow_start(tp)) {
+                               tcp_slow_start(tp, acked);
+                       } else {
+                               if (diff >= wvegas->alpha) {
+                                       wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
+                                       wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
+                                       wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
+                               }
+                               if (diff > wvegas->alpha) {
+                                       tp->snd_cwnd--;
+                                       tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
+                               } else if (diff < wvegas->alpha) {
+                                       tp->snd_cwnd++;
+                               }
+
+                               /* Try to drain the link queue if needed */
+                               q_delay = rtt - wvegas->base_rtt;
+                               if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
+                                       wvegas->queue_delay = q_delay;
+
+                               if (q_delay >= 2 * wvegas->queue_delay) {
+                                       u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
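+                                       /* Illustrative: base_rtt = 10000us
+                                        * and rtt = 25000us give a factor of
+                                        * (10000 << 16) / 50000 = 13107,
+                                        * i.e. ~0.2, cutting cwnd to ~20%.
+                                        */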
+
+                                       tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
+                                       wvegas->queue_delay = 0;
+                               }
+                       }
+
+                       if (tp->snd_cwnd < 2)
+                               tp->snd_cwnd = 2;
+                       else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
+                               tp->snd_cwnd = tp->snd_cwnd_clamp;
+
+                       tp->snd_ssthresh = tcp_current_ssthresh(sk);
+               }
+
+               wvegas->cnt_rtt = 0;
+               wvegas->sampled_rtt = 0;
+       } else if (tcp_in_slow_start(tp)) {
+               /* Use normal slow start */
+               tcp_slow_start(tp, acked);
+       }
+}
+
+static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
+       .init           = mptcp_wvegas_init,
+       .ssthresh       = tcp_reno_ssthresh,
+       .cong_avoid     = mptcp_wvegas_cong_avoid,
+       .pkts_acked     = mptcp_wvegas_pkts_acked,
+       .set_state      = mptcp_wvegas_state,
+       .cwnd_event     = mptcp_wvegas_cwnd_event,
+
+       .owner          = THIS_MODULE,
+       .name           = "wvegas",
+};
+
+static int __init mptcp_wvegas_register(void)
+{
+       BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
+       tcp_register_congestion_control(&mptcp_wvegas);
+       return 0;
+}
+
+static void __exit mptcp_wvegas_unregister(void)
+{
+       tcp_unregister_congestion_control(&mptcp_wvegas);
+}
+
+module_init(mptcp_wvegas_register);
+module_exit(mptcp_wvegas_unregister);
+
+MODULE_AUTHOR("Yu Cao, Enhuan Dong");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MPTCP wVegas");
+MODULE_VERSION("0.1");