mptcp: never shrink offered window
authorPaolo Abeni <pabeni@redhat.com>
Wed, 4 May 2022 21:54:07 +0000 (14:54 -0700)
committerJakub Kicinski <kuba@kernel.org>
Fri, 6 May 2022 02:00:15 +0000 (19:00 -0700)
As per RFC, the offered MPTCP-level window should never shrink.
While we currently track the right edge, we don't enforce the
above constraint on the wire.
Additionally, concurrent xmit on different subflows can end up
producing an erroneous right-edge update.
Address the above by explicitly updating the announced window and
protecting the update with an additional atomic operation (sic).

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net/mptcp/options.c
net/mptcp/protocol.c
net/mptcp/protocol.h

index 2570911..3e3156c 100644 (file)
@@ -1224,20 +1224,58 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
        return true;
 }
 
-static void mptcp_set_rwin(const struct tcp_sock *tp)
+static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
 {
        const struct sock *ssk = (const struct sock *)tp;
-       const struct mptcp_subflow_context *subflow;
+       struct mptcp_subflow_context *subflow;
+       u64 ack_seq, rcv_wnd_old, rcv_wnd_new;
        struct mptcp_sock *msk;
-       u64 ack_seq;
+       u32 new_win;
+       u64 win;
 
        subflow = mptcp_subflow_ctx(ssk);
        msk = mptcp_sk(subflow->conn);
 
-       ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
+       ack_seq = READ_ONCE(msk->ack_seq);
+       rcv_wnd_new = ack_seq + tp->rcv_wnd;
+
+       rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent);
+       if (after64(rcv_wnd_new, rcv_wnd_old)) {
+               u64 rcv_wnd;
 
-       if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
-               WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+               for (;;) {
+                       rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new);
+
+                       if (rcv_wnd == rcv_wnd_old)
+                               break;
+                       if (before64(rcv_wnd_new, rcv_wnd))
+                               goto raise_win;
+                       rcv_wnd_old = rcv_wnd;
+               }
+               return;
+       }
+
+       if (rcv_wnd_new != rcv_wnd_old) {
+raise_win:
+               win = rcv_wnd_old - ack_seq;
+               tp->rcv_wnd = min_t(u64, win, U32_MAX);
+               new_win = tp->rcv_wnd;
+
+               /* Make sure we do not exceed the maximum possible
+                * scaled window.
+                */
+               if (unlikely(th->syn))
+                       new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
+               if (!tp->rx_opt.rcv_wscale &&
+                   sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
+                       new_win = min(new_win, MAX_TCP_WINDOW);
+               else
+                       new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+               /* RFC1323 scaling applied */
+               new_win >>= tp->rx_opt.rcv_wscale;
+               th->window = htons(new_win);
+       }
 }
 
 u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum)
@@ -1554,7 +1592,7 @@ mp_capable_done:
        }
 
        if (tp)
-               mptcp_set_rwin(tp);
+               mptcp_set_rwin(tp, th);
 }
 
 __be32 mptcp_get_reset_option(const struct sk_buff *skb)
index 961c2ab..9e46cc8 100644 (file)
@@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 
        seq = MPTCP_SKB_CB(skb)->map_seq;
        end_seq = MPTCP_SKB_CB(skb)->end_seq;
-       max_seq = READ_ONCE(msk->rcv_wnd_sent);
+       max_seq = atomic64_read(&msk->rcv_wnd_sent);
 
        pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
                 RB_EMPTY_ROOT(&msk->out_of_order_queue));
@@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
                mptcp_drop(sk, skb);
                pr_debug("oow by %lld, rcv_wnd_sent %llu\n",
                         (unsigned long long)end_seq - (unsigned long)max_seq,
-                        (unsigned long long)msk->rcv_wnd_sent);
+                        (unsigned long long)atomic64_read(&msk->rcv_wnd_sent));
                MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW);
                return;
        }
@@ -3004,7 +3004,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
                mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq);
                ack_seq++;
                WRITE_ONCE(msk->ack_seq, ack_seq);
-               WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
+               atomic64_set(&msk->rcv_wnd_sent, ack_seq);
        }
 
        sock_reset_flag(nsk, SOCK_RCU_FREE);
@@ -3297,9 +3297,9 @@ void mptcp_finish_connect(struct sock *ssk)
        WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
        WRITE_ONCE(msk->snd_nxt, msk->write_seq);
        WRITE_ONCE(msk->ack_seq, ack_seq);
-       WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
        WRITE_ONCE(msk->can_ack, 1);
        WRITE_ONCE(msk->snd_una, msk->write_seq);
+       atomic64_set(&msk->rcv_wnd_sent, ack_seq);
 
        mptcp_pm_new_connection(msk, ssk, 0);
 
index f542aea..4672901 100644 (file)
@@ -257,7 +257,7 @@ struct mptcp_sock {
        u64             write_seq;
        u64             snd_nxt;
        u64             ack_seq;
-       u64             rcv_wnd_sent;
+       atomic64_t      rcv_wnd_sent;
        u64             rcv_data_fin_seq;
        int             rmem_fwd_alloc;
        struct sock     *last_snd;