rxrpc: Fix congestion management
author David Howells <dhowells@redhat.com>
Mon, 3 Oct 2022 17:49:11 +0000 (18:49 +0100)
committer David Howells <dhowells@redhat.com>
Tue, 8 Nov 2022 16:42:28 +0000 (16:42 +0000)
rxrpc has a problem in its congestion management in that it saves the
congestion window size (cwnd) from one call to another, but if this is 0 at
the time it is saved, then the next call may never actually manage to
transmit anything.

To this end:

 (1) Don't save cwnd between calls, but rather reset back down to the
     initial cwnd and re-enter slow-start if data transmission is idle for
     more than an RTT.

 (2) Preserve ssthresh instead, as that is a handy estimate of pipe
     capacity.  Knowing roughly when to stop slow start and enter
     congestion avoidance can reduce the tendency to overshoot and drop
     larger amounts of packets when probing.

In future, cwnd growth also needs to be constrained when the window isn't
being filled due to being application limited.

Reported-by: Simon Wilkinson <sxw@auristor.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org

include/trace/events/rxrpc.h
net/rxrpc/ar-internal.h
net/rxrpc/call_accept.c
net/rxrpc/call_object.c
net/rxrpc/conn_client.c
net/rxrpc/conn_object.c
net/rxrpc/input.c
net/rxrpc/output.c
net/rxrpc/peer_object.c
net/rxrpc/proc.c
net/rxrpc/sendmsg.c

index a11de55..b9886d1 100644 (file)
        EM(rxrpc_cong_new_low_nack,             " NewLowN") \
        EM(rxrpc_cong_no_change,                " -") \
        EM(rxrpc_cong_progress,                 " Progres") \
+       EM(rxrpc_cong_idle_reset,               " IdleRes") \
        EM(rxrpc_cong_retransmit_again,         " ReTxAgn") \
        EM(rxrpc_cong_rtt_window_end,           " RttWinE") \
        E_(rxrpc_cong_saw_nack,                 " SawNack")
index 775eb91..6bbe28e 100644 (file)
@@ -332,7 +332,7 @@ struct rxrpc_peer {
        u32                     rto_j;          /* Retransmission timeout in jiffies */
        u8                      backoff;        /* Backoff timeout */
 
-       u8                      cong_cwnd;      /* Congestion window size */
+       u8                      cong_ssthresh;  /* Congestion slow-start threshold */
 };
 
 /*
@@ -626,6 +626,7 @@ struct rxrpc_call {
        u16                     tx_backoff;     /* Delay to insert due to Tx failure */
        u8                      tx_winsize;     /* Maximum size of Tx window */
 #define RXRPC_TX_MAX_WINDOW    128
+       ktime_t                 tx_last_sent;   /* Last time a transmission occurred */
 
        /* Received data tracking */
        struct sk_buff_head     recvmsg_queue;  /* Queue of packets ready for recvmsg() */
@@ -687,10 +688,10 @@ struct rxrpc_call {
  * Summary of a new ACK and the changes it made to the Tx buffer packet states.
  */
 struct rxrpc_ack_summary {
+       u16                     nr_acks;                /* Number of ACKs in packet */
+       u16                     nr_new_acks;            /* Number of new ACKs in packet */
+       u16                     nr_rot_new_acks;        /* Number of rotated new ACKs */
        u8                      ack_reason;
-       u8                      nr_acks;                /* Number of ACKs in packet */
-       u8                      nr_new_acks;            /* Number of new ACKs in packet */
-       u8                      nr_rot_new_acks;        /* Number of rotated new ACKs */
        bool                    saw_nacks;              /* Saw NACKs in packet */
        bool                    new_low_nack;           /* T if new low NACK found */
        bool                    retrans_timeo;          /* T if reTx due to timeout happened */
index d8db277..48790ee 100644 (file)
@@ -324,7 +324,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
        call->security = conn->security;
        call->security_ix = conn->security_ix;
        call->peer = rxrpc_get_peer(conn->params.peer);
-       call->cong_cwnd = call->peer->cong_cwnd;
+       call->cong_ssthresh = call->peer->cong_ssthresh;
+       call->tx_last_sent = ktime_get_real();
        return call;
 }
 
index aa19daa..1befe22 100644 (file)
@@ -166,7 +166,12 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
        call->rx_winsize = rxrpc_rx_window_size;
        call->tx_winsize = 16;
 
-       call->cong_cwnd = 2;
+       if (RXRPC_TX_SMSS > 2190)
+               call->cong_cwnd = 2;
+       else if (RXRPC_TX_SMSS > 1095)
+               call->cong_cwnd = 3;
+       else
+               call->cong_cwnd = 4;
        call->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
 
        call->rxnet = rxnet;
index 3c9eeb5..f020f30 100644 (file)
@@ -363,7 +363,8 @@ static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx,
        if (!cp->peer)
                goto error;
 
-       call->cong_cwnd = cp->peer->cong_cwnd;
+       call->tx_last_sent = ktime_get_real();
+       call->cong_ssthresh = cp->peer->cong_ssthresh;
        if (call->cong_cwnd >= call->cong_ssthresh)
                call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
        else
index f7ea71a..156bd26 100644 (file)
@@ -207,7 +207,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
 {
        struct rxrpc_connection *conn = call->conn;
 
-       call->peer->cong_cwnd = call->cong_cwnd;
+       call->peer->cong_ssthresh = call->cong_ssthresh;
 
        if (!hlist_unhashed(&call->error_link)) {
                spin_lock_bh(&call->peer->lock);
index 5c17fed..bdf70b8 100644 (file)
@@ -58,6 +58,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
        summary->cumulative_acks = cumulative_acks;
        summary->dup_acks = call->cong_dup_acks;
 
+       /* If we haven't transmitted anything for >1RTT, we should reset the
+        * congestion management state.
+        */
+       if ((call->cong_mode == RXRPC_CALL_SLOW_START ||
+            call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) &&
+           ktime_before(ktime_add_us(call->tx_last_sent,
+                                     call->peer->srtt_us >> 3),
+                        ktime_get_real())
+           ) {
+               change = rxrpc_cong_idle_reset;
+               summary->mode = RXRPC_CALL_SLOW_START;
+               if (RXRPC_TX_SMSS > 2190)
+                       summary->cwnd = 2;
+               else if (RXRPC_TX_SMSS > 1095)
+                       summary->cwnd = 3;
+               else
+                       summary->cwnd = 4;
+       }
+
        switch (call->cong_mode) {
        case RXRPC_CALL_SLOW_START:
                if (summary->saw_nacks)
@@ -205,7 +224,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 
        if (call->acks_lowest_nak == call->acks_hard_ack) {
                call->acks_lowest_nak = to;
-       } else if (before_eq(call->acks_lowest_nak, to)) {
+       } else if (after(to, call->acks_lowest_nak)) {
                summary->new_low_nack = true;
                call->acks_lowest_nak = to;
        }
index 2c3f7e4..46432e7 100644 (file)
@@ -501,6 +501,7 @@ dont_set_request_ack:
 
 done:
        if (ret >= 0) {
+               call->tx_last_sent = txb->last_sent;
                if (txb->wire.flags & RXRPC_REQUEST_ACK) {
                        call->peer->rtt_last_req = txb->last_sent;
                        if (call->peer->rtt_count > 1) {
index 26d2ae9..041a512 100644 (file)
@@ -227,12 +227,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
 
                rxrpc_peer_init_rtt(peer);
 
-               if (RXRPC_TX_SMSS > 2190)
-                       peer->cong_cwnd = 2;
-               else if (RXRPC_TX_SMSS > 1095)
-                       peer->cong_cwnd = 3;
-               else
-                       peer->cong_cwnd = 4;
+               peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
                trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here);
        }
 
index 0807753..fae22a8 100644 (file)
@@ -217,7 +217,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
                seq_puts(seq,
                         "Proto Local                                          "
                         " Remote                                         "
-                        " Use  CW   MTU LastUse      RTT      RTO\n"
+                        " Use SST   MTU LastUse      RTT      RTO\n"
                         );
                return 0;
        }
@@ -235,7 +235,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
                   lbuff,
                   rbuff,
                   refcount_read(&peer->ref),
-                  peer->cong_cwnd,
+                  peer->cong_ssthresh,
                   peer->mtu,
                   now - peer->last_tx_at,
                   peer->srtt_us >> 3,
index 9b567af..e5fd8a9 100644 (file)
  */
 static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
 {
-       unsigned int win_size =
-               min_t(unsigned int, call->tx_winsize,
-                     call->cong_cwnd + call->cong_extra);
+       unsigned int win_size;
        rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack);
 
+       /* If we haven't transmitted anything for >1RTT, we should reset the
+        * congestion management state.
+        */
+       if (ktime_before(ktime_add_us(call->tx_last_sent,
+                                     call->peer->srtt_us >> 3),
+                        ktime_get_real())) {
+               if (RXRPC_TX_SMSS > 2190)
+                       win_size = 2;
+               else if (RXRPC_TX_SMSS > 1095)
+                       win_size = 3;
+               else
+                       win_size = 4;
+               win_size += call->cong_extra;
+       } else {
+               win_size = min_t(unsigned int, call->tx_winsize,
+                                call->cong_cwnd + call->cong_extra);
+       }
+
        if (_tx_win)
                *_tx_win = tx_win;
        return call->tx_top - tx_win < win_size;