tcp: track data delivery rate for a TCP connection

author Yuchung Cheng <ycheng@google.com>

Tue, 20 Sep 2016 03:39:14 +0000 (23:39 -0400)

committer David S. Miller <davem@davemloft.net>

Wed, 21 Sep 2016 04:23:00 +0000 (00:23 -0400)
author Yuchung Cheng <ycheng@google.com>
Tue, 20 Sep 2016 03:39:14 +0000 (23:39 -0400)
committer David S. Miller <davem@davemloft.net>
Wed, 21 Sep 2016 04:23:00 +0000 (00:23 -0400)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h

index 38590fb..c50e6ae 100644 (file)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -268,6 +268,8 @@ struct tcp_sock {
         u32     prr_out;        /* Total number of pkts sent during Recovery. */
         u32     delivered;      /* Total data packets delivered incl. rexmits */
         u32     lost;           /* Total data packets lost incl. rexmits */
+       struct skb_mstamp first_tx_mstamp;  /* start of window send phase */
+       struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
  
         u32     rcv_wnd;        /* Current receiver window              */
         u32     write_seq;      /* Tail(+1) of data held in tcp send buffer */
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 2f1648a..b261c89 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -763,8 +763,14 @@ struct tcp_skb_cb {
         __u32           ack_seq;        /* Sequence number ACK'd        */
         union {
                 struct {
-                       /* There is space for up to 20 bytes */
+                       /* There is space for up to 24 bytes */
                         __u32 in_flight;/* Bytes in flight when packet sent */
+                       /* pkts S/ACKed so far upon tx of skb, incl retrans: */
+                       __u32 delivered;
+                       /* start of send pipeline phase */
+                       struct skb_mstamp first_tx_mstamp;
+                       /* when we reached the "delivered" count */
+                       struct skb_mstamp delivered_mstamp;
                 } tx;   /* only used for outgoing skbs */
                 union {
                         struct inet_skb_parm    h4;
@@ -860,6 +866,26 @@ struct ack_sample {
         u32 in_flight;
  };
  
+/* A rate sample measures the number of (original/retransmitted) data
+ * packets delivered "delivered" over an interval of time "interval_us".
+ * The tcp_rate.c code fills in the rate sample, and congestion
+ * control modules that define a cong_control function to run at the end
+ * of ACK processing can optionally choose to consult this sample when
+ * setting cwnd and pacing rate.
+ * A sample is invalid if "delivered" or "interval_us" is negative.
+ */
+struct rate_sample {
+       struct  skb_mstamp prior_mstamp; /* starting timestamp for interval */
+       u32  prior_delivered;   /* tp->delivered at "prior_mstamp" */
+       s32  delivered;         /* pkts delivered over interval (<0: invalid) */
+       long interval_us;       /* time tp->delivered took to grow by "delivered" */
+       long rtt_us;            /* RTT of last (S)ACKed packet (or -1) */
+       int  losses;            /* number of packets marked lost upon ACK */
+       u32  acked_sacked;      /* number of packets newly (S)ACKed upon ACK */
+       u32  prior_in_flight;   /* in flight before this ACK */
+       bool is_retrans;        /* is sample from retransmission? */
+};
+
  struct tcp_congestion_ops {
         struct list_head        list;
         u32 key;
@@ -946,6 +972,13 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
                 icsk->icsk_ca_ops->cwnd_event(sk, event);
  }
  
+/* From tcp_rate.c */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+                           struct rate_sample *rs);
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+                 struct skb_mstamp *now, struct rate_sample *rs);
+
  /* These functions determine how the current flow behaves in respect of SACK
   * handling. SACK is negotiated with the peer, and therefore it can vary
   * between different flows.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile

index 24629b6..9cfff1a 100644 (file)
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y     := route.o inetpeer.o protocol.o \
              inet_timewait_sock.o inet_connection_sock.o \
              tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
              tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-            tcp_recovery.o \
+            tcp_rate.o tcp_recovery.o \
              tcp_offload.o datagram.o raw.o udp.o udplite.o \
              udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
              fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index 9413288..d9ed4bb 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1112,6 +1112,7 @@ struct tcp_sacktag_state {
          */
         struct skb_mstamp first_sackt;
         struct skb_mstamp last_sackt;
+       struct rate_sample *rate;
         int     flag;
  };
  
@@ -1279,6 +1280,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
         tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
                         start_seq, end_seq, dup_sack, pcount,
                         &skb->skb_mstamp);
+       tcp_rate_skb_delivered(sk, skb, state->rate);
  
         if (skb == tp->lost_skb_hint)
                 tp->lost_cnt_hint += pcount;
@@ -1329,6 +1331,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
                 tcp_advance_highest_sack(sk, skb);
  
         tcp_skb_collapse_tstamp(prev, skb);
+       if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64))
+               TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0;
+
         tcp_unlink_write_queue(skb, sk);
         sk_wmem_free_skb(sk, skb);
  
@@ -1558,6 +1563,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                                 dup_sack,
                                                 tcp_skb_pcount(skb),
                                                 &skb->skb_mstamp);
+                       tcp_rate_skb_delivered(sk, skb, state->rate);
  
                         if (!before(TCP_SKB_CB(skb)->seq,
                                     tcp_highest_sack_seq(tp)))
@@ -1640,8 +1646,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
  
         found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
                                          num_sacks, prior_snd_una);
-       if (found_dup_sack)
+       if (found_dup_sack) {
                 state->flag |= FLAG_DSACKING_ACK;
+               tp->delivered++; /* A spurious retransmission is delivered */
+       }
  
         /* Eliminate too old ACKs, but take into
          * account more or less fresh ones, they can
@@ -3071,10 +3079,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
   */
  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                                u32 prior_snd_una, int *acked,
-                              struct tcp_sacktag_state *sack)
+                              struct tcp_sacktag_state *sack,
+                              struct skb_mstamp *now)
  {
         const struct inet_connection_sock *icsk = inet_csk(sk);
-       struct skb_mstamp first_ackt, last_ackt, now;
+       struct skb_mstamp first_ackt, last_ackt;
         struct tcp_sock *tp = tcp_sk(sk);
         u32 prior_sacked = tp->sacked_out;
         u32 reord = tp->packets_out;
@@ -3106,7 +3115,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                         acked_pcount = tcp_tso_acked(sk, skb);
                         if (!acked_pcount)
                                 break;
-
                         fully_acked = false;
                 } else {
                         /* Speedup tcp_unlink_write_queue() and next loop */
@@ -3142,6 +3150,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  
                 tp->packets_out -= acked_pcount;
                 pkts_acked += acked_pcount;
+               tcp_rate_skb_delivered(sk, skb, sack->rate);
  
                 /* Initial outgoing SYN's get put onto the write_queue
                  * just like anything else we transmit.  It is not
@@ -3174,16 +3183,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
         if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                 flag |= FLAG_SACK_RENEGING;
  
-       skb_mstamp_get(&now);
         if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
-               seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
-               ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
+               seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt);
+               ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt);
         }
         if (sack->first_sackt.v64) {
-               sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt);
-               ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
+               sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt);
+               ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt);
         }
-
+       sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
         rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
                                         ca_rtt_us);
  
@@ -3211,7 +3219,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
                 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
  
         } else if (skb && rtt_update && sack_rtt_us >= 0 &&
-                  sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
+                  sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) {
                 /* Do not re-arm RTO if the sack RTT is measured from data sent
                  * after when the head was last (re)transmitted. Otherwise the
                  * timeout may continue to extend in loss recovery.
@@ -3548,17 +3556,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         struct inet_connection_sock *icsk = inet_csk(sk);
         struct tcp_sock *tp = tcp_sk(sk);
         struct tcp_sacktag_state sack_state;
+       struct rate_sample rs = { .prior_delivered = 0 };
         u32 prior_snd_una = tp->snd_una;
         u32 ack_seq = TCP_SKB_CB(skb)->seq;
         u32 ack = TCP_SKB_CB(skb)->ack_seq;
         bool is_dupack = false;
         u32 prior_fackets;
         int prior_packets = tp->packets_out;
-       u32 prior_delivered = tp->delivered;
+       u32 delivered = tp->delivered;
+       u32 lost = tp->lost;
         int acked = 0; /* Number of packets newly acked */
         int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+       struct skb_mstamp now;
  
         sack_state.first_sackt.v64 = 0;
+       sack_state.rate = &rs;
  
         /* We very likely will need to access write queue head. */
         prefetchw(sk->sk_write_queue.next);
@@ -3581,6 +3593,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         if (after(ack, tp->snd_nxt))
                 goto invalid_ack;
  
+       skb_mstamp_get(&now);
+
         if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                 tcp_rearm_rto(sk);
@@ -3591,6 +3605,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         }
  
         prior_fackets = tp->fackets_out;
+       rs.prior_in_flight = tcp_packets_in_flight(tp);
  
         /* ts_recent update must be made after we are sure that the packet
          * is in window.
@@ -3646,7 +3661,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
         /* See if we can take anything off of the retransmit queue. */
         flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
-                                   &sack_state);
+                                   &sack_state, &now);
  
         if (tcp_ack_is_dubious(sk, flag)) {
                 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
@@ -3663,7 +3678,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                 tcp_schedule_loss_probe(sk);
-       tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+       delivered = tp->delivered - delivered;  /* freshly ACKed or SACKed */
+       lost = tp->lost - lost;                 /* freshly marked lost */
+       tcp_rate_gen(sk, delivered, lost, &now, &rs);
+       tcp_cong_control(sk, ack, delivered, flag);
         tcp_xmit_recovery(sk, rexmit);
         return 1;
  
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 8b45794..e02c8eb 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -918,6 +918,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                 skb_mstamp_get(&skb->skb_mstamp);
                 TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
                         - tp->snd_una;
+               tcp_rate_skb_sent(sk, skb);
  
                 if (unlikely(skb_cloned(skb)))
                         skb = pskb_copy(skb, gfp_mask);
@@ -1213,6 +1214,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
         tcp_set_skb_tso_segs(skb, mss_now);
         tcp_set_skb_tso_segs(buff, mss_now);
  
+       /* Update delivered info for the new segment */
+       TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
+
         /* If this packet has been sent out already, we must
          * adjust the various packet counters.
          */
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c

new file mode 100644 (file)

index 0000000..1daed6a
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,149 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
+ *    bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits.  The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ */
+
+
+/* Stamp the skb, as it is transmitted, with the connection's current
+ * delivery state; tcp_rate_skb_delivered() turns that snapshot into a
+ * rate sample once the skb is (s)acked.
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+       struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       /* Rate samples generally have to start at the time the most
+        * recent ACK was received, so that they span the full time the
+        * network needs to deliver all in-flight packets. With nothing
+        * in flight yet, any ACK after now shows the network delivered
+        * those packets completely within the sampling interval between
+        * now and that ACK, so the interval is restarted here.
+        *
+        * packets_out is checked rather than tcp_packets_in_flight(tp):
+        * the latter is a guess based on RTO and loss-marking heuristics,
+        * and a spurious RTO or loss marking must not yield a spuriously
+        * small interval and hence a spuriously high bandwidth estimate.
+        */
+       if (tp->packets_out == 0) {
+               tp->delivered_mstamp = skb->skb_mstamp;
+               tp->first_tx_mstamp  = skb->skb_mstamp;
+       }
+
+       scb->tx.delivered        = tp->delivered;
+       scb->tx.delivered_mstamp = tp->delivered_mstamp;
+       scb->tx.first_tx_mstamp  = tp->first_tx_mstamp;
+}
+
+/* When an skb is sacked or acked, fill in the rate sample with the (prior)
+ * delivery information recorded when the skb was last transmitted. An skb
+ * whose tx.delivered_mstamp is zero carries no snapshot and is skipped.
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest tx.delivered snapshot.
+ */
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+                           struct rate_sample *rs)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+       if (!scb->tx.delivered_mstamp.v64)
+               return; /* no usable snapshot, e.g. skb was sacked before */
+
+       if (!rs->prior_delivered ||
+           after(scb->tx.delivered, rs->prior_delivered)) {
+               rs->prior_delivered  = scb->tx.delivered;
+               rs->prior_mstamp     = scb->tx.delivered_mstamp;
+               rs->is_retrans       = scb->sacked & TCPCB_RETRANS;
+
+               /* Find the duration of the "send phase" of this window: */
+               rs->interval_us      = skb_mstamp_us_delta(
+                                               &skb->skb_mstamp,
+                                               &scb->tx.first_tx_mstamp);
+
+               /* Record send time of most recently ACKed packet: */
+               tp->first_tx_mstamp  = skb->skb_mstamp;
+       }
+       /* Mark the skb as delivered once it is sacked, so that it is not
+        * used again when it is cumulatively acked. Acked skbs need no
+        * reset since they will be freed soon.
+        */
+       if (scb->sacked & TCPCB_SACKED_ACKED)
+               scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and fill in rate sample "rs"
+ * for an ACK seen at "now" with "delivered"/"lost" newly (S)ACKed/lost pkts.
+ */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+                 struct skb_mstamp *now, struct rate_sample *rs)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 snd_us, ack_us;
+
+       /* TODO: there are multiple places throughout tcp_ack() to get
+        * current time. Refactor the code using a new "tcp_acktag_state"
+        * to carry current time, flags, stats like "tcp_sacktag_state".
+        */
+       if (delivered)
+               tp->delivered_mstamp = *now;
+
+       rs->acked_sacked = delivered;   /* freshly ACKed or SACKed */
+       rs->losses = lost;              /* freshly marked lost */
+       /* Return an invalid sample if no timing information is available. */
+       if (!rs->prior_mstamp.v64) {
+               rs->delivered = -1;
+               rs->interval_us = -1;
+               return;
+       }
+       rs->delivered   = tp->delivered - rs->prior_delivered;
+
+       /* Model sending data and receiving ACKs as separate pipeline phases
+        * for a window. Usually the ACK phase is longer, but with ACK
+        * compression the send phase can be longer; to be safe use the longer.
+        */
+       snd_us = rs->interval_us;                               /* send phase */
+       ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp);   /* ack phase */
+       rs->interval_us = max(snd_us, ack_us);
+
+       /* Normally we expect interval_us >= min-rtt. The rate may still be
+        * over-estimated when a spuriously retransmitted skb was first
+        * (s)acked, as "interval_us" is then under-estimated (up to an RTT).
+        * Yet continuously measuring the delivery rate in loss recovery is
+        * crucial for connections that suffer heavy or prolonged losses.
+        */
+       if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+               if (!rs->is_retrans)
+                       pr_debug("tcp rate: %ld %d %u %u %u\n",
+                                rs->interval_us, rs->delivered,
+                                inet_csk(sk)->icsk_ca_state,
+                                tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+               rs->interval_us = -1;   /* invalidate only after logging it */
+       }
+}
author	Yuchung Cheng <ycheng@google.com>
	Tue, 20 Sep 2016 03:39:14 +0000 (23:39 -0400)
committer	David S. Miller <davem@davemloft.net>
	Wed, 21 Sep 2016 04:23:00 +0000 (00:23 -0400)
include/linux/tcp.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/ipv4/Makefile		patch \| blob \| history
net/ipv4/tcp_input.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history
net/ipv4/tcp_rate.c	[new file with mode: 0644]	patch \| blob