tcp: Try to restore large SKBs while SACK processing

author Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>

Tue, 25 Nov 2008 05:20:15 +0000 (21:20 -0800)

committer David S. Miller <davem@davemloft.net>

Tue, 25 Nov 2008 05:20:15 +0000 (21:20 -0800)
author Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Tue, 25 Nov 2008 05:20:15 +0000 (21:20 -0800)
committer David S. Miller <davem@davemloft.net>
Tue, 25 Nov 2008 05:20:15 +0000 (21:20 -0800)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

index a01b6f8..acf17af 100644 (file)
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -493,6 +493,19 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list,
  }
  
  /**
+ *     skb_queue_is_first - check if skb is the first entry in the queue
+ *     @list: queue head
+ *     @skb: buffer
+ *
+ *     Returns true if @skb is the first buffer on the list.
+ */
+static inline bool skb_queue_is_first(const struct sk_buff_head *list,
+                                     const struct sk_buff *skb)
+{
+       return (skb->prev == (struct sk_buff *) list);
+}
+
+/**
   *     skb_queue_next - return the next packet in the queue
   *     @list: queue head
   *     @skb: current buffer
@@ -511,6 +524,24 @@ static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
  }
  
  /**
+ *     skb_queue_prev - return the prev packet in the queue
+ *     @list: queue head
+ *     @skb: current buffer
+ *
+ *     Return the prev packet in @list before @skb.  It is only valid to
+ *     call this if skb_queue_is_first() evaluates to false.
+ */
+static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
+                                            const struct sk_buff *skb)
+{
+       /* This BUG_ON may seem severe, but if we just return then we
+        * are going to dereference garbage.
+        */
+       BUG_ON(skb_queue_is_first(list, skb));
+       return skb->prev;
+}
+
+/**
   *     skb_get - reference buffer
   *     @skb: buffer to reference
   *
@@ -1652,6 +1683,8 @@ extern int             skb_splice_bits(struct sk_buff *skb,
  extern void           skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
  extern void           skb_split(struct sk_buff *skb,
                                  struct sk_buff *skb1, const u32 len);
+extern int            skb_shift(struct sk_buff *tgt, struct sk_buff *skb,
+                                int shiftlen);
  
  extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
  
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 90b4c3b..2653924 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1192,6 +1192,11 @@ static inline struct sk_buff *tcp_write_queue_next(struct sock *sk, struct sk_bu
         return skb_queue_next(&sk->sk_write_queue, skb);
  }
  
+static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_buff *skb)
+{
+       return skb_queue_prev(&sk->sk_write_queue, skb);
+}
+
  #define tcp_for_write_queue(skb, sk)                                   \
         skb_queue_walk(&(sk)->sk_write_queue, skb)
  
diff --git a/net/core/skbuff.c b/net/core/skbuff.c

index 267185a..844b8ab 100644 (file)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2018,6 +2018,146 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
                 skb_split_no_header(skb, skb1, len, pos);
  }
  
+/* Shifting from/to a cloned skb is a no-go.
+ *
+ * TODO: handle cloned skbs by using pskb_expand_head()
+ */
+static int skb_prepare_for_shift(struct sk_buff *skb)
+{
+       return skb_cloned(skb);
+}
+
+/**
+ * skb_shift - Shifts paged data partially from one skb to another
+ * @tgt: buffer into which tail data gets added
+ * @skb: buffer from which the paged data comes
+ * @shiftlen: shift up to this many bytes
+ *
+ * Attempts to shift up to shiftlen worth of bytes, which may be less than
+ * the length of the skb, from skb to tgt. Returns the number of bytes shifted.
+ * It's up to the caller to free skb if everything was shifted.
+ *
+ * If @tgt runs out of frags, the whole operation is aborted.
+ *
+ * Skb cannot include anything else but paged data while tgt is allowed
+ * to have non-paged data as well.
+ *
+ * TODO: full sized shift could be optimized but that would need
+ * specialized skb free'er to handle frags without up-to-date nr_frags.
+ */
+int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
+{
+       int from, to, merge, todo;
+       struct skb_frag_struct *fragfrom, *fragto;
+
+       BUG_ON(shiftlen > skb->len);
+       BUG_ON(skb_headlen(skb));       /* Would corrupt stream */
+
+       todo = shiftlen;
+       from = 0;
+       to = skb_shinfo(tgt)->nr_frags;
+       fragfrom = &skb_shinfo(skb)->frags[from];
+
+       /* Actual merge is delayed until the point when we know we can
+        * commit all, so that we don't have to undo partial changes
+        */
+       if (!to ||
+           !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+               merge = -1;
+       } else {
+               merge = to - 1;
+
+               todo -= fragfrom->size;
+               if (todo < 0) {
+                       if (skb_prepare_for_shift(skb) ||
+                           skb_prepare_for_shift(tgt))
+                               return 0;
+
+                       fragto = &skb_shinfo(tgt)->frags[merge];
+
+                       fragto->size += shiftlen;
+                       fragfrom->size -= shiftlen;
+                       fragfrom->page_offset += shiftlen;
+
+                       goto onlymerged;
+               }
+
+               from++;
+       }
+
+       /* Skip full, not-fitting skb to avoid expensive operations */
+       if ((shiftlen == skb->len) &&
+           (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
+               return 0;
+
+       if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
+               return 0;
+
+       while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
+               if (to == MAX_SKB_FRAGS)
+                       return 0;
+
+               fragfrom = &skb_shinfo(skb)->frags[from];
+               fragto = &skb_shinfo(tgt)->frags[to];
+
+               if (todo >= fragfrom->size) {
+                       *fragto = *fragfrom;
+                       todo -= fragfrom->size;
+                       from++;
+                       to++;
+
+               } else {
+                       get_page(fragfrom->page);
+                       fragto->page = fragfrom->page;
+                       fragto->page_offset = fragfrom->page_offset;
+                       fragto->size = todo;
+
+                       fragfrom->page_offset += todo;
+                       fragfrom->size -= todo;
+                       todo = 0;
+
+                       to++;
+                       break;
+               }
+       }
+
+       /* Ready to "commit" this state change to tgt */
+       skb_shinfo(tgt)->nr_frags = to;
+
+       if (merge >= 0) {
+               fragfrom = &skb_shinfo(skb)->frags[0];
+               fragto = &skb_shinfo(tgt)->frags[merge];
+
+               fragto->size += fragfrom->size;
+               put_page(fragfrom->page);
+       }
+
+       /* Reposition in the original skb */
+       to = 0;
+       while (from < skb_shinfo(skb)->nr_frags)
+               skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
+       skb_shinfo(skb)->nr_frags = to;
+
+       BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);
+
+onlymerged:
+       /* Most likely the tgt won't ever need its checksum anymore, skb on
+        * the other hand might need it if it needs to be resent
+        */
+       tgt->ip_summed = CHECKSUM_PARTIAL;
+       skb->ip_summed = CHECKSUM_PARTIAL;
+
+       /* Yak, is it really working this way? Some helper please? */
+       skb->len -= shiftlen;
+       skb->data_len -= shiftlen;
+       skb->truesize -= shiftlen;
+       tgt->len += shiftlen;
+       tgt->data_len += shiftlen;
+       tgt->truesize += shiftlen;
+
+       return shiftlen;
+}
+
  /**
   * skb_prepare_seq_read - Prepare a sequential read of skb data
   * @skb: the buffer to read
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index 3c8e297..97d5767 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1242,6 +1242,8 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb,
   * aligned portion of it that matches. Therefore we might need to fragment
   * which may fail and creates some hassle (caller must handle error case
   * returns).
+ *
+ * FIXME: this could be merged to shift decision code
   */
  static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
                                  u32 start_seq, u32 end_seq)
@@ -1353,9 +1355,6 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
  
                 if (fack_count > tp->fackets_out)
                         tp->fackets_out = fack_count;
-
-               if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp)))
-                       tcp_advance_highest_sack(sk, skb);
         }
  
         /* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1370,12 +1369,231 @@ static int tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
         return flag;
  }
  
+/* Update TCP bookkeeping after @shifted bytes (@pcount segments) have been
+ * moved from @skb to @prev.
+ *
+ * Adjusts the sequence ranges and gso_segs of both skbs, runs
+ * tcp_sacktag_one() for the shifted segments and clears the retransmit
+ * hints.
+ *
+ * Returns 1 if @skb became empty and was unlinked from the write queue and
+ * freed, 0 if @skb still holds data.
+ */
+static int tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
+                          struct sk_buff *skb, unsigned int pcount,
+                          int shifted, int fack_count, int *reord,
+                          int *flag, int mss)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       u8 dummy_sacked = TCP_SKB_CB(skb)->sacked;      /* We discard results */
+
+       BUG_ON(!pcount);
+
+       /* The shifted bytes now belong to prev's sequence range */
+       TCP_SKB_CB(prev)->end_seq += shifted;
+       TCP_SKB_CB(skb)->seq += shifted;
+
+       skb_shinfo(prev)->gso_segs += pcount;
+       BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
+       skb_shinfo(skb)->gso_segs -= pcount;
+
+       /* When we're adding to gso_segs == 1, gso_size will be zero,
+        * in theory this shouldn't be necessary but as long as DSACK
+        * code can come after this skb later on it's better to keep
+        * setting gso_size to something.
+        */
+       if (!skb_shinfo(prev)->gso_size) {
+               skb_shinfo(prev)->gso_size = mss;
+               skb_shinfo(prev)->gso_type = sk->sk_gso_type;
+       }
+
+       /* CHECKME: To clear or not to clear? Mimics normal skb currently */
+       if (skb_shinfo(skb)->gso_segs <= 1) {
+               skb_shinfo(skb)->gso_size = 0;
+               skb_shinfo(skb)->gso_type = 0;
+       }
+
+       *flag |= tcp_sacktag_one(skb, sk, reord, 0, fack_count, &dummy_sacked,
+                                pcount);
+
+       /* Difference in this won't matter, both ACKed by the same cumul. ACK */
+       TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
+
+       tcp_clear_all_retrans_hints(tp);
+
+       if (skb->len > 0) {
+               BUG_ON(!tcp_skb_pcount(skb));
+               return 0;
+       }
+
+       /* Whole SKB was eaten :-) */
+
+       /* Carry the eaten skb's flags over to prev, which now holds its
+        * data. skb is unlinked and freed just below, so merging the flags
+        * into skb instead would simply lose them.
+        */
+       TCP_SKB_CB(prev)->flags |= TCP_SKB_CB(skb)->flags;
+       if (skb == tcp_highest_sack(sk))
+               tcp_advance_highest_sack(sk, skb);
+
+       tcp_unlink_write_queue(skb, sk);
+       sk_wmem_free_skb(sk, skb);
+
+       return 1;
+}
+
+/* I wish gso_size would have a bit more sane initialization than
+ * something-or-zero which complicates things
+ */
+static int tcp_shift_mss(struct sk_buff *skb)
+{
+       int mss = tcp_skb_mss(skb);
+
+       if (!mss)
+               mss = skb->len;
+
+       return mss;
+}
+
+/* Shifting pages past head area doesn't work */
+static int skb_can_shift(struct sk_buff *skb)
+{
+       return !skb_headlen(skb) && skb_is_nonlinear(skb);
+}
+
+/* Try collapsing SACK blocks spanning across multiple skbs to a single
+ * skb.
+ */
+static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
+                                         u32 start_seq, u32 end_seq,
+                                         int dup_sack, int *fack_count,
+                                         int *reord, int *flag)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *prev;
+       int mss;
+       int pcount = 0;
+       int len;
+       int in_sack;
+
+       if (!sk_can_gso(sk))
+               goto fallback;
+
+       /* Normally R but no L won't result in plain S */
+       if (!dup_sack &&
+           (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) == TCPCB_SACKED_RETRANS)
+               goto fallback;
+       if (!skb_can_shift(skb))
+               goto fallback;
+       /* This frame is about to be dropped (was ACKed). */
+       if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+               goto fallback;
+
+       /* Can only happen with delayed DSACK + discard craziness */
+       if (unlikely(skb == tcp_write_queue_head(sk)))
+               goto fallback;
+       prev = tcp_write_queue_prev(sk, skb);
+
+       if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+               goto fallback;
+
+       in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+                 !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+       if (in_sack) {
+               len = skb->len;
+               pcount = tcp_skb_pcount(skb);
+               mss = tcp_shift_mss(skb);
+
+               /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                * drop this restriction as unnecessary
+                */
+               if (mss != tcp_shift_mss(prev))
+                       goto fallback;
+       } else {
+               if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
+                       goto noop;
+               /* CHECKME: This is non-MSS split case only?, this will
+                * cause skipped skbs due to advancing loop btw, original
+                * has that feature too
+                */
+               if (tcp_skb_pcount(skb) <= 1)
+                       goto noop;
+
+               in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
+               if (!in_sack) {
+                       /* TODO: head merge to next could be attempted here
+                        * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
+                        * though it might not be worth the additional hassle
+                        *
+                        * ...we can probably just fall back to what was done
+                        * previously. We could try merging non-SACKed ones
+                        * as well but it probably isn't going to pay off
+                        * because later SACKs might again split them, and
+                        * it would make skb timestamp tracking a considerably
+                        * harder problem.
+                        */
+                       goto fallback;
+               }
+
+               len = end_seq - TCP_SKB_CB(skb)->seq;
+               BUG_ON(len < 0);
+               BUG_ON(len > skb->len);
+
+               /* MSS boundaries should be honoured or else pcount will
+                * severely break, even though it makes things a bit trickier.
+                * Optimize common case to avoid most of the divides
+                */
+               mss = tcp_skb_mss(skb);
+
+               /* TODO: Fix DSACKs to not fragment already SACKed and we can
+                * drop this restriction as unnecessary
+                */
+               if (mss != tcp_shift_mss(prev))
+                       goto fallback;
+
+               if (len == mss) {
+                       pcount = 1;
+               } else if (len < mss) {
+                       goto noop;
+               } else {
+                       pcount = len / mss;
+                       len = pcount * mss;
+               }
+       }
+
+       if (!skb_shift(prev, skb, len))
+               goto fallback;
+       if (!tcp_shifted_skb(sk, prev, skb, pcount, len, *fack_count, reord,
+                            flag, mss))
+               goto out;
+
+       /* A filled hole allows collapsing with the next skb as well; this is
+        * very useful when a hole-on-every-nth-skb pattern occurs
+        */
+       if (prev == tcp_write_queue_tail(sk))
+               goto out;
+       skb = tcp_write_queue_next(sk, prev);
+
+       if (!skb_can_shift(skb))
+               goto out;
+       if (skb == tcp_send_head(sk))
+               goto out;
+       if ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
+               goto out;
+
+       len = skb->len;
+       if (skb_shift(prev, skb, len)) {
+               pcount += tcp_skb_pcount(skb);
+               tcp_shifted_skb(sk, prev, skb, tcp_skb_pcount(skb), len,
+                               *fack_count, reord, flag, mss);
+       }
+
+out:
+       *fack_count += pcount;
+       return prev;
+
+noop:
+       return skb;
+
+fallback:
+       return NULL;
+}
+
  static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                         struct tcp_sack_block *next_dup,
                                         u32 start_seq, u32 end_seq,
                                         int dup_sack_in, int *fack_count,
                                         int *reord, int *flag)
  {
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *tmp;
+
         tcp_for_write_queue_from(skb, sk) {
                 int in_sack = 0;
                 int dup_sack = dup_sack_in;
@@ -1396,18 +1614,42 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
                                 dup_sack = 1;
                 }
  
-               if (in_sack <= 0)
-                       in_sack = tcp_match_skb_to_sack(sk, skb, start_seq,
-                                                       end_seq);
+               /* skb reference here is a bit tricky to get right, since
+                * shifting can eat and free both this skb and the next,
+                * so not even the _safe variant of the loop is enough.
+                */
+               if (in_sack <= 0) {
+                       tmp = tcp_shift_skb_data(sk, skb, start_seq,
+                                                end_seq, dup_sack,
+                                                fack_count, reord, flag);
+                       if (tmp != NULL) {
+                               if (tmp != skb) {
+                                       skb = tmp;
+                                       continue;
+                               }
+
+                               in_sack = 0;
+                       } else {
+                               in_sack = tcp_match_skb_to_sack(sk, skb,
+                                                               start_seq,
+                                                               end_seq);
+                       }
+               }
+
                 if (unlikely(in_sack < 0))
                         break;
  
-               if (in_sack)
+               if (in_sack) {
                         *flag |= tcp_sacktag_one(skb, sk, reord, dup_sack,
                                                  *fack_count,
                                                  &(TCP_SKB_CB(skb)->sacked),
                                                  tcp_skb_pcount(skb));
  
+                       if (!before(TCP_SKB_CB(skb)->seq,
+                                   tcp_highest_sack_seq(tp)))
+                               tcp_advance_highest_sack(sk, skb);
+               }
+
                 *fack_count += tcp_skb_pcount(skb);
         }
         return skb;
author	Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
	Tue, 25 Nov 2008 05:20:15 +0000 (21:20 -0800)
committer	David S. Miller <davem@davemloft.net>
	Tue, 25 Nov 2008 05:20:15 +0000 (21:20 -0800)
include/linux/skbuff.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/core/skbuff.c		patch \| blob \| history
net/ipv4/tcp_input.c		patch \| blob \| history