net: generalize skb freeing deferral to per-cpu lists

author Eric Dumazet <edumazet@google.com>

Fri, 22 Apr 2022 20:12:37 +0000 (13:12 -0700)

committer Jakub Kicinski <kuba@kernel.org>

Wed, 27 Apr 2022 00:05:59 +0000 (17:05 -0700)
author Eric Dumazet <edumazet@google.com>
Fri, 22 Apr 2022 20:12:37 +0000 (13:12 -0700)
committer Jakub Kicinski <kuba@kernel.org>
Wed, 27 Apr 2022 00:05:59 +0000 (17:05 -0700)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h

index 7dccbfd..ac8a5f7 100644 (file)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3081,6 +3081,11 @@ struct softnet_data {
         struct sk_buff_head     input_pkt_queue;
         struct napi_struct      backlog;
  
+       /* Another possibly contended cache line */
+       spinlock_t              defer_lock ____cacheline_aligned_in_smp;
+       int                     defer_count;
+       struct sk_buff          *defer_list;
+       call_single_data_t      defer_csd;
  };
  
  static inline void input_queue_head_incr(struct softnet_data *sd)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h

index 84d78df..5cbc184 100644 (file)
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -888,6 +888,7 @@ typedef unsigned char *sk_buff_data_t;
   *             delivery_time at egress.
   *     @napi_id: id of the NAPI struct this skb came from
   *     @sender_cpu: (aka @napi_id) source CPU in XPS
+ *     @alloc_cpu: CPU which did the skb allocation.
   *     @secmark: security marking
   *     @mark: Generic packet mark
   *     @reserved_tailroom: (aka @mark) number of bytes of free space available
@@ -1080,6 +1081,7 @@ struct sk_buff {
                 unsigned int    sender_cpu;
         };
  #endif
+       u16                     alloc_cpu;
  #ifdef CONFIG_NETWORK_SECMARK
         __u32           secmark;
  #endif
@@ -1321,6 +1323,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size);
  struct sk_buff *build_skb(void *data, unsigned int frag_size);
  struct sk_buff *build_skb_around(struct sk_buff *skb,
                                  void *data, unsigned int frag_size);
+void skb_attempt_defer_free(struct sk_buff *skb);
  
  struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
  
diff --git a/include/net/sock.h b/include/net/sock.h

index a01d6c4..f9f8eca 100644 (file)
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -292,7 +292,6 @@ struct sk_filter;
    *    @sk_pacing_shift: scaling factor for TCP Small Queues
    *    @sk_lingertime: %SO_LINGER l_linger setting
    *    @sk_backlog: always used with the per-socket spinlock held
-  *    @defer_list: head of llist storing skbs to be freed
    *    @sk_callback_lock: used with the callbacks in the end of this struct
    *    @sk_error_queue: rarely used
    *    @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
@@ -417,7 +416,6 @@ struct sock {
                 struct sk_buff  *head;
                 struct sk_buff  *tail;
         } sk_backlog;
-       struct llist_head defer_list;
  
  #define sk_rmem_alloc sk_backlog.rmem_alloc
  
diff --git a/include/net/tcp.h b/include/net/tcp.h

index 679b196..94a52ad 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1375,18 +1375,6 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
  bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
                      enum skb_drop_reason *reason);
  
-#ifdef CONFIG_INET
-void __sk_defer_free_flush(struct sock *sk);
-
-static inline void sk_defer_free_flush(struct sock *sk)
-{
-       if (llist_empty(&sk->defer_list))
-               return;
-       __sk_defer_free_flush(sk);
-}
-#else
-static inline void sk_defer_free_flush(struct sock *sk) {}
-#endif
  
  int tcp_filter(struct sock *sk, struct sk_buff *skb);
  void tcp_set_state(struct sock *sk, int state);
diff --git a/net/core/dev.c b/net/core/dev.c

index 4a77ebd..611bd71 100644 (file)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4545,6 +4545,12 @@ static void rps_trigger_softirq(void *data)
  
  #endif /* CONFIG_RPS */
  
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data __always_unused)
+{
+       __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
  /*
   * Check if this softnet_data structure is another cpu one
   * If yes, queue it to our IPI list and return 1
@@ -6571,6 +6577,28 @@ static int napi_threaded_poll(void *data)
         return 0;
  }
  
+static void skb_defer_free_flush(struct softnet_data *sd)
+{
+       struct sk_buff *skb, *next;
+       unsigned long flags;
+
+       /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
+       if (!READ_ONCE(sd->defer_list))
+               return;
+
+       spin_lock_irqsave(&sd->defer_lock, flags);
+       skb = sd->defer_list;
+       sd->defer_list = NULL;
+       sd->defer_count = 0;
+       spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+       while (skb != NULL) {
+               next = skb->next;
+               __kfree_skb(skb);
+               skb = next;
+       }
+}
+
  static __latent_entropy void net_rx_action(struct softirq_action *h)
  {
         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -6616,6 +6644,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
  
         net_rps_action_and_irq_enable(sd);
+       skb_defer_free_flush(sd);
  }
  
  struct netdev_adjacent {
@@ -11326,6 +11355,8 @@ static int __init net_dev_init(void)
                 INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
                 sd->cpu = i;
  #endif
+               INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+               spin_lock_init(&sd->defer_lock);
  
                 init_gro_hash(&sd->backlog);
                 sd->backlog.poll = process_backlog;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c

index 30b523f..028a280 100644 (file)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -204,7 +204,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
         skb_set_end_offset(skb, size);
         skb->mac_header = (typeof(skb->mac_header))~0U;
         skb->transport_header = (typeof(skb->transport_header))~0U;
-
+       skb->alloc_cpu = raw_smp_processor_id();
         /* make sure we initialize shinfo sequentially */
         shinfo = skb_shinfo(skb);
         memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
@@ -1037,6 +1037,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  #ifdef CONFIG_NET_RX_BUSY_POLL
         CHECK_SKB_FIELD(napi_id);
  #endif
+       CHECK_SKB_FIELD(alloc_cpu);
  #ifdef CONFIG_XPS
         CHECK_SKB_FIELD(sender_cpu);
  #endif
@@ -6486,3 +6487,51 @@ free_now:
  }
  EXPORT_SYMBOL(__skb_ext_put);
  #endif /* CONFIG_SKB_EXTENSIONS */
+
+/**
+ * skb_attempt_defer_free - queue skb for remote freeing
+ * @skb: buffer
+ *
+ * Put @skb in a per-cpu list, using the cpu which
+ * allocated the skb/pages to reduce false sharing
+ * and memory zone spinlock contention.
+ */
+void skb_attempt_defer_free(struct sk_buff *skb)
+{
+       int cpu = skb->alloc_cpu;
+       struct softnet_data *sd;
+       unsigned long flags;
+       bool kick;
+
+       if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+           !cpu_online(cpu) ||
+           cpu == raw_smp_processor_id()) {
+               __kfree_skb(skb);
+               return;
+       }
+
+       sd = &per_cpu(softnet_data, cpu);
+       /* In the common case we do not send an IPI or any signal:
+        * remote cpu will eventually call skb_defer_free_flush()
+        */
+       spin_lock_irqsave(&sd->defer_lock, flags);
+       skb->next = sd->defer_list;
+       /* Paired with READ_ONCE() in skb_defer_free_flush() */
+       WRITE_ONCE(sd->defer_list, skb);
+       sd->defer_count++;
+
+       /* Kick every time queue length reaches 128.
+        * This should avoid blocking in smp_call_function_single_async().
+        * This condition should hardly be hit under normal conditions,
+        * unless cpu suddenly stopped receiving NIC interrupts.
+        */
+       kick = sd->defer_count == 128;
+
+       spin_unlock_irqrestore(&sd->defer_lock, flags);
+
+       /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+        * if we are unlucky enough (this seems very unlikely).
+        */
+       if (unlikely(kick))
+               smp_call_function_single_async(cpu, &sd->defer_csd);
+}
diff --git a/net/core/sock.c b/net/core/sock.c

index 29abec3..a0f3989 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2082,9 +2082,6 @@ void sk_destruct(struct sock *sk)
  {
         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
  
-       WARN_ON_ONCE(!llist_empty(&sk->defer_list));
-       sk_defer_free_flush(sk);
-
         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
                 reuseport_detach_sock(sk);
                 use_call_rcu = true;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c

index e20b87b..db55af9 100644 (file)
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -843,7 +843,6 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
         }
  
         release_sock(sk);
-       sk_defer_free_flush(sk);
  
         if (spliced)
                 return spliced;
@@ -1589,20 +1588,6 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
                 tcp_send_ack(sk);
  }
  
-void __sk_defer_free_flush(struct sock *sk)
-{
-       struct llist_node *head;
-       struct sk_buff *skb, *n;
-
-       head = llist_del_all(&sk->defer_list);
-       llist_for_each_entry_safe(skb, n, head, ll_node) {
-               prefetch(n);
-               skb_mark_not_on_list(skb);
-               __kfree_skb(skb);
-       }
-}
-EXPORT_SYMBOL(__sk_defer_free_flush);
-
  static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
  {
         __skb_unlink(skb, &sk->sk_receive_queue);
@@ -1610,11 +1595,7 @@ static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
                 sock_rfree(skb);
                 skb->destructor = NULL;
                 skb->sk = NULL;
-               if (!skb_queue_empty(&sk->sk_receive_queue) ||
-                   !llist_empty(&sk->defer_list)) {
-                       llist_add(&skb->ll_node, &sk->defer_list);
-                       return;
-               }
+               return skb_attempt_defer_free(skb);
         }
         __kfree_skb(skb);
  }
@@ -2453,7 +2434,6 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
                         __sk_flush_backlog(sk);
                 } else {
                         tcp_cleanup_rbuf(sk, copied);
-                       sk_defer_free_flush(sk);
                         sk_wait_data(sk, &timeo, last);
                 }
  
@@ -2571,7 +2551,6 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
         lock_sock(sk);
         ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
         release_sock(sk);
-       sk_defer_free_flush(sk);
  
         if (cmsg_flags && ret >= 0) {
                 if (cmsg_flags & TCP_CMSG_TS)
@@ -3096,7 +3075,6 @@ int tcp_disconnect(struct sock *sk, int flags)
                 sk->sk_frag.page = NULL;
                 sk->sk_frag.offset = 0;
         }
-       sk_defer_free_flush(sk);
         sk_error_report(sk);
         return 0;
  }
@@ -4225,7 +4203,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
                 err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
                                                           &zc, &len, err);
                 release_sock(sk);
-               sk_defer_free_flush(sk);
                 if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
                         goto zerocopy_rcv_cmsg;
                 switch (len) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c

index 2c2d421..918816e 100644 (file)
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2065,7 +2065,6 @@ process:
  
         sk_incoming_cpu_update(sk);
  
-       sk_defer_free_flush(sk);
         bh_lock_sock_nested(sk);
         tcp_segs_in(tcp_sk(sk), skb);
         ret = 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c

index 54277de..60bdec2 100644 (file)
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1728,7 +1728,6 @@ process:
  
         sk_incoming_cpu_update(sk);
  
-       sk_defer_free_flush(sk);
         bh_lock_sock_nested(sk);
         tcp_segs_in(tcp_sk(sk), skb);
         ret = 0;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c

index ddbe05e..bc54f6c 100644 (file)
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1911,7 +1911,6 @@ recv_end:
  
  end:
         release_sock(sk);
-       sk_defer_free_flush(sk);
         if (psock)
                 sk_psock_put(sk, psock);
         return copied ? : err;
@@ -1983,7 +1982,6 @@ ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
  
  splice_read_end:
         release_sock(sk);
-       sk_defer_free_flush(sk);
         return copied ? : err;
  }
author	Eric Dumazet <edumazet@google.com>
	Fri, 22 Apr 2022 20:12:37 +0000 (13:12 -0700)
committer	Jakub Kicinski <kuba@kernel.org>
	Wed, 27 Apr 2022 00:05:59 +0000 (17:05 -0700)
include/linux/netdevice.h		patch \| blob \| history
include/linux/skbuff.h		patch \| blob \| history
include/net/sock.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/core/dev.c		patch \| blob \| history
net/core/skbuff.c		patch \| blob \| history
net/core/sock.c		patch \| blob \| history
net/ipv4/tcp.c		patch \| blob \| history
net/ipv4/tcp_ipv4.c		patch \| blob \| history
net/ipv6/tcp_ipv6.c		patch \| blob \| history
net/tls/tls_sw.c		patch \| blob \| history