Merge branch 'net-skb-defer-freeing-polish'
author David S. Miller <davem@davemloft.net>
Mon, 16 May 2022 10:33:59 +0000 (11:33 +0100)
committer David S. Miller <davem@davemloft.net>
Mon, 16 May 2022 10:33:59 +0000 (11:33 +0100)
Eric Dumazet says:

====================
net: polish skb defer freeing

While testing this recently added feature on a variety
of platforms/configurations, I found the following issues:

1) A race leading to concurrent calls to smp_call_function_single_async()

2) Missed opportunity to use napi_consume_skb()

3) Need to limit the max length of the per-cpu lists.

4) Process the per-cpu list more frequently, for the
   (unusual) case where net_rx_action() has multiple
   napi_poll() to process per round.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/admin-guide/sysctl/net.rst
include/linux/netdevice.h
net/core/dev.c
net/core/dev.h
net/core/skbuff.c
net/core/sysctl_net_core.c

index f86b5e1..fa4dcdb 100644 (file)
@@ -322,6 +322,14 @@ a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
 Default value is 10, minimum 1, maximum 3600.
 
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by TCP stack so far.
+
+Default: 64
+
 optmem_max
 ----------
 
index d57ce24..cbaf312 100644 (file)
@@ -3136,6 +3136,7 @@ struct softnet_data {
        /* Another possibly contended cache line */
        spinlock_t              defer_lock ____cacheline_aligned_in_smp;
        int                     defer_count;
+       int                     defer_ipi_scheduled;
        struct sk_buff          *defer_list;
        call_single_data_t      defer_csd;
 };
index d93456c..04fd056 100644 (file)
@@ -4330,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int netdev_budget __read_mostly = 300;
 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
@@ -4582,9 +4583,12 @@ static void rps_trigger_softirq(void *data)
 #endif /* CONFIG_RPS */
 
 /* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data __always_unused)
+static void trigger_rx_softirq(void *data)
 {
+       struct softnet_data *sd = data;
+
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+       smp_store_release(&sd->defer_ipi_scheduled, 0);
 }
 
 /*
@@ -6630,7 +6634,7 @@ static void skb_defer_free_flush(struct softnet_data *sd)
 
        while (skb != NULL) {
                next = skb->next;
-               __kfree_skb(skb);
+               napi_consume_skb(skb, 1);
                skb = next;
        }
 }
@@ -6651,6 +6655,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
        for (;;) {
                struct napi_struct *n;
 
+               skb_defer_free_flush(sd);
+
                if (list_empty(&list)) {
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                                goto end;
@@ -6680,8 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
        net_rps_action_and_irq_enable(sd);
-end:
-       skb_defer_free_flush(sd);
+end:;
 }
 
 struct netdev_adjacent {
@@ -11382,7 +11387,7 @@ static int __init net_dev_init(void)
                INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
                sd->cpu = i;
 #endif
-               INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+               INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
                spin_lock_init(&sd->defer_lock);
 
                init_gro_hash(&sd->backlog);
index 328b37a..cbb8a92 100644 (file)
@@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev);
 /* sysctls not referred to from outside net/core/ */
 extern int             netdev_budget;
 extern unsigned int    netdev_budget_usecs;
-
+extern unsigned int    sysctl_skb_defer_max;
 extern int             netdev_tstamp_prequeue;
 extern int             netdev_unregister_timeout_secs;
 extern int             weight_p;
index 2fea964..1d10bb4 100644 (file)
@@ -80,6 +80,7 @@
 #include <linux/user_namespace.h>
 #include <linux/indirect_call_wrapper.h>
 
+#include "dev.h"
 #include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -6496,16 +6497,21 @@ void skb_attempt_defer_free(struct sk_buff *skb)
        int cpu = skb->alloc_cpu;
        struct softnet_data *sd;
        unsigned long flags;
+       unsigned int defer_max;
        bool kick;
 
        if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
            !cpu_online(cpu) ||
            cpu == raw_smp_processor_id()) {
-               __kfree_skb(skb);
+nodefer:       __kfree_skb(skb);
                return;
        }
 
        sd = &per_cpu(softnet_data, cpu);
+       defer_max = READ_ONCE(sysctl_skb_defer_max);
+       if (READ_ONCE(sd->defer_count) >= defer_max)
+               goto nodefer;
+
        /* We do not send an IPI or any signal.
         * Remote cpu will eventually call skb_defer_free_flush()
         */
@@ -6515,18 +6521,14 @@ void skb_attempt_defer_free(struct sk_buff *skb)
        WRITE_ONCE(sd->defer_list, skb);
        sd->defer_count++;
 
-       /* kick every time queue length reaches 128.
-        * This should avoid blocking in smp_call_function_single_async().
-        * This condition should hardly be bit under normal conditions,
-        * unless cpu suddenly stopped to receive NIC interrupts.
-        */
-       kick = sd->defer_count == 128;
+       /* Send an IPI every time queue reaches half capacity. */
+       kick = sd->defer_count == (defer_max >> 1);
 
        spin_unlock_irqrestore(&sd->defer_lock, flags);
 
        /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
         * if we are unlucky enough (this seems very unlikely).
         */
-       if (unlikely(kick))
+       if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
                smp_call_function_single_async(cpu, &sd->defer_csd);
 }
index 195ca5c..ca8d383 100644 (file)
@@ -578,6 +578,14 @@ static struct ctl_table net_core_table[] = {
                .extra1         = SYSCTL_ONE,
                .extra2         = &int_3600,
        },
+       {
+               .procname       = "skb_defer_max",
+               .data           = &sysctl_skb_defer_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+       },
        { }
 };