rcu: Move expedited grace period (GP) work to RT kthread_worker
author Kalesh Singh <kaleshsingh@google.com>
Sat, 9 Apr 2022 00:35:27 +0000 (17:35 -0700)
committer Paul E. McKenney <paulmck@kernel.org>
Wed, 11 May 2022 18:47:10 +0000 (11:47 -0700)
Enabling CONFIG_RCU_BOOST did not reduce RCU expedited grace-period
latency because its workqueues run at SCHED_OTHER, and thus can be
delayed by normal processes.  This commit avoids these delays by moving
the expedited GP work items to a real-time-priority kthread_worker.
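
For reference, the core pattern this commit adds to kernel/rcu/tree.c (see
rcu_start_exp_gp_kworkers() in the diff below) is roughly the following
sketch; the worker name, priority value, and error handling here are
illustrative only, the patch itself uses kthread_prio and creates two workers:

  #include <linux/err.h>
  #include <linux/kthread.h>
  #include <linux/sched.h>
  #include <uapi/linux/sched/types.h>

  static struct kthread_worker *example_exp_kworker;

  /*
   * Create a kthread_worker and promote its task to SCHED_FIFO so that
   * work queued on it cannot be starved by SCHED_OTHER tasks.
   */
  static int __init example_start_exp_kworker(void)
  {
          struct sched_param param = { .sched_priority = 1 };

          example_exp_kworker = kthread_create_worker(0, "example_exp_kworker");
          if (IS_ERR_OR_NULL(example_exp_kworker))
                  return -ENOMEM;  /* simplified error path */
          sched_setscheduler_nocheck(example_exp_kworker->task, SCHED_FIFO,
                                     &param);
          return 0;
  }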

This option is controlled by CONFIG_RCU_EXP_KTHREAD and is disabled by
default on PREEMPT_RT=y kernels, which disable expedited grace periods
after boot by unconditionally setting rcupdate.rcu_normal_after_boot=1.
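
As a rough illustration of the resulting configuration surface (nothing
beyond what is described above):

  # .config fragment; RCU_EXP_KTHREAD depends on RCU_BOOST=y and RCU_EXPERT=y
  CONFIG_RCU_EXP_KTHREAD=y

  # PREEMPT_RT=y kernels instead behave as if always booted with:
  rcupdate.rcu_normal_after_boot=1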

The results were collected on arm64 Android devices (6 GB RAM) running a
5.10 kernel, capturing trace data in critical user-level code.

The table below shows the resulting order-of-magnitude improvements in
synchronize_rcu_expedited() latency (the Diff column is the percent change
from the workqueue figure to the kthread_worker figure):

------------------------------------------------------------------------
|                          |   workqueues  |  kthread_worker |  Diff   |
------------------------------------------------------------------------
| Count                    |          725  |            688  |         |
------------------------------------------------------------------------
| Min Duration       (ns)  |          326  |            447  |  37.12% |
------------------------------------------------------------------------
| Q1                 (ns)  |       39,428  |         38,971  |  -1.16% |
------------------------------------------------------------------------
| Q2 - Median        (ns)  |       98,225  |         69,743  | -29.00% |
------------------------------------------------------------------------
| Q3                 (ns)  |      342,122  |        126,638  | -62.98% |
------------------------------------------------------------------------
| Max Duration       (ns)  |  372,766,967  |      2,329,671  | -99.38% |
------------------------------------------------------------------------
| Avg Duration       (ns)  |    2,746,353  |        151,242  | -94.49% |
------------------------------------------------------------------------
| Standard Deviation (ns)  |   19,327,765  |        294,408  |         |
------------------------------------------------------------------------

The table below shows the range of maximums/minimums for
synchronize_rcu_expedited() latency across all experiments:

------------------------------------------------------------------------
|                          |   workqueues  |  kthread_worker |  Diff   |
------------------------------------------------------------------------
| Total No. of Experiments |           25  |             23  |         |
------------------------------------------------------------------------
| Largest  Maximum   (ns)  |  372,766,967  |      2,329,671  | -99.38% |
------------------------------------------------------------------------
| Smallest Maximum   (ns)  |       38,819  |         86,954  | 124.00% |
------------------------------------------------------------------------
| Range of Maximums  (ns)  |  372,728,148  |      2,242,717  |         |
------------------------------------------------------------------------
| Largest  Minimum   (ns)  |       88,623  |         27,588  | -68.87% |
------------------------------------------------------------------------
| Smallest Minimum   (ns)  |          326  |            447  |  37.12% |
------------------------------------------------------------------------
| Range of Minimums  (ns)  |       88,297  |         27,141  |         |
------------------------------------------------------------------------

Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Reported-by: Tim Murray <timmurray@google.com>
Reported-by: Wei Wang <wvw@google.com>
Tested-by: Kyle Lin <kylelin@google.com>
Tested-by: Chunwei Lu <chunweilu@google.com>
Tested-by: Lulu Wang <luluw@google.com>
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
kernel/rcu/Kconfig
kernel/rcu/rcu.h
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_exp.h

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index bf8e341..fd64a75 100644
@@ -195,6 +195,20 @@ config RCU_BOOST_DELAY
 
          Accept the default if unsure.
 
+config RCU_EXP_KTHREAD
+       bool "Perform RCU expedited work in a real-time kthread"
+       depends on RCU_BOOST && RCU_EXPERT
+       default !PREEMPT_RT && NR_CPUS <= 32
+       help
+         Use this option to further reduce the latencies of expedited
+         grace periods at the expense of being more disruptive.
+
+         This option is disabled by default on PREEMPT_RT=y kernels which
+         disable expedited grace periods after boot by unconditionally
+         setting rcupdate.rcu_normal_after_boot=1.
+
+         Accept the default if unsure.
+
 config RCU_NOCB_CPU
        bool "Offload RCU callback processing from boot-selected CPUs"
        depends on TREE_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 20f0300..e27bf7d 100644
@@ -536,7 +536,12 @@ int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
+#else /* !CONFIG_RCU_EXP_KTHREAD */
 extern struct workqueue_struct *rcu_par_gp_wq;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189..763e45f 100644
@@ -4471,6 +4471,51 @@ static int rcu_pm_notify(struct notifier_block *self,
        return NOTIFY_OK;
 }
 
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+       const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+       const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+       struct sched_param param = { .sched_priority = kthread_prio };
+
+       rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+       if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+               pr_err("Failed to create %s!\n", gp_kworker_name);
+               return;
+       }
+
+       rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+       if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+               pr_err("Failed to create %s!\n", par_gp_kworker_name);
+               kthread_destroy_worker(rcu_exp_gp_kworker);
+               return;
+       }
+
+       sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+       sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+                                  &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+       rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+       WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
 /*
  * Spawn the kthreads that handle RCU's grace periods.
  */
@@ -4500,6 +4545,8 @@ static int __init rcu_spawn_gp_kthread(void)
        rcu_spawn_nocb_kthreads();
        rcu_spawn_boost_kthreads();
        rcu_spawn_core_kthreads();
+       /* Create kthread worker for expedited GPs */
+       rcu_start_exp_gp_kworkers();
        return 0;
 }
 early_initcall(rcu_spawn_gp_kthread);
@@ -4745,7 +4792,6 @@ static void __init rcu_dump_rcu_node_tree(void)
 }
 
 struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
 
 static void __init kfree_rcu_batch_init(void)
 {
@@ -4811,8 +4857,7 @@ void __init rcu_init(void)
        /* Create workqueue for Tree SRCU and for expedited GPs. */
        rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_gp_wq);
-       rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-       WARN_ON(!rcu_par_gp_wq);
+       rcu_alloc_par_gp_wq();
 
        /* Fill in default value for rcutree.qovld boot parameter. */
        /* -After- the rcu_node ->lock fields are initialized! */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673e..b577cdf 100644
@@ -10,6 +10,7 @@
  */
 
 #include <linux/cache.h>
+#include <linux/kthread.h>
 #include <linux/spinlock.h>
 #include <linux/rtmutex.h>
 #include <linux/threads.h>
 /* Communicate arguments to a workqueue handler. */
 struct rcu_exp_work {
        unsigned long rew_s;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+       struct kthread_work rew_work;
+#else
        struct work_struct rew_work;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
 };
 
 /* RCU's kthread states for tracing. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index b1f52b5..0f70f62 100644
@@ -334,15 +334,13 @@ fastpath:
  * Select the CPUs within the specified rcu_node that the upcoming
  * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
 {
        int cpu;
        unsigned long flags;
        unsigned long mask_ofl_test;
        unsigned long mask_ofl_ipi;
        int ret;
-       struct rcu_exp_work *rewp =
-               container_of(wp, struct rcu_exp_work, rew_work);
        struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
 
        raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -417,13 +415,119 @@ retry_ipi:
                rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
 }
 
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+#ifdef CONFIG_RCU_EXP_KTHREAD
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+       struct rcu_exp_work *rewp =
+               container_of(wp, struct rcu_exp_work, rew_work);
+
+       __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+       return !!READ_ONCE(rcu_exp_par_gp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+       kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+       /*
+        * Use rcu_exp_par_gp_kworker, because flushing a work item from
+        * another work item on the same kthread worker can result in
+        * deadlock.
+        */
+       kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+       kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+       struct rcu_exp_work *rewp;
+
+       rewp = container_of(wp, struct rcu_exp_work, rew_work);
+       rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+       kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+       kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+{
+       struct rcu_exp_work *rewp =
+               container_of(wp, struct rcu_exp_work, rew_work);
+
+       __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+       return !!READ_ONCE(rcu_par_gp_wq);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+       int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
+
+       INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+       /* If all offline, queue the work on an unbound CPU. */
+       if (unlikely(cpu > rnp->grphi - rnp->grplo))
+               cpu = WORK_CPU_UNBOUND;
+       else
+               cpu += rnp->grplo;
+       queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+       flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+       struct rcu_exp_work *rewp;
+
+       rewp = container_of(wp, struct rcu_exp_work, rew_work);
+       rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+       INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
+       queue_work(rcu_gp_wq, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+       destroy_work_on_stack(&rew->rew_work);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
 /*
  * Select the nodes that the upcoming expedited grace period needs
  * to wait for.
  */
 static void sync_rcu_exp_select_cpus(void)
 {
-       int cpu;
        struct rcu_node *rnp;
 
        trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -435,28 +539,21 @@ static void sync_rcu_exp_select_cpus(void)
                rnp->exp_need_flush = false;
                if (!READ_ONCE(rnp->expmask))
                        continue; /* Avoid early boot non-existent wq. */
-               if (!READ_ONCE(rcu_par_gp_wq) ||
+               if (!rcu_gp_par_worker_started() ||
                    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
                    rcu_is_last_leaf_node(rnp)) {
-                       /* No workqueues yet or last leaf, do direct call. */
+                       /* No worker started yet or last leaf, do direct call. */
                        sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
                        continue;
                }
-               INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
-               cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-               /* If all offline, queue the work on an unbound CPU. */
-               if (unlikely(cpu > rnp->grphi - rnp->grplo))
-                       cpu = WORK_CPU_UNBOUND;
-               else
-                       cpu += rnp->grplo;
-               queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+               sync_rcu_exp_select_cpus_queue_work(rnp);
                rnp->exp_need_flush = true;
        }
 
-       /* Wait for workqueue jobs (if any) to complete. */
+       /* Wait for jobs (if any) to complete. */
        rcu_for_each_leaf_node(rnp)
                if (rnp->exp_need_flush)
-                       flush_work(&rnp->rew.rew_work);
+                       sync_rcu_exp_select_cpus_flush_work(rnp);
 }
 
 /*
@@ -622,17 +719,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
        rcu_exp_wait_wake(s);
 }
 
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
-       struct rcu_exp_work *rewp;
-
-       rewp = container_of(wp, struct rcu_exp_work, rew_work);
-       rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
 #ifdef CONFIG_PREEMPT_RCU
 
 /*
@@ -848,20 +934,19 @@ void synchronize_rcu_expedited(void)
        } else {
                /* Marshall arguments & schedule the expedited grace period. */
                rew.rew_s = s;
-               INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-               queue_work(rcu_gp_wq, &rew.rew_work);
+               synchronize_rcu_expedited_queue_work(&rew);
        }
 
        /* Wait for expedited grace period to complete. */
        rnp = rcu_get_root();
        wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
                   sync_exp_work_done(s));
-       smp_mb(); /* Workqueue actions happen before return. */
+       smp_mb(); /* Work actions happen before return. */
 
        /* Let the next expedited grace period start. */
        mutex_unlock(&rcu_state.exp_mutex);
 
        if (likely(!boottime))
-               destroy_work_on_stack(&rew.rew_work);
+               synchronize_rcu_expedited_destroy_work(&rew);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);