rcu: Parallelize expedited grace-period initialization

author Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Fri, 2 Feb 2018 06:05:38 +0000 (22:05 -0800)

committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tue, 15 May 2018 17:25:44 +0000 (10:25 -0700)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h

index 7a693e3..976019d 100644 (file)
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -486,6 +486,7 @@ void rcu_force_quiescent_state(void);
  void rcu_bh_force_quiescent_state(void);
  void rcu_sched_force_quiescent_state(void);
  extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
  #endif /* #else #ifdef CONFIG_TINY_RCU */
  
  #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c

index 2a73469..23781fc 100644 (file)
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4168,6 +4168,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
  }
  
  struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
  
  void __init rcu_init(void)
  {
@@ -4199,6 +4200,8 @@ void __init rcu_init(void)
         /* Create workqueue for expedited GPs and for Tree SRCU. */
         rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
         WARN_ON(!rcu_gp_wq);
+       rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+       WARN_ON(!rcu_par_gp_wq);
  }
  
  #include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h

index f491ab4..98d3390 100644 (file)
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
  #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
  };
  
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+       smp_call_func_t rew_func;
+       struct rcu_state *rew_rsp;
+       unsigned long rew_s;
+       struct work_struct rew_work;
+};
+
  /* RCU's kthread states for tracing. */
  #define RCU_KTHREAD_STOPPED  0
  #define RCU_KTHREAD_RUNNING  1
@@ -157,6 +165,8 @@ struct rcu_node {
         spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
         unsigned long exp_seq_rq;
         wait_queue_head_t exp_wq[4];
+       struct rcu_exp_work rew;
+       bool exp_need_flush;    /* Need to flush workitem? */
  } ____cacheline_internodealigned_in_smp;
  
  /*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h

index f72eefa..73e1d3d 100644 (file)
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -362,93 +362,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
  }
  
  /*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
   */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-                                    smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
  {
         int cpu;
         unsigned long flags;
+       smp_call_func_t func;
         unsigned long mask_ofl_test;
         unsigned long mask_ofl_ipi;
         int ret;
-       struct rcu_node *rnp;
+       struct rcu_exp_work *rewp =
+               container_of(wp, struct rcu_exp_work, rew_work);
+       struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+       struct rcu_state *rsp = rewp->rew_rsp;
  
-       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
-       sync_exp_reset_tree(rsp);
-       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
-       rcu_for_each_leaf_node(rsp, rnp) {
-               raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
-               /* Each pass checks a CPU for identity, offline, and idle. */
-               mask_ofl_test = 0;
-               for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-                       unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-                       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-                       struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
-                       int snap;
+       func = rewp->rew_func;
+       raw_spin_lock_irqsave_rcu_node(rnp, flags);
  
-                       if (raw_smp_processor_id() == cpu ||
-                           !(rnp->qsmaskinitnext & mask)) {
+       /* Each pass checks a CPU for identity, offline, and idle. */
+       mask_ofl_test = 0;
+       for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+               unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+               struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+               struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+               int snap;
+
+               if (raw_smp_processor_id() == cpu ||
+                   !(rnp->qsmaskinitnext & mask)) {
+                       mask_ofl_test |= mask;
+               } else {
+                       snap = rcu_dynticks_snap(rdtp);
+                       if (rcu_dynticks_in_eqs(snap))
                                 mask_ofl_test |= mask;
-                       } else {
-                               snap = rcu_dynticks_snap(rdtp);
-                               if (rcu_dynticks_in_eqs(snap))
-                                       mask_ofl_test |= mask;
-                               else
-                                       rdp->exp_dynticks_snap = snap;
-                       }
+                       else
+                               rdp->exp_dynticks_snap = snap;
                 }
-               mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-               /*
-                * Need to wait for any blocked tasks as well.  Note that
-                * additional blocking tasks will also block the expedited
-                * GP until such time as the ->expmask bits are cleared.
-                */
-               if (rcu_preempt_has_tasks(rnp))
-                       rnp->exp_tasks = rnp->blkd_tasks.next;
-               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+       }
+       mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
  
-               /* IPI the remaining CPUs for expedited quiescent state. */
-               for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-                       unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-                       struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+       /*
+        * Need to wait for any blocked tasks as well.  Note that
+        * additional blocking tasks will also block the expedited GP
+        * until such time as the ->expmask bits are cleared.
+        */
+       if (rcu_preempt_has_tasks(rnp))
+               rnp->exp_tasks = rnp->blkd_tasks.next;
+       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+       /* IPI the remaining CPUs for expedited quiescent state. */
+       for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+               unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+               struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
  
-                       if (!(mask_ofl_ipi & mask))
-                               continue;
+               if (!(mask_ofl_ipi & mask))
+                       continue;
  retry_ipi:
-                       if (rcu_dynticks_in_eqs_since(rdp->dynticks,
-                                                     rdp->exp_dynticks_snap)) {
-                               mask_ofl_test |= mask;
-                               continue;
-                       }
-                       ret = smp_call_function_single(cpu, func, rsp, 0);
-                       if (!ret) {
-                               mask_ofl_ipi &= ~mask;
-                               continue;
-                       }
-                       /* Failed, raced with CPU hotplug operation. */
-                       raw_spin_lock_irqsave_rcu_node(rnp, flags);
-                       if ((rnp->qsmaskinitnext & mask) &&
-                           (rnp->expmask & mask)) {
-                               /* Online, so delay for a bit and try again. */
-                               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-                               trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
-                               schedule_timeout_uninterruptible(1);
-                               goto retry_ipi;
-                       }
-                       /* CPU really is offline, so we can ignore it. */
-                       if (!(rnp->expmask & mask))
-                               mask_ofl_ipi &= ~mask;
+               if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+                                             rdp->exp_dynticks_snap)) {
+                       mask_ofl_test |= mask;
+                       continue;
+               }
+               ret = smp_call_function_single(cpu, func, rsp, 0);
+               if (!ret) {
+                       mask_ofl_ipi &= ~mask;
+                       continue;
+               }
+               /* Failed, raced with CPU hotplug operation. */
+               raw_spin_lock_irqsave_rcu_node(rnp, flags);
+               if ((rnp->qsmaskinitnext & mask) &&
+                   (rnp->expmask & mask)) {
+                       /* Online, so delay for a bit and try again. */
                         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+                       schedule_timeout_uninterruptible(1);
+                       goto retry_ipi;
+               }
+               /* CPU really is offline, so we can ignore it. */
+               if (!(rnp->expmask & mask))
+                       mask_ofl_ipi &= ~mask;
+               raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+       }
+       /* Report quiescent states for those that went offline. */
+       mask_ofl_test |= mask_ofl_ipi;
+       if (mask_ofl_test)
+               rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+                                    smp_call_func_t func)
+{
+       struct rcu_node *rnp;
+
+       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+       sync_exp_reset_tree(rsp);
+       trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+       /* Schedule work for each leaf rcu_node structure. */
+       rcu_for_each_leaf_node(rsp, rnp) {
+               rnp->exp_need_flush = false;
+               if (!READ_ONCE(rnp->expmask))
+                       continue; /* Avoid early boot non-existent wq. */
+               rnp->rew.rew_func = func;
+               rnp->rew.rew_rsp = rsp;
+               if (!READ_ONCE(rcu_par_gp_wq) ||
+                   rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+                       /* No workqueues yet. */
+                       sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+                       continue;
                 }
-               /* Report quiescent states for those that went offline. */
-               mask_ofl_test |= mask_ofl_ipi;
-               if (mask_ofl_test)
-                       rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+               INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+               queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+               rnp->exp_need_flush = true;
         }
+
+       /* Wait for workqueue jobs (if any) to complete. */
+       rcu_for_each_leaf_node(rsp, rnp)
+               if (rnp->exp_need_flush)
+                       flush_work(&rnp->rew.rew_work);
  }
  
  static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -560,14 +596,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
         mutex_unlock(&rsp->exp_wake_mutex);
  }
  
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
-       smp_call_func_t rew_func;
-       struct rcu_state *rew_rsp;
-       unsigned long rew_s;
-       struct work_struct rew_work;
-};
-
  /*
   * Common code to drive an expedited grace period forward, used by
   * workqueues and mid-boot-time tasks.
author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>
	Fri, 2 Feb 2018 06:05:38 +0000 (22:05 -0800)
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>
	Tue, 15 May 2018 17:25:44 +0000 (10:25 -0700)
kernel/rcu/rcu.h		patch \| blob \| history
kernel/rcu/tree.c		patch \| blob \| history
kernel/rcu/tree.h		patch \| blob \| history
kernel/rcu/tree_exp.h		patch \| blob \| history