Merge branches 'doc.2022.06.21a', 'fixes.2022.07.19a', 'nocb.2022.07.19a', 'poll.2022.07.21a', 'rcu-tasks.2022.06.21a' and 'torture.2022.06.21a'
author     Paul E. McKenney <paulmck@kernel.org>
           Fri, 22 Jul 2022 00:43:16 +0000 (17:43 -0700)
committer  Paul E. McKenney <paulmck@kernel.org>
           Fri, 22 Jul 2022 00:43:16 +0000 (17:43 -0700)
doc.2022.06.21a: Documentation updates.
fixes.2022.07.19a: Miscellaneous fixes.
nocb.2022.07.19a: Callback-offload updates.
poll.2022.07.21a: Polled grace-period updates.
rcu-tasks.2022.06.21a: Tasks RCU updates.
torture.2022.06.21a: Torture-test updates.

24 files changed:
Documentation/admin-guide/kernel-parameters.txt
include/linux/rcupdate.h
include/linux/rcupdate_trace.h
include/linux/rcutiny.h
include/linux/sched.h
init/init_task.c
kernel/fork.c
kernel/rcu/Kconfig
kernel/rcu/Kconfig.debug
kernel/rcu/rcuscale.c
kernel/rcu/rcutorture.c
kernel/rcu/refscale.c
kernel/rcu/srcutree.c
kernel/rcu/tasks.h
kernel/rcu/tiny.c
kernel/rcu/tree.c
kernel/rcu/tree.h
kernel/rcu/tree_nocb.h
kernel/rcu/tree_plugin.h
kernel/sched/core.c
kernel/smp.c
tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
tools/testing/selftests/rcutorture/bin/kvm-remote.sh
tools/testing/selftests/rcutorture/bin/kvm.sh

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2522b11..4cd3ca5 100644
                        just as if they had also been called out in the
                        rcu_nocbs= boot parameter.
 
+                       Note that this argument takes precedence over
+                       the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
+
        noiotrap        [SH] Disables trapped I/O port accesses.
 
        noirqdebug      [X86-32] Disables the code which attempts to detect and
                        no-callback mode from boot but the mode may be
                        toggled at runtime via cpusets.
 
+                       Note that this argument takes precedence over
+                       the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
+
        rcu_nocb_poll   [KNL]
                        Rather than requiring that offloaded CPUs
                        (specified by rcu_nocbs= above) explicitly
                        When RCU_NOCB_CPU is set, also adjust the
                        priority of NOCB callback kthreads.
 
+       rcutree.rcu_divisor= [KNL]
+                       Set the shift-right count to use to compute
+                       the callback-invocation batch limit bl from
+                       the number of callbacks queued on this CPU.
+                       The result will be bounded below by the value of
+                       the rcutree.blimit kernel parameter.  Every bl
+                       callbacks, the softirq handler will exit in
+                       order to allow the CPU to do other work.
+
+                       Please note that this callback-invocation batch
+                       limit applies only to non-offloaded callback
+                       invocation.  Offloaded callbacks are instead
+                       invoked in the context of an rcuoc kthread,
+                       which the scheduler will preempt as it does
+                       any other task.
+
+       rcutree.nocb_nobypass_lim_per_jiffy= [KNL]
+                       On callback-offloaded (rcu_nocbs) CPUs,
+                       RCU reduces the lock contention that would
+                       otherwise be caused by callback floods through
+                       use of the ->nocb_bypass list.  However, in the
+                       common non-flooded case, RCU queues directly to
+                       the main ->cblist in order to avoid the extra
+                       overhead of the ->nocb_bypass list and its lock.
+                       But if there are too many callbacks queued during
+                       a single jiffy, RCU pre-queues the callbacks into
+                       the ->nocb_bypass queue.  The definition of "too
+                       many" is supplied by this kernel boot parameter.
+
        rcutree.rcu_nocb_gp_stride= [KNL]
                        Set the number of NOCB callback kthreads in
                        each group, which defaults to the square root
                        expediting.  Set to zero to disable automatic
                        expediting.
 
+       srcutree.srcu_max_nodelay [KNL]
+                       Specifies the number of no-delay instances
+                       per jiffy for which the SRCU grace-period
+                       worker thread will be rescheduled with zero
+                       delay.  Beyond this limit, the worker thread
+                       will be rescheduled with a sleep delay of
+                       one jiffy.
+
+       srcutree.srcu_max_nodelay_phase [KNL]
+                       Specifies the number of non-sleeping polls
+                       of readers allowed per grace-period phase.
+                       Beyond this limit, the grace-period worker
+                       thread will be rescheduled with a sleep delay
+                       of one jiffy between each subsequent rescan
+                       of the readers within that phase.
+
+       srcutree.srcu_retry_check_delay [KNL]
+                       Specifies the number of microseconds of
+                       non-sleeping delay between consecutive
+                       non-sleeping polls of readers.
+
        srcutree.small_contention_lim [KNL]
                        Specifies the number of update-side contention
                        events per jiffy will be tolerated before
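
For illustration, the batch limit described under rcutree.rcu_divisor above works out to bl = max(rcutree.blimit, n_cbs >> rcutree.rcu_divisor).  Here is a small standalone sketch of that arithmetic; the example values 10000, 10, and 7 are assumptions chosen for the example, not taken from this patch.

    /*
     * Standalone sketch of the rcutree.rcu_divisor batch-limit arithmetic:
     * bl = max(blimit, n_cbs >> rcu_divisor).  Example values only.
     */
    #include <stdio.h>

    static long batch_limit(long n_cbs, long blimit, int rcu_divisor)
    {
            long bl = n_cbs >> rcu_divisor;

            return bl > blimit ? bl : blimit;
    }

    int main(void)
    {
            /* 10000 queued callbacks, blimit 10, divisor 7: 10000 >> 7 = 78. */
            printf("bl = %ld\n", batch_limit(10000, 10, 7));
            return 0;
    }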
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 7f12daa..937a58b 100644
@@ -170,13 +170,24 @@ void synchronize_rcu_tasks(void);
 # endif
 
 # ifdef CONFIG_TASKS_TRACE_RCU
-# define rcu_tasks_trace_qs(t)                                         \
-       do {                                                            \
-               if (!likely(READ_ONCE((t)->trc_reader_checked)) &&      \
-                   !unlikely(READ_ONCE((t)->trc_reader_nesting))) {    \
-                       smp_store_release(&(t)->trc_reader_checked, true); \
-                       smp_mb(); /* Readers partitioned by store. */   \
-               }                                                       \
+// Bits for ->trc_reader_special.b.need_qs field.
+#define TRC_NEED_QS            0x1  // Task needs a quiescent state.
+#define TRC_NEED_QS_CHECKED    0x2  // Task has been checked for needing quiescent state.
+
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
+void rcu_tasks_trace_qs_blkd(struct task_struct *t);
+
+# define rcu_tasks_trace_qs(t)                                                 \
+       do {                                                                    \
+               int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);       \
+                                                                               \
+               if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) &&    \
+                   likely(!___rttq_nesting)) {                                 \
+                       rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED);   \
+               } else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&     \
+                          !READ_ONCE((t)->trc_reader_special.b.blocked)) {     \
+                       rcu_tasks_trace_qs_blkd(t);                             \
+               }                                                               \
        } while (0)
 # else
 # define rcu_tasks_trace_qs(t) do { } while (0)
@@ -185,7 +196,7 @@ void synchronize_rcu_tasks(void);
 #define rcu_tasks_qs(t, preempt)                                       \
 do {                                                                   \
        rcu_tasks_classic_qs((t), (preempt));                           \
-       rcu_tasks_trace_qs((t));                                        \
+       rcu_tasks_trace_qs(t);                                          \
 } while (0)
 
 # ifdef CONFIG_TASKS_RUDE_RCU
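
For context, the TRC_NEED_QS/TRC_NEED_QS_CHECKED bookkeeping above tracks tasks inside Tasks Trace RCU read-side critical sections.  The following hedged reader-side sketch shows the kind of critical section involved; struct my_datum, my_global, and my_read_value() are hypothetical names, while rcu_read_lock_trace(), rcu_read_unlock_trace(), and rcu_read_lock_trace_held() are the existing reader API.

    #include <linux/errno.h>
    #include <linux/rcupdate.h>
    #include <linux/rcupdate_trace.h>

    struct my_datum {
            int value;
    };

    static struct my_datum __rcu *my_global;

    static int my_read_value(void)
    {
            struct my_datum *p;
            int ret = -ENOENT;

            rcu_read_lock_trace();  /* Reader may be preempted or block. */
            p = rcu_dereference_check(my_global, rcu_read_lock_trace_held());
            if (p)
                    ret = p->value;
            rcu_read_unlock_trace();        /* May report a quiescent state. */
            return ret;
    }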
diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 6f9c358..9bc8cbb 100644
@@ -75,7 +75,7 @@ static inline void rcu_read_unlock_trace(void)
        nesting = READ_ONCE(t->trc_reader_nesting) - 1;
        barrier(); // Critical section before disabling.
        // Disable IPI-based setting of .need_qs.
-       WRITE_ONCE(t->trc_reader_nesting, INT_MIN);
+       WRITE_ONCE(t->trc_reader_nesting, INT_MIN + nesting);
        if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) {
                WRITE_ONCE(t->trc_reader_nesting, nesting);
                return;  // We assume shallow reader nesting.
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ab7e20d..e6bb31a 100644
@@ -48,7 +48,7 @@ static inline void synchronize_rcu_expedited(void)
  */
 extern void kvfree(const void *addr);
 
-static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
        if (head) {
                call_rcu(head, func);
@@ -61,6 +61,15 @@ static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
        kvfree((void *) func);
 }
 
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+#else
+static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+       __kvfree_call_rcu(head, func);
+}
+#endif
+
 void rcu_qs(void);
 
 static inline void rcu_softirq_qs(void)
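
For context, Tiny RCU's kvfree_call_rcu() above is normally reached through the kfree_rcu()/kvfree_rcu() macros rather than called directly.  A hedged usage sketch follows; struct my_obj and my_obj_release() are hypothetical names used only for illustration.

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_obj {
            int payload;
            struct rcu_head rh;     /* Required by the two-argument kfree_rcu(). */
    };

    static void my_obj_release(struct my_obj *p)
    {
            kfree_rcu(p, rh);       /* Frees p after a grace period elapses. */
    }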
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a6..72242bc 100644
@@ -843,8 +843,9 @@ struct task_struct {
        int                             trc_reader_nesting;
        int                             trc_ipi_to_cpu;
        union rcu_special               trc_reader_special;
-       bool                            trc_reader_checked;
        struct list_head                trc_holdout_list;
+       struct list_head                trc_blkd_node;
+       int                             trc_blkd_cpu;
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 
        struct sched_info               sched_info;
@@ -2223,6 +2224,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
+extern struct task_struct *cpu_curr_snapshot(int cpu);
 
 /*
  * In order to reduce various lock holder preemption latencies provide an
diff --git a/init/init_task.c b/init/init_task.c
index 73cc8f0..ff6c4b9 100644
@@ -157,6 +157,7 @@ struct task_struct init_task
        .trc_reader_nesting = 0,
        .trc_reader_special.s = 0,
        .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
+       .trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
 #endif
 #ifdef CONFIG_CPUSETS
        .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d44f2d..1950eb8 100644
@@ -1814,6 +1814,7 @@ static inline void rcu_copy_process(struct task_struct *p)
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
+       INIT_LIST_HEAD(&p->trc_blkd_node);
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 1c630e5..c05ca52 100644
@@ -262,6 +262,35 @@ config RCU_NOCB_CPU
          Say Y here if you need reduced OS jitter, despite added overhead.
          Say N here if you are unsure.
 
+config RCU_NOCB_CPU_DEFAULT_ALL
+       bool "Offload RCU callback processing from all CPUs by default"
+       depends on RCU_NOCB_CPU
+       default n
+       help
+         Use this option to offload callback processing from all CPUs
+         by default, in the absence of the rcu_nocbs or nohz_full boot
+         parameter. This also avoids the need to use any boot parameters
+         to achieve the effect of offloading all CPUs on boot.
+
+         Say Y here if you want to offload all CPUs by default on boot.
+         Say N here if you are unsure.
+
+config RCU_NOCB_CPU_CB_BOOST
+       bool "Offload RCU callback processing to real-time kthreads"
+       depends on RCU_NOCB_CPU && RCU_BOOST
+       default y if PREEMPT_RT
+       help
+         Use this option to invoke offloaded callbacks as SCHED_FIFO
+         to avoid starvation by heavy SCHED_OTHER background load.
+         Of course, running as SCHED_FIFO during callback floods will
+         cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+         of milliseconds or more.  Therefore, when enabling this option,
+         it is your responsibility to ensure that latency-sensitive
+         tasks either run with higher priority or run on some other CPU.
+
+         Say Y here if you want to set RT priority for offloading kthreads.
+         Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
 config TASKS_TRACE_RCU_READ_MB
        bool "Tasks Trace RCU readers use memory barriers in user and idle"
        depends on RCU_EXPERT && TASKS_TRACE_RCU
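
As a rough illustration of how the new options above combine, here is a hypothetical .config fragment and its boot-parameter counterpart; the 8-CPU mask is an assumption, and per the kernel-parameters.txt note earlier in this series, an explicit rcu_nocbs= takes precedence over the Kconfig default.

    # Hypothetical .config fragment: offload callbacks from all CPUs by default.
    CONFIG_RCU_NOCB_CPU=y
    CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y

    # Rough boot-parameter counterpart on an 8-CPU system; if supplied,
    # rcu_nocbs= overrides the Kconfig default.
    rcu_nocbs=0-7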
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 9b64e55..4da05be 100644
@@ -121,7 +121,7 @@ config RCU_EQS_DEBUG
 
 config RCU_STRICT_GRACE_PERIOD
        bool "Provide debug RCU implementation with short grace periods"
-       depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4
+       depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
        default n
        select PREEMPT_COUNT if PREEMPT=n
        help
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 277a5bf..3ef02d4 100644
@@ -419,6 +419,7 @@ rcu_scale_writer(void *arg)
        VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
        WARN_ON(!wdpp);
        set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+       current->flags |= PF_NO_SETAFFINITY;
        sched_set_fifo_low(current);
 
        if (holdoff)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 0788ef2..d8e1b27 100644
@@ -75,64 +75,47 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
 
 torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
              "Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
-             "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
 torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
 torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(int, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
 torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
-             "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
-             "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
 torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
 torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
-            "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
 torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
 torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
 torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
 torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
 torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
-torture_param(int, n_barrier_cbs, 0,
-            "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
 torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
 torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
-            "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
 torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
-            "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
 torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
 torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
-torture_param(int, read_exit_delay, 13,
-             "Delay between read-then-exit episodes (s)");
-torture_param(int, read_exit_burst, 16,
-             "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
 torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
 torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
 torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
-            "Time to wait before starting stall (s).");
-torture_param(bool, stall_no_softlockup, false,
-            "Avoid softlockup warning during cpu stall.");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
 torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
 torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
-torture_param(int, stall_gp_kthread, 0,
-             "Grace-period kthread stall duration (s).");
-torture_param(int, stat_interval, 60,
-            "Number of seconds between stats printk()s");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of seconds to run/halt test");
 torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
-            "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
-            "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
-            "Test support for tickless idle CPUs");
-torture_param(int, verbose, 1,
-            "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 
 static char *torture_type = "rcu";
 module_param(torture_type, charp, 0444);
@@ -1386,8 +1369,9 @@ rcu_torture_writer(void *arg)
                                if (list_empty(&rcu_tortures[i].rtort_free) &&
                                    rcu_access_pointer(rcu_torture_current) !=
                                    &rcu_tortures[i]) {
-                                       rcu_ftrace_dump(DUMP_ALL);
+                                       tracing_off();
                                        WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+                                       rcu_ftrace_dump(DUMP_ALL);
                                }
                if (stutter_waited)
                        sched_set_normal(current, oldnice);
@@ -1945,7 +1929,7 @@ rcu_torture_stats_print(void)
                        batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
                }
        }
-       for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+       for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
                if (pipesummary[i] != 0)
                        break;
        }
@@ -2067,7 +2051,13 @@ static void rcu_torture_mem_dump_obj(void)
        static int z;
 
        kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+       if (WARN_ON_ONCE(!kcp))
+               return;
        rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+       if (WARN_ON_ONCE(!rhp)) {
+               kmem_cache_destroy(kcp);
+               return;
+       }
        pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
        pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
        mem_dump_obj(ZERO_SIZE_PTR);
@@ -2084,6 +2074,8 @@ static void rcu_torture_mem_dump_obj(void)
        kmem_cache_free(kcp, rhp);
        kmem_cache_destroy(kcp);
        rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+       if (WARN_ON_ONCE(!rhp))
+               return;
        pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
        pr_alert("mem_dump_obj(kmalloc %px):", rhp);
        mem_dump_obj(rhp);
@@ -2091,6 +2083,8 @@ static void rcu_torture_mem_dump_obj(void)
        mem_dump_obj(&rhp->func);
        kfree(rhp);
        rhp = vmalloc(4096);
+       if (WARN_ON_ONCE(!rhp))
+               return;
        pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
        pr_alert("mem_dump_obj(vmalloc %px):", rhp);
        mem_dump_obj(rhp);
@@ -2152,6 +2146,19 @@ static int rcutorture_booster_init(unsigned int cpu)
        if (boost_tasks[cpu] != NULL)
                return 0;  /* Already created, nothing more to do. */
 
+       // Testing RCU priority boosting requires rcutorture do
+       // some serious abuse.  Counter this by running ksoftirqd
+       // at higher priority.
+       if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+               struct sched_param sp;
+               struct task_struct *t;
+
+               t = per_cpu(ksoftirqd, cpu);
+               WARN_ON_ONCE(!t);
+               sp.sched_priority = 2;
+               sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+       }
+
        /* Don't allow time recalculation while creating a new task. */
        mutex_lock(&boost_mutex);
        rcu_torture_disable_rt_throttle();
@@ -2950,7 +2957,6 @@ static int rcu_torture_read_exit_child(void *trsp_in)
 // Parent kthread which creates and destroys read-exit child kthreads.
 static int rcu_torture_read_exit(void *unused)
 {
-       int count = 0;
        bool errexit = false;
        int i;
        struct task_struct *tsp;
@@ -2962,34 +2968,28 @@ static int rcu_torture_read_exit(void *unused)
 
        // Each pass through this loop does one read-exit episode.
        do {
-               if (++count > read_exit_burst) {
-                       VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
-                       rcu_barrier(); // Wait for task_struct free, avoid OOM.
-                       for (i = 0; i < read_exit_delay; i++) {
-                               schedule_timeout_uninterruptible(HZ);
-                               if (READ_ONCE(read_exit_child_stop))
-                                       break;
+               VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+               for (i = 0; i < read_exit_burst; i++) {
+                       if (READ_ONCE(read_exit_child_stop))
+                               break;
+                       stutter_wait("rcu_torture_read_exit");
+                       // Spawn child.
+                       tsp = kthread_run(rcu_torture_read_exit_child,
+                                         &trs, "%s", "rcu_torture_read_exit_child");
+                       if (IS_ERR(tsp)) {
+                               TOROUT_ERRSTRING("out of memory");
+                               errexit = true;
+                               break;
                        }
-                       if (!READ_ONCE(read_exit_child_stop))
-                               VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
-                       count = 0;
-               }
-               if (READ_ONCE(read_exit_child_stop))
-                       break;
-               // Spawn child.
-               tsp = kthread_run(rcu_torture_read_exit_child,
-                                    &trs, "%s",
-                                    "rcu_torture_read_exit_child");
-               if (IS_ERR(tsp)) {
-                       TOROUT_ERRSTRING("out of memory");
-                       errexit = true;
-                       tsp = NULL;
-                       break;
+                       cond_resched();
+                       kthread_stop(tsp);
+                       n_read_exits++;
                }
-               cond_resched();
-               kthread_stop(tsp);
-               n_read_exits ++;
-               stutter_wait("rcu_torture_read_exit");
+               VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+               rcu_barrier(); // Wait for task_struct free, avoid OOM.
+               i = 0;
+               for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+                       schedule_timeout_uninterruptible(HZ);
        } while (!errexit && !READ_ONCE(read_exit_child_stop));
 
        // Clean up and exit.
@@ -3199,6 +3199,7 @@ static void rcu_test_debug_objects(void)
        pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
        destroy_rcu_head_on_stack(&rh1);
        destroy_rcu_head_on_stack(&rh2);
+       kfree(rhp);
 #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
        pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
 #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
@@ -3406,21 +3407,6 @@ rcu_torture_init(void)
                rcutor_hp = firsterr;
                if (torture_init_error(firsterr))
                        goto unwind;
-
-               // Testing RCU priority boosting requires rcutorture do
-               // some serious abuse.  Counter this by running ksoftirqd
-               // at higher priority.
-               if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
-                       for_each_online_cpu(cpu) {
-                               struct sched_param sp;
-                               struct task_struct *t;
-
-                               t = per_cpu(ksoftirqd, cpu);
-                               WARN_ON_ONCE(!t);
-                               sp.sched_priority = 2;
-                               sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
-                       }
-               }
        }
        shutdown_jiffies = jiffies + shutdown_secs * HZ;
        firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 909644a..435c884 100644
@@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = {
 };
 
 // Definitions for global spinlock
-static DEFINE_SPINLOCK(test_lock);
+static DEFINE_RAW_SPINLOCK(test_lock);
 
 static void ref_lock_section(const int nloops)
 {
@@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops)
 
        preempt_disable();
        for (i = nloops; i >= 0; i--) {
-               spin_lock(&test_lock);
-               spin_unlock(&test_lock);
+               raw_spin_lock(&test_lock);
+               raw_spin_unlock(&test_lock);
        }
        preempt_enable();
 }
@@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
 
        preempt_disable();
        for (i = nloops; i >= 0; i--) {
-               spin_lock(&test_lock);
+               raw_spin_lock(&test_lock);
                un_delay(udl, ndl);
-               spin_unlock(&test_lock);
+               raw_spin_unlock(&test_lock);
        }
        preempt_enable();
 }
@@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops)
 
        preempt_disable();
        for (i = nloops; i >= 0; i--) {
-               spin_lock_irqsave(&test_lock, flags);
-               spin_unlock_irqrestore(&test_lock, flags);
+               raw_spin_lock_irqsave(&test_lock, flags);
+               raw_spin_unlock_irqrestore(&test_lock, flags);
        }
        preempt_enable();
 }
@@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
 
        preempt_disable();
        for (i = nloops; i >= 0; i--) {
-               spin_lock_irqsave(&test_lock, flags);
+               raw_spin_lock_irqsave(&test_lock, flags);
                un_delay(udl, ndl);
-               spin_unlock_irqrestore(&test_lock, flags);
+               raw_spin_unlock_irqrestore(&test_lock, flags);
        }
        preempt_enable();
 }
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 50ba70f..1c304fe 100644
@@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
        return sum;
 }
 
-#define SRCU_INTERVAL          1       // Base delay if no expedited GPs pending.
-#define SRCU_MAX_INTERVAL      10      // Maximum incremental delay from slow readers.
-#define SRCU_MAX_NODELAY_PHASE 1       // Maximum per-GP-phase consecutive no-delay instances.
-#define SRCU_MAX_NODELAY       100     // Maximum consecutive no-delay instances.
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited().  We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections.  If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY         5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL          1               // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL      10              // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO      3UL     // Lowmark on default per-GP-phase
+                                                       // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI      1000UL  // Highmark on default per-GP-phase
+                                                       // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low)     ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high)    ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high)  SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// Per-GP-phase no-delay instances adjusted to allow non-sleeping polling for
+// up to one jiffy's duration.  The multiplication by 2 accounts for the
+// srcu_get_delay() call made from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED        \
+       (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+       SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED,  \
+                     SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,        \
+                     SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY       (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+                                        SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
 
 /*
  * Return grace-period delay, zero if there are expedited grace
@@ -522,16 +564,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
  */
 static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 {
+       unsigned long gpstart;
+       unsigned long j;
        unsigned long jbase = SRCU_INTERVAL;
 
        if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
                jbase = 0;
-       if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
-               jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
-       if (!jbase) {
-               WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
-               if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
-                       jbase = 1;
+       if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+               j = jiffies - 1;
+               gpstart = READ_ONCE(ssp->srcu_gp_start);
+               if (time_after(j, gpstart))
+                       jbase += j - gpstart;
+               if (!jbase) {
+                       WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+                       if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+                               jbase = 1;
+               }
        }
        return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
 }
@@ -607,15 +655,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
 /*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited().  We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections.  If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
- */
-#define SRCU_RETRY_CHECK_DELAY         5
-
-/*
  * Start an SRCU grace period.
  */
 static void srcu_gp_start(struct srcu_struct *ssp)
@@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
  */
 static void srcu_gp_end(struct srcu_struct *ssp)
 {
-       unsigned long cbdelay;
+       unsigned long cbdelay = 1;
        bool cbs;
        bool last_lvl;
        int cpu;
@@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
        spin_lock_irq_rcu_node(ssp);
        idx = rcu_seq_state(ssp->srcu_gp_seq);
        WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
-       cbdelay = !!srcu_get_delay(ssp);
+       if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+               cbdelay = 0;
+
        WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
        rcu_seq_end(&ssp->srcu_gp_seq);
        gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
  */
 static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
 {
+       unsigned long curdelay;
+
+       curdelay = !srcu_get_delay(ssp);
+
        for (;;) {
                if (srcu_readers_active_idx_check(ssp, idx))
                        return true;
-               if (--trycount + !srcu_get_delay(ssp) <= 0)
+               if ((--trycount + curdelay) <= 0)
                        return false;
-               udelay(SRCU_RETRY_CHECK_DELAY);
+               udelay(srcu_retry_check_delay);
        }
 }
 
@@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
                j = jiffies;
                if (READ_ONCE(ssp->reschedule_jiffies) == j) {
                        WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
-                       if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+                       if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
                                curdelay = 1;
                } else {
                        WRITE_ONCE(ssp->reschedule_count, 1);
@@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
        pr_info("Hierarchical SRCU implementation.\n");
        if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
                pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+       if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+               pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+       if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+               pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+       pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
        return 0;
 }
 early_initcall(srcu_bootup_announce);
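
For illustration, the SRCU_DEFAULT_MAX_NODELAY_PHASE definition added above evaluates to 2 * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY, clamped to the [3, 1000] range.  The small standalone sketch below reproduces that arithmetic; the HZ values are assumptions chosen for the example.

    #include <stdio.h>

    #define USEC_PER_SEC            1000000UL
    #define RETRY_CHECK_DELAY       5UL     /* SRCU_DEFAULT_RETRY_CHECK_DELAY */
    #define PHASE_LO                3UL
    #define PHASE_HI                1000UL

    static unsigned long max_nodelay_phase(unsigned long hz)
    {
            unsigned long adjusted = 2UL * USEC_PER_SEC / hz / RETRY_CHECK_DELAY;

            if (adjusted < PHASE_LO)
                    return PHASE_LO;
            if (adjusted > PHASE_HI)
                    return PHASE_HI;
            return adjusted;
    }

    int main(void)
    {
            printf("HZ=1000: %lu\n", max_nodelay_phase(1000)); /* 400 */
            printf("HZ=250:  %lu\n", max_nodelay_phase(250));  /* 1000 (clamped) */
            return 0;
    }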
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 3925e32..83c7e66 100644
@@ -14,7 +14,7 @@
 
 struct rcu_tasks;
 typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
-typedef void (*pregp_func_t)(void);
+typedef void (*pregp_func_t)(struct list_head *hop);
 typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
 typedef void (*postscan_func_t)(struct list_head *hop);
 typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
@@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
  * @rtp_work: Work queue for invoking callbacks.
  * @rtp_irq_work: IRQ work queue for deferred wakeups.
  * @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
  * @cpu: CPU number corresponding to this entry.
  * @rtpp: Pointer to the rcu_tasks structure.
  */
@@ -40,6 +41,7 @@ struct rcu_tasks_percpu {
        struct work_struct rtp_work;
        struct irq_work rtp_irq_work;
        struct rcu_head barrier_q_head;
+       struct list_head rtp_blkd_tasks;
        int cpu;
        struct rcu_tasks *rtpp;
 };
@@ -48,6 +50,7 @@ struct rcu_tasks_percpu {
  * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
  * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
  * @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
  * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
  * @gp_func: This flavor's grace-period-wait function.
  * @gp_state: Grace period's most recent state transition (debugging).
@@ -79,6 +82,7 @@ struct rcu_tasks_percpu {
 struct rcu_tasks {
        struct rcuwait cbs_wait;
        raw_spinlock_t cbs_gbl_lock;
+       struct mutex tasks_gp_mutex;
        int gp_state;
        int gp_sleep;
        int init_fract;
@@ -119,6 +123,7 @@ static struct rcu_tasks rt_name =                                                   \
 {                                                                                      \
        .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait),                                \
        .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock),                 \
+       .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex),                  \
        .gp_func = gp,                                                                  \
        .call_func = call,                                                              \
        .rtpcpu = &rt_name ## __percpu,                                                 \
@@ -140,6 +145,7 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
 module_param(rcu_task_ipi_delay, int, 0644);
 
 /* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */
+#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
 #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
 static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
 module_param(rcu_task_stall_timeout, int, 0644);
@@ -253,6 +259,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
                INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
                rtpcp->cpu = cpu;
                rtpcp->rtpp = rtp;
+               if (!rtpcp->rtp_blkd_tasks.next)
+                       INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
                raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled.
        }
        raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
@@ -323,17 +331,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
                irq_work_queue(&rtpcp->rtp_irq_work);
 }
 
-// Wait for a grace period for the specified flavor of Tasks RCU.
-static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
-{
-       /* Complain if the scheduler has not started.  */
-       RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
-                        "synchronize_rcu_tasks called too soon");
-
-       /* Wait for the grace period. */
-       wait_rcu_gp(rtp->call_func);
-}
-
 // RCU callback function for rcu_barrier_tasks_generic().
 static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
 {
@@ -439,6 +436,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
                        WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
                        pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
                }
+               for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+                       struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+                       WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+               }
                raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
        }
 
@@ -497,10 +499,41 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
        rcu_tasks_invoke_cbs(rtp, rtpcp);
 }
 
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
 {
        int needgpcb;
+
+       mutex_lock(&rtp->tasks_gp_mutex);
+
+       // If there were none, wait a bit and start over.
+       if (unlikely(midboot)) {
+               needgpcb = 0x2;
+       } else {
+               set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+               rcuwait_wait_event(&rtp->cbs_wait,
+                                  (needgpcb = rcu_tasks_need_gpcb(rtp)),
+                                  TASK_IDLE);
+       }
+
+       if (needgpcb & 0x2) {
+               // Wait for one grace period.
+               set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+               rtp->gp_start = jiffies;
+               rcu_seq_start(&rtp->tasks_gp_seq);
+               rtp->gp_func(rtp);
+               rcu_seq_end(&rtp->tasks_gp_seq);
+       }
+
+       // Invoke callbacks.
+       set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+       rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+       mutex_unlock(&rtp->tasks_gp_mutex);
+}
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
        struct rcu_tasks *rtp = arg;
 
        /* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
@@ -514,29 +547,28 @@ static int __noreturn rcu_tasks_kthread(void *arg)
         * This loop is terminated by the system going down.  ;-)
         */
        for (;;) {
-               set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+               // Wait for one grace period and invoke any callbacks
+               // that are ready.
+               rcu_tasks_one_gp(rtp, false);
 
-               /* If there were none, wait a bit and start over. */
-               rcuwait_wait_event(&rtp->cbs_wait,
-                                  (needgpcb = rcu_tasks_need_gpcb(rtp)),
-                                  TASK_IDLE);
-
-               if (needgpcb & 0x2) {
-                       // Wait for one grace period.
-                       set_tasks_gp_state(rtp, RTGS_WAIT_GP);
-                       rtp->gp_start = jiffies;
-                       rcu_seq_start(&rtp->tasks_gp_seq);
-                       rtp->gp_func(rtp);
-                       rcu_seq_end(&rtp->tasks_gp_seq);
-               }
+               // Paranoid sleep to keep this from entering a tight loop.
+               schedule_timeout_idle(rtp->gp_sleep);
+       }
+}
 
-               /* Invoke callbacks. */
-               set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
-               rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+       /* Complain if the scheduler has not started.  */
+       RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+                        "synchronize_rcu_tasks called too soon");
 
-               /* Paranoid sleep to keep this from entering a tight loop */
-               schedule_timeout_idle(rtp->gp_sleep);
+       // If the grace-period kthread is running, use it.
+       if (READ_ONCE(rtp->kthread_ptr)) {
+               wait_rcu_gp(rtp->call_func);
+               return;
        }
+       rcu_tasks_one_gp(rtp, true);
 }
 
 /* Spawn RCU-tasks grace-period kthread. */
@@ -630,7 +662,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
        struct task_struct *t;
 
        set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
-       rtp->pregp_func();
+       rtp->pregp_func(&holdouts);
 
        /*
         * There were callbacks, so we need to wait for an RCU-tasks
@@ -639,10 +671,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
         * and make a list of them in holdouts.
         */
        set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
-       rcu_read_lock();
-       for_each_process_thread(g, t)
-               rtp->pertask_func(t, &holdouts);
-       rcu_read_unlock();
+       if (rtp->pertask_func) {
+               rcu_read_lock();
+               for_each_process_thread(g, t)
+                       rtp->pertask_func(t, &holdouts);
+               rcu_read_unlock();
+       }
 
        set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
        rtp->postscan_func(&holdouts);
@@ -760,7 +794,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 // disabling.
 
 /* Pre-grace-period preparation. */
-static void rcu_tasks_pregp_step(void)
+static void rcu_tasks_pregp_step(struct list_head *hop)
 {
        /*
         * Wait for all pre-existing t->on_rq and t->nvcsw transitions
@@ -1105,11 +1139,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
 // 3.  Avoids expensive read-side instructions, having overhead similar
 //     to that of Preemptible RCU.
 //
-// There are of course downsides.  The grace-period code can send IPIs to
-// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace.
-// It is necessary to scan the full tasklist, much as for Tasks RCU.  There
-// is a single callback queue guarded by a single lock, again, much as for
-// Tasks RCU.  If needed, these downsides can be at least partially remedied.
+// There are of course downsides.  For example, the grace-period code
+// can send IPIs to CPUs, even when those CPUs are in the idle loop or
+// in nohz_full userspace.  If needed, these downsides can be at least
+// partially remedied.
 //
 // Perhaps most important, this variant of RCU does not affect the vanilla
 // flavors, rcu_preempt and rcu_sched.  The fact that RCU Tasks Trace
@@ -1122,38 +1155,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
 // invokes these functions in this order:
 //
 // rcu_tasks_trace_pregp_step():
-//     Initialize the count of readers and block CPU-hotplug operations.
-// rcu_tasks_trace_pertask(), invoked on every non-idle task:
-//     Initialize per-task state and attempt to identify an immediate
-//     quiescent state for that task, or, failing that, attempt to
-//     set that task's .need_qs flag so that task's next outermost
-//     rcu_read_unlock_trace() will report the quiescent state (in which
-//     case the count of readers is incremented).  If both attempts fail,
-//     the task is added to a "holdout" list.  Note that IPIs are used
-//     to invoke trc_read_check_handler() in the context of running tasks
-//     in order to avoid ordering overhead on common-case shared-variable
-//     accessses.
+//     Disables CPU hotplug, adds all currently executing tasks to the
+//     holdout list, then checks the state of all tasks that blocked
+//     or were preempted within their current RCU Tasks Trace read-side
+//     critical section, adding them to the holdout list if appropriate.
+//     Finally, this function re-enables CPU hotplug.
+// The ->pertask_func() pointer is NULL, so there is no per-task processing.
 // rcu_tasks_trace_postscan():
-//     Initialize state and attempt to identify an immediate quiescent
-//     state as above (but only for idle tasks), unblock CPU-hotplug
-//     operations, and wait for an RCU grace period to avoid races with
-//     tasks that are in the process of exiting.
+//     Invokes synchronize_rcu() to wait for late-stage exiting tasks
+//     to finish exiting.
 // check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
 //     Scans the holdout list, attempting to identify a quiescent state
 //     for each task on the list.  If there is a quiescent state, the
-//     corresponding task is removed from the holdout list.
+//     corresponding task is removed from the holdout list.  Once this
+//     list is empty, the grace period has completed.
 // rcu_tasks_trace_postgp():
-//     Wait for the count of readers do drop to zero, reporting any stalls.
-//     Also execute full memory barriers to maintain ordering with code
-//     executing after the grace period.
+//     Provides the needed full memory barrier and does debug checks.
 //
 // The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
 //
-// Pre-grace-period update-side code is ordered before the grace
-// period via the ->cbs_lock and barriers in rcu_tasks_kthread().
-// Pre-grace-period read-side code is ordered before the grace period by
-// atomic_dec_and_test() of the count of readers (for IPIed readers) and by
-// scheduler context-switch ordering (for locked-down non-running readers).
+// Pre-grace-period update-side code is ordered before the grace period
+// via the ->cbs_lock and barriers in rcu_tasks_kthread().  Pre-grace-period
+// read-side code is ordered before the grace period by atomic operations
+// on .b.need_qs flag of each task involved in this process, or by scheduler
+// context-switch ordering (for locked-down non-running readers).
 
 // The lockdep state must be outside of #ifdef to be useful.
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -1165,9 +1190,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
 
 #ifdef CONFIG_TASKS_TRACE_RCU
 
-static atomic_t trc_n_readers_need_end;                // Number of waited-for readers.
-static DECLARE_WAIT_QUEUE_HEAD(trc_wait);      // List of holdout tasks.
-
 // Record outstanding IPIs to each CPU.  No point in sending two...
 static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
 
@@ -1176,44 +1198,104 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
 static unsigned long n_heavy_reader_attempts;
 static unsigned long n_heavy_reader_updates;
 static unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_trc_holdouts;
 
 void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
 DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
                 "RCU Tasks Trace");
 
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+       smp_mb(); // Enforce full grace-period ordering.
+       return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+       smp_store_release(&t->trc_reader_special.b.need_qs, v);
+       smp_mb(); // Enforce full grace-period ordering.
+}
+
 /*
- * This irq_work handler allows rcu_read_unlock_trace() to be invoked
- * while the scheduler locks are held.
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
  */
-static void rcu_read_unlock_iw(struct irq_work *iwp)
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
 {
-       wake_up(&trc_wait);
+       union rcu_special ret;
+       union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+       union rcu_special trs_new = trs_old;
+
+       if (trs_old.b.need_qs != old)
+               return trs_old.b.need_qs;
+       trs_new.b.need_qs = new;
+       ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+       return ret.b.need_qs;
 }
-static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
 
-/* If we are the last reader, wake up the grace-period kthread. */
+/*
+ * If we are the last reader, signal the grace-period kthread.
+ * Also remove from the per-CPU list of blocked tasks.
+ */
 void rcu_read_unlock_trace_special(struct task_struct *t)
 {
-       int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
+       unsigned long flags;
+       struct rcu_tasks_percpu *rtpcp;
+       union rcu_special trs;
+
+       // Open-coded full-word version of rcu_ld_need_qs().
+       smp_mb(); // Enforce full grace-period ordering.
+       trs = smp_load_acquire(&t->trc_reader_special);
 
-       if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
-           t->trc_reader_special.b.need_mb)
+       if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
                smp_mb(); // Pairs with update-side barriers.
        // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
-       if (nq)
-               WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
+       if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
+               u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+                                                      TRC_NEED_QS_CHECKED);
+
+               WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
+       }
+       if (trs.b.blocked) {
+               rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
+               raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+               list_del_init(&t->trc_blkd_node);
+               WRITE_ONCE(t->trc_reader_special.b.blocked, false);
+               raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+       }
        WRITE_ONCE(t->trc_reader_nesting, 0);
-       if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
-               irq_work_queue(&rcu_tasks_trace_iw);
 }
 EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
 
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+       unsigned long flags;
+       struct rcu_tasks_percpu *rtpcp;
+
+       local_irq_save(flags);
+       rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+       raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+       t->trc_blkd_cpu = smp_processor_id();
+       if (!rtpcp->rtp_blkd_tasks.next)
+               INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+       list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+       WRITE_ONCE(t->trc_reader_special.b.blocked, true);
+       raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
 /* Add a task to the holdout list, if it is not already on the list. */
 static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
 {
        if (list_empty(&t->trc_holdout_list)) {
                get_task_struct(t);
                list_add(&t->trc_holdout_list, bhp);
+               n_trc_holdouts++;
        }
 }
 
@@ -1223,37 +1305,36 @@ static void trc_del_holdout(struct task_struct *t)
        if (!list_empty(&t->trc_holdout_list)) {
                list_del_init(&t->trc_holdout_list);
                put_task_struct(t);
+               n_trc_holdouts--;
        }
 }
 
 /* IPI handler to check task state. */
 static void trc_read_check_handler(void *t_in)
 {
+       int nesting;
        struct task_struct *t = current;
        struct task_struct *texp = t_in;
 
        // If the task is no longer running on this CPU, leave.
-       if (unlikely(texp != t)) {
+       if (unlikely(texp != t))
                goto reset_ipi; // Already on holdout list, so will check later.
-       }
 
        // If the task is not in a read-side critical section, and
        // if this is the last reader, awaken the grace-period kthread.
-       if (likely(!READ_ONCE(t->trc_reader_nesting))) {
-               WRITE_ONCE(t->trc_reader_checked, true);
+       nesting = READ_ONCE(t->trc_reader_nesting);
+       if (likely(!nesting)) {
+               rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
                goto reset_ipi;
        }
        // If we are racing with an rcu_read_unlock_trace(), try again later.
-       if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0))
+       if (unlikely(nesting < 0))
                goto reset_ipi;
-       WRITE_ONCE(t->trc_reader_checked, true);
 
-       // Get here if the task is in a read-side critical section.  Set
-       // its state so that it will awaken the grace-period kthread upon
-       // exit from that critical section.
-       atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-       WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
-       WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+       // Get here if the task is in a read-side critical section.
+       // Set its state so that it will update state for the grace-period
+       // kthread upon exit from that critical section.
+       rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
 
 reset_ipi:
        // Allow future IPIs to be sent on CPU and for task.
@@ -1264,48 +1345,50 @@ reset_ipi:
 }
 
 /* Callback function for scheduler to check locked-down task.  */
-static int trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
 {
+       struct list_head *bhp = bhp_in;
        int cpu = task_cpu(t);
        int nesting;
        bool ofl = cpu_is_offline(cpu);
 
-       if (task_curr(t)) {
-               WARN_ON_ONCE(ofl && !is_idle_task(t));
-
+       if (task_curr(t) && !ofl) {
                // If no chance of heavyweight readers, do it the hard way.
-               if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+               if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
                        return -EINVAL;
 
                // If heavyweight readers are enabled on the remote task,
                // we can inspect its state despite its currently running.
                // However, we cannot safely change its state.
                n_heavy_reader_attempts++;
-               if (!ofl && // Check for "running" idle tasks on offline CPUs.
-                   !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+               // Check for "running" idle tasks on offline CPUs.
+               if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
                        return -EINVAL; // No quiescent state, do it the hard way.
                n_heavy_reader_updates++;
-               if (ofl)
-                       n_heavy_reader_ofl_updates++;
                nesting = 0;
        } else {
                // The task is not running, so C-language access is safe.
                nesting = t->trc_reader_nesting;
+               WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t));
+               if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+                       n_heavy_reader_ofl_updates++;
        }
 
        // If not exiting a read-side critical section, mark as checked
        // so that the grace-period kthread will remove it from the
        // holdout list.
-       t->trc_reader_checked = nesting >= 0;
-       if (nesting <= 0)
-               return nesting ? -EINVAL : 0;  // If in QS, done, otherwise try again later.
+       if (!nesting) {
+               rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+               return 0;  // In QS, so done.
+       }
+       if (nesting < 0)
+               return -EINVAL; // Reader transitioning, try again later.
 
        // The task is in a read-side critical section, so set up its
-       // state so that it will awaken the grace-period kthread upon exit
-       // from that critical section.
-       atomic_inc(&trc_n_readers_need_end); // One more to wait on.
-       WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
-       WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+       // state so that it will update state upon exit from that critical
+       // section.
+       if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+               trc_add_holdout(t, bhp);
        return 0;
 }
 
@@ -1321,14 +1404,14 @@ static void trc_wait_for_one_reader(struct task_struct *t,
 
        // The current task had better be in a quiescent state.
        if (t == current) {
-               t->trc_reader_checked = true;
+               rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
                WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
                return;
        }
 
        // Attempt to nail down the task for inspection.
        get_task_struct(t);
-       if (!task_call_func(t, trc_inspect_reader, NULL)) {
+       if (!task_call_func(t, trc_inspect_reader, bhp)) {
                put_task_struct(t);
                return;
        }
@@ -1366,56 +1449,93 @@ static void trc_wait_for_one_reader(struct task_struct *t,
        }
 }
 
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+       // During early boot when there is only the one boot CPU, there
+       // is no idle task for the other CPUs.  Also, the grace-period
+       // kthread is always in a quiescent state.  In addition, just return
+       // if this task is already on the list.
+       if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
+               return false;
+
+       rcu_st_need_qs(t, 0);
+       t->trc_ipi_to_cpu = -1;
+       return true;
+}
+
+/* Do first-round processing for the specified task. */
+static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
+{
+       if (rcu_tasks_trace_pertask_prep(t, true))
+               trc_wait_for_one_reader(t, hop);
+}
+
 /* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(void)
+static void rcu_tasks_trace_pregp_step(struct list_head *hop)
 {
+       LIST_HEAD(blkd_tasks);
        int cpu;
-
-       // Allow for fast-acting IPIs.
-       atomic_set(&trc_n_readers_need_end, 1);
+       unsigned long flags;
+       struct rcu_tasks_percpu *rtpcp;
+       struct task_struct *t;
 
        // There shouldn't be any old IPIs, but...
        for_each_possible_cpu(cpu)
                WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
 
-       // Disable CPU hotplug across the tasklist scan.
-       // This also waits for all readers in CPU-hotplug code paths.
+       // Disable CPU hotplug across the CPU scan for the benefit of
+       // any IPIs that might be needed.  This also waits for all readers
+       // in CPU-hotplug code paths.
        cpus_read_lock();
-}
 
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t,
-                                   struct list_head *hop)
-{
-       // During early boot when there is only the one boot CPU, there
-       // is no idle task for the other CPUs. Just return.
-       if (unlikely(t == NULL))
-               return;
+       // These rcu_tasks_trace_pertask_prep() calls are serialized to
+       // allow safe access to the hop list.
+       for_each_online_cpu(cpu) {
+               rcu_read_lock();
+               t = cpu_curr_snapshot(cpu);
+               if (rcu_tasks_trace_pertask_prep(t, true))
+                       trc_add_holdout(t, hop);
+               rcu_read_unlock();
+       }
 
-       WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
-       WRITE_ONCE(t->trc_reader_checked, false);
-       t->trc_ipi_to_cpu = -1;
-       trc_wait_for_one_reader(t, hop);
+       // Only after all running tasks have been accounted for is it
+       // safe to take care of the tasks that have blocked within their
+       // current RCU tasks trace read-side critical section.
+       for_each_possible_cpu(cpu) {
+               rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
+               raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+               list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
+               while (!list_empty(&blkd_tasks)) {
+                       rcu_read_lock();
+                       t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
+                       list_del_init(&t->trc_blkd_node);
+                       list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+                       raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+                       rcu_tasks_trace_pertask(t, hop);
+                       rcu_read_unlock();
+                       raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+               }
+               raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+       }
+
+       // Re-enable CPU hotplug now that the holdout list is populated.
+       cpus_read_unlock();
 }
 
 /*
- * Do intermediate processing between task and holdout scans and
- * pick up the idle tasks.
+ * Do intermediate processing between task and holdout scans.
  */
 static void rcu_tasks_trace_postscan(struct list_head *hop)
 {
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               rcu_tasks_trace_pertask(idle_task(cpu), hop);
-
-       // Re-enable CPU hotplug now that the tasklist scan has completed.
-       cpus_read_unlock();
-
        // Wait for late-stage exiting tasks to finish exiting.
        // These might have passed the call to exit_tasks_rcu_finish().
        synchronize_rcu();
-       // Any tasks that exit after this point will set ->trc_reader_checked.
+       // Any tasks that exit after this point will set
+       // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
 }
 
 /* Communicate task state back to the RCU tasks trace stall warning request. */
@@ -1429,11 +1549,11 @@ static int trc_check_slow_task(struct task_struct *t, void *arg)
 {
        struct trc_stall_chk_rdr *trc_rdrp = arg;
 
-       if (task_curr(t))
+       if (task_curr(t) && cpu_online(task_cpu(t)))
                return false; // It is running, so decline to inspect it.
        trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
        trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
-       trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs);
+       trc_rdrp->needqs = rcu_ld_need_qs(t);
        return true;
 }
 
@@ -1450,18 +1570,21 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
        }
        cpu = task_cpu(t);
        if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
-               pr_alert("P%d: %c\n",
+               pr_alert("P%d: %c%c\n",
                         t->pid,
+                        ".I"[t->trc_ipi_to_cpu >= 0],
                         ".i"[is_idle_tsk]);
        else
-               pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
+               pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
                         t->pid,
                         ".I"[trc_rdr.ipi_to_cpu >= 0],
                         ".i"[is_idle_tsk],
                         ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+                        ".B"[!!data_race(t->trc_reader_special.b.blocked)],
                         trc_rdr.nesting,
-                        " N"[!!trc_rdr.needqs],
-                        cpu);
+                        " !CN"[trc_rdr.needqs & 0x3],
+                        " ?"[trc_rdr.needqs > 0x3],
+                        cpu, cpu_online(cpu) ? "" : "(offline)");
        sched_show_task(t);
 }
 
@@ -1481,18 +1604,18 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
 {
        struct task_struct *g, *t;
 
-       // Disable CPU hotplug across the holdout list scan.
+       // Disable CPU hotplug across the holdout list scan for IPIs.
        cpus_read_lock();
 
        list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
                // If safe and needed, try to check the current task.
                if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
-                   !READ_ONCE(t->trc_reader_checked))
+                   !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
                        trc_wait_for_one_reader(t, hop);
 
                // If check succeeded, remove this task from the list.
                if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
-                   READ_ONCE(t->trc_reader_checked))
+                   rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
                        trc_del_holdout(t);
                else if (needreport)
                        show_stalled_task_trace(t, firstreport);
@@ -1516,10 +1639,6 @@ static void rcu_tasks_trace_empty_fn(void *unused)
 static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 {
        int cpu;
-       bool firstreport;
-       struct task_struct *g, *t;
-       LIST_HEAD(holdouts);
-       long ret;
 
        // Wait for any lingering IPI handlers to complete.  Note that
        // if a CPU has gone offline or transitioned to userspace in the
@@ -1530,37 +1649,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
                if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
                        smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
 
-       // Remove the safety count.
-       smp_mb__before_atomic();  // Order vs. earlier atomics
-       atomic_dec(&trc_n_readers_need_end);
-       smp_mb__after_atomic();  // Order vs. later atomics
-
-       // Wait for readers.
-       set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
-       for (;;) {
-               ret = wait_event_idle_exclusive_timeout(
-                               trc_wait,
-                               atomic_read(&trc_n_readers_need_end) == 0,
-                               READ_ONCE(rcu_task_stall_timeout));
-               if (ret)
-                       break;  // Count reached zero.
-               // Stall warning time, so make a list of the offenders.
-               rcu_read_lock();
-               for_each_process_thread(g, t)
-                       if (READ_ONCE(t->trc_reader_special.b.need_qs))
-                               trc_add_holdout(t, &holdouts);
-               rcu_read_unlock();
-               firstreport = true;
-               list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
-                       if (READ_ONCE(t->trc_reader_special.b.need_qs))
-                               show_stalled_task_trace(t, &firstreport);
-                       trc_del_holdout(t); // Release task_struct reference.
-               }
-               if (firstreport)
-                       pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
-               show_stalled_ipi_trace();
-               pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
-       }
        smp_mb(); // Caller's code must be ordered after wakeup.
                  // Pairs with pretty much every ordering primitive.
 }
@@ -1568,11 +1656,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
 /* Report any needed quiescent state for this exiting task. */
 static void exit_tasks_rcu_finish_trace(struct task_struct *t)
 {
-       WRITE_ONCE(t->trc_reader_checked, true);
+       union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+       rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
        WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
-       WRITE_ONCE(t->trc_reader_nesting, 0);
-       if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
+       if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
                rcu_read_unlock_trace_special(t);
+       else
+               WRITE_ONCE(t->trc_reader_nesting, 0);
 }
 
 /**
@@ -1646,7 +1737,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
                        rcu_tasks_trace.init_fract = 1;
        }
        rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
-       rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask;
        rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
        rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
        rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
@@ -1659,7 +1749,8 @@ void show_rcu_tasks_trace_gp_kthread(void)
 {
        char buf[64];
 
-       sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
+       sprintf(buf, "N%lu h:%lu/%lu/%lu",
+               data_race(n_trc_holdouts),
                data_race(n_heavy_reader_ofl_updates),
                data_race(n_heavy_reader_updates),
                data_race(n_heavy_reader_attempts));
@@ -1686,23 +1777,24 @@ struct rcu_tasks_test_desc {
        struct rcu_head rh;
        const char *name;
        bool notrun;
+       unsigned long runstart;
 };
 
 static struct rcu_tasks_test_desc tests[] = {
        {
                .name = "call_rcu_tasks()",
                /* If not defined, the test is skipped. */
-               .notrun = !IS_ENABLED(CONFIG_TASKS_RCU),
+               .notrun = IS_ENABLED(CONFIG_TASKS_RCU),
        },
        {
                .name = "call_rcu_tasks_rude()",
                /* If not defined, the test is skipped. */
-               .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+               .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
        },
        {
                .name = "call_rcu_tasks_trace()",
                /* If not defined, the test is skipped. */
-               .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+               .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
        }
 };
 
@@ -1713,46 +1805,85 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
 
        pr_info("Callback from %s invoked.\n", rttd->name);
 
-       rttd->notrun = true;
+       rttd->notrun = false;
 }
 
 static void rcu_tasks_initiate_self_tests(void)
 {
+       unsigned long j = jiffies;
+
        pr_info("Running RCU-tasks wait API self tests\n");
 #ifdef CONFIG_TASKS_RCU
+       tests[0].runstart = j;
        synchronize_rcu_tasks();
        call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_RUDE_RCU
+       tests[1].runstart = j;
        synchronize_rcu_tasks_rude();
        call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
 #endif
 
 #ifdef CONFIG_TASKS_TRACE_RCU
+       tests[2].runstart = j;
        synchronize_rcu_tasks_trace();
        call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
 #endif
 }
 
+/*
+ * Return:  0 - test passed
+ *         1 - test failed, but has not timed out yet
+ *        -1 - test failed and timed out
+ */
 static int rcu_tasks_verify_self_tests(void)
 {
        int ret = 0;
        int i;
+       unsigned long bst = rcu_task_stall_timeout;
 
+       if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
+               bst = RCU_TASK_BOOT_STALL_TIMEOUT;
        for (i = 0; i < ARRAY_SIZE(tests); i++) {
-               if (!tests[i].notrun) {         // still hanging.
-                       pr_err("%s has been failed.\n", tests[i].name);
-                       ret = -1;
+               while (tests[i].notrun) {               // still hanging.
+                       if (time_after(jiffies, tests[i].runstart + bst)) {
+                               pr_err("%s has failed boot-time tests.\n", tests[i].name);
+                               ret = -1;
+                               break;
+                       }
+                       ret = 1;
+                       break;
                }
        }
-
-       if (ret)
-               WARN_ON(1);
+       WARN_ON(ret < 0);
 
        return ret;
 }
-late_initcall(rcu_tasks_verify_self_tests);
+
+/*
+ * Repeat the rcu_tasks_verify_self_tests() call once every second until the
+ * test passes or has timed out.
+ */
+static struct delayed_work rcu_tasks_verify_work;
+static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
+{
+       int ret = rcu_tasks_verify_self_tests();
+
+       if (ret <= 0)
+               return;
+
+       /* Test has failed but not yet timed out; reschedule another check. */
+       schedule_delayed_work(&rcu_tasks_verify_work, HZ);
+}
+
+static int rcu_tasks_verify_schedule_work(void)
+{
+       INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
+       rcu_tasks_verify_work_fn(NULL);
+       return 0;
+}
+late_initcall(rcu_tasks_verify_schedule_work);
 #else /* #ifdef CONFIG_PROVE_RCU */
 static void rcu_tasks_initiate_self_tests(void) { }
 #endif /* #else #ifdef CONFIG_PROVE_RCU */
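For orientation, here is a minimal sketch of the RCU Tasks Trace reader/updater pairing whose blocked-reader handling the tasks.h changes above rework. Only rcu_read_lock_trace(), rcu_read_unlock_trace(), and synchronize_rcu_tasks_trace() are real APIs here; the example functions are hypothetical, and a reader that blocks as shown is what lands its task on the per-CPU rtp_blkd_tasks list via rcu_tasks_trace_qs_blkd().

#include <linux/rcupdate_trace.h>
#include <linux/delay.h>

/* Hypothetical reader: Tasks Trace readers are preemptible and may block. */
static void example_trace_reader(void)
{
	rcu_read_lock_trace();
	msleep(10);		/* blocking here marks the task as a blocked reader */
	rcu_read_unlock_trace();
}

/* Hypothetical updater: waits for all pre-existing readers, blocked or not. */
static void example_trace_updater(void)
{
	synchronize_rcu_tasks_trace();
}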
index 6007181..f0561ee 100644 (file)
@@ -220,6 +220,20 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+       if (head) {
+               void *ptr = (void *) head - (unsigned long) func;
+
+               kasan_record_aux_stack_noalloc(ptr);
+       }
+
+       __kvfree_call_rcu(head, func);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
 void __init rcu_init(void)
 {
        open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
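The Tiny RCU kvfree_call_rcu() wrapper added above recovers the start of the enclosing object by subtracting the rcu_head offset, which the double-argument form of kvfree_rcu() passes in via the func argument, before recording the KASAN auxiliary stack. A minimal userspace sketch of that pointer arithmetic, using hypothetical stand-in types:

#include <stddef.h>
#include <stdio.h>

/* Stand-in for the kernel type; the layout is illustrative only. */
struct rcu_head { void *next; void *func; };

struct foo {
	long payload[4];
	struct rcu_head rh;	/* embedded callback head */
};

int main(void)
{
	struct foo obj;
	struct rcu_head *head = &obj.rh;
	unsigned long offset = offsetof(struct foo, rh);

	/* Same recovery as in the wrapper: object start = head - offset. */
	void *ptr = (char *)head - offset;

	printf("recovered %p, expected %p\n", ptr, (void *)&obj);
	return 0;
}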
index 6cf5b51..62e5147 100644 (file)
@@ -154,7 +154,11 @@ static void sync_sched_exp_online_cleanup(int cpu);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
 
-/* rcuc/rcub/rcuop kthread realtime priority */
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. Whether the "rcuop"
+ * kthreads run at real-time priority is additionally controlled by
+ * the CONFIG_RCU_NOCB_CPU_CB_BOOST Kconfig option.
+ */
 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
 module_param(kthread_prio, int, 0444);
 
@@ -2045,19 +2049,23 @@ static void rcu_gp_fqs(bool first_time)
  */
 static noinline_for_stack void rcu_gp_fqs_loop(void)
 {
-       bool first_gp_fqs;
+       bool first_gp_fqs = true;
        int gf = 0;
        unsigned long j;
        int ret;
        struct rcu_node *rnp = rcu_get_root();
 
-       first_gp_fqs = true;
        j = READ_ONCE(jiffies_till_first_fqs);
        if (rcu_state.cbovld)
                gf = RCU_GP_FLAG_OVLD;
        ret = 0;
        for (;;) {
-               if (!ret) {
+               if (rcu_state.cbovld) {
+                       j = (j + 2) / 3;
+                       if (j <= 0)
+                               j = 1;
+               }
+               if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
                        WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
                        /*
                         * jiffies_force_qs before RCU_GP_WAIT_FQS state
@@ -2075,7 +2083,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
                rcu_gp_torture_wait();
                WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
                /* Locking provides needed memory barriers. */
-               /* If grace period done, leave loop. */
+               /*
+                * Exit the loop if the root rcu_node structure indicates that
+                * the grace period has ended.  The rcu_preempt_blocked_readers_cgp(rnp) check
+                * is required only for single-node rcu_node trees because readers blocking
+                * the current grace period are queued only on leaf rcu_node structures.
+                * For multi-node trees, checking the root node's ->qsmask suffices, because a
+                * given bit in the root node's ->qsmask is cleared only when all CPUs and
+                * tasks covered by the corresponding leaf nodes have passed through their
+                * quiescent states.
+                */
                if (!READ_ONCE(rnp->qsmask) &&
                    !rcu_preempt_blocked_readers_cgp(rnp))
                        break;
@@ -2605,7 +2621,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
                trace_rcu_batch_end(rcu_state.name, 0,
                                    !rcu_segcblist_empty(&rdp->cblist),
                                    need_resched(), is_idle_task(current),
-                                   rcu_is_callbacks_kthread());
+                                   rcu_is_callbacks_kthread(rdp));
                return;
        }
 
@@ -2683,7 +2699,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
        rcu_nocb_lock_irqsave(rdp, flags);
        rdp->n_cbs_invoked += count;
        trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
-                           is_idle_task(current), rcu_is_callbacks_kthread());
+                           is_idle_task(current), rcu_is_callbacks_kthread(rdp));
 
        /* Update counts and requeue any remaining callbacks. */
        rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
@@ -3286,7 +3302,6 @@ struct kfree_rcu_cpu_work {
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
  * @initialized: The @rcu_work fields have been initialized
  * @count: Number of objects for which GP not started
  * @bkvcache:
@@ -3311,7 +3326,6 @@ struct kfree_rcu_cpu {
        struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
        raw_spinlock_t lock;
        struct delayed_work monitor_work;
-       bool monitor_todo;
        bool initialized;
        int count;
 
@@ -3491,6 +3505,18 @@ static void kfree_rcu_work(struct work_struct *work)
        }
 }
 
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+       int i;
+
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (krcp->bkvhead[i])
+                       return true;
+
+       return !!krcp->head;
+}
+
 /*
  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  */
@@ -3547,9 +3573,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
        // of the channels that is still busy we should rearm the
        // work to repeat an attempt. Because previous batches are
        // still in progress.
-       if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
-               krcp->monitor_todo = false;
-       else
+       if (need_offload_krc(krcp))
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
@@ -3737,11 +3761,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
        WRITE_ONCE(krcp->count, krcp->count + 1);
 
        // Set timer to drain after KFREE_DRAIN_JIFFIES.
-       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
-           !krcp->monitor_todo) {
-               krcp->monitor_todo = true;
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
-       }
 
 unlock_return:
        krc_this_cpu_unlock(krcp, flags);
@@ -3816,14 +3837,8 @@ void __init kfree_rcu_scheduler_running(void)
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
                raw_spin_lock_irqsave(&krcp->lock, flags);
-               if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
-                               krcp->monitor_todo) {
-                       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-                       continue;
-               }
-               krcp->monitor_todo = true;
-               schedule_delayed_work_on(cpu, &krcp->monitor_work,
-                                        KFREE_DRAIN_JIFFIES);
+               if (need_offload_krc(krcp))
+                       schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
                raw_spin_unlock_irqrestore(&krcp->lock, flags);
        }
 }
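With monitor_todo gone, the code above relies purely on a self-rearming delayed work: kfree_rcu_monitor() re-queues itself only while need_offload_krc() still sees queued objects, so a separate is-the-work-pending flag is unnecessary. A sketch of that idiom with hypothetical names (drain_work, drain_fn(), pending_objects are illustrative, not kernel symbols):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct delayed_work drain_work;
static int pending_objects;	/* hypothetical count of queued objects */

static void drain_fn(struct work_struct *work __maybe_unused)
{
	/* ... detach and free one batch here (elided) ... */
	if (pending_objects > 0)
		pending_objects--;

	/* Re-arm only while work remains, mirroring need_offload_krc(). */
	if (pending_objects > 0)
		schedule_delayed_work(&drain_work, HZ / 20);
}

static void drain_init(void)
{
	INIT_DELAYED_WORK(&drain_work, drain_fn);
	if (pending_objects > 0)
		schedule_delayed_work(&drain_work, HZ / 20);
}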
@@ -4533,6 +4548,7 @@ void rcu_report_dead(unsigned int cpu)
        rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
        if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
                /* Report quiescent state -before- changing ->qsmaskinitnext! */
+               rcu_disable_urgency_upon_qs(rdp);
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
        }
@@ -4578,6 +4594,7 @@ void rcutree_migrate_callbacks(int cpu)
        needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
        rcu_segcblist_disable(&rdp->cblist);
        WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+       check_cb_ovld_locked(my_rdp, my_rnp);
        if (rcu_rdp_is_offloaded(my_rdp)) {
                raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
                __call_rcu_nocb_wake(my_rdp, true, flags);
index fb77dec..3cdc189 100644 (file)
@@ -239,6 +239,7 @@ struct rcu_data {
                                         * if rdp_gp.
                                         */
        struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+       struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
 
        /* The following fields are used by CB kthread, hence new cacheline. */
        struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -432,7 +433,7 @@ static void rcu_flavor_sched_clock_irq(int user);
 static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static bool rcu_is_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
 static void rcu_cpu_kthread_setup(unsigned int cpu);
 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
index 46694e1..a8f574d 100644 (file)
@@ -546,52 +546,51 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
        }
 }
 
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- *  rdp_offload_toggle()        nocb_gp_enabled_cb()
- * -------------------------   ----------------------------
- *    WRITE flags                 LOCK nocb_gp_lock
- *    LOCK nocb_gp_lock           READ/WRITE nocb_gp_sleep
- *    READ/WRITE nocb_gp_sleep    UNLOCK nocb_gp_lock
- *    UNLOCK nocb_gp_lock         READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
-       u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
-       return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
-                                                    bool *needwake_state)
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
+                              bool *wake_state)
 {
        struct rcu_segcblist *cblist = &rdp->cblist;
+       unsigned long flags;
+       int ret;
 
-       if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
-               if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
-                       rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
-                       if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
-                               *needwake_state = true;
-               }
-               return false;
+       rcu_nocb_lock_irqsave(rdp, flags);
+       if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+           !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+               /*
+                * Offloading. Set our flag and notify the offload worker.
+                * We will handle this rdp until it is eventually de-offloaded.
+                */
+               rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+               if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+                       *wake_state = true;
+               ret = 1;
+       } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+                  rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+               /*
+                * De-offloading. Clear our flag and notify the de-offload worker.
+                * We will ignore this rdp until it is eventually re-offloaded.
+                */
+               rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+               if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+                       *wake_state = true;
+               ret = 0;
+       } else {
+               WARN_ON_ONCE(1);
+               ret = -1;
        }
 
-       /*
-        * De-offloading. Clear our flag and notify the de-offload worker.
-        * We will ignore this rdp until it ever gets re-offloaded.
-        */
-       WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
-       rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
-       if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
-               *needwake_state = true;
-       return true;
+       rcu_nocb_unlock_irqrestore(rdp, flags);
+
+       return ret;
 }
 
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+       trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+       swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+                                       !READ_ONCE(my_rdp->nocb_gp_sleep));
+       trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
 
 /*
  * No-CBs GP kthreads come here to wait for additional callbacks to show up
@@ -609,7 +608,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
        bool needwait_gp = false; // This prevents actual uninitialized use.
        bool needwake;
        bool needwake_gp;
-       struct rcu_data *rdp;
+       struct rcu_data *rdp, *rdp_toggling = NULL;
        struct rcu_node *rnp;
        unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
        bool wasempty = false;
@@ -634,19 +633,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
         * is added to the list, so the skipped-over rcu_data structures
         * won't be ignored for long.
         */
-       list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) {
-               bool needwake_state = false;
-
-               if (!nocb_gp_enabled_cb(rdp))
-                       continue;
+       list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
                trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
                rcu_nocb_lock_irqsave(rdp, flags);
-               if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
-                       rcu_nocb_unlock_irqrestore(rdp, flags);
-                       if (needwake_state)
-                               swake_up_one(&rdp->nocb_state_wq);
-                       continue;
-               }
+               lockdep_assert_held(&rdp->nocb_lock);
                bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
                if (bypass_ncbs &&
                    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
@@ -656,8 +646,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
                        bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
                } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
                        rcu_nocb_unlock_irqrestore(rdp, flags);
-                       if (needwake_state)
-                               swake_up_one(&rdp->nocb_state_wq);
                        continue; /* No callbacks here, try next. */
                }
                if (bypass_ncbs) {
@@ -705,8 +693,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
                }
                if (needwake_gp)
                        rcu_gp_kthread_wake();
-               if (needwake_state)
-                       swake_up_one(&rdp->nocb_state_wq);
        }
 
        my_rdp->nocb_gp_bypass = bypass;
@@ -723,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
                /* Polling, so trace if first poll in the series. */
                if (gotcbs)
                        trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
-               schedule_timeout_idle(1);
+               if (list_empty(&my_rdp->nocb_head_rdp)) {
+                       raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+                       if (!my_rdp->nocb_toggling_rdp)
+                               WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+                       raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+                       /* Wait for any offloading rdp */
+                       nocb_gp_sleep(my_rdp, cpu);
+               } else {
+                       schedule_timeout_idle(1);
+               }
        } else if (!needwait_gp) {
                /* Wait for callbacks to appear. */
-               trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
-               swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
-                               !READ_ONCE(my_rdp->nocb_gp_sleep));
-               trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+               nocb_gp_sleep(my_rdp, cpu);
        } else {
                rnp = my_rdp->mynode;
                trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
@@ -739,15 +731,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
                        !READ_ONCE(my_rdp->nocb_gp_sleep));
                trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
        }
+
        if (!rcu_nocb_poll) {
                raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+               // (De-)queue an rdp to/from the group if its nocb state is changing
+               rdp_toggling = my_rdp->nocb_toggling_rdp;
+               if (rdp_toggling)
+                       my_rdp->nocb_toggling_rdp = NULL;
+
                if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
                        WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
                        del_timer(&my_rdp->nocb_timer);
                }
                WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
                raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+       } else {
+               rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+               if (rdp_toggling) {
+                       /*
+                        * Paranoid locking to make sure nocb_toggling_rdp is
+                        * reset *before* we (re)set SEGCBLIST_KTHREAD_GP, or we
+                        * could race with another round of nocb toggling for this
+                        * rdp.  Nocb locking should already prevent that, but we
+                        * stick to paranoia, especially in this rare path.
+                        */
+                       raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+                       my_rdp->nocb_toggling_rdp = NULL;
+                       raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+               }
+       }
+
+       if (rdp_toggling) {
+               bool wake_state = false;
+               int ret;
+
+               ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+               if (ret == 1)
+                       list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+               else if (ret == 0)
+                       list_del(&rdp_toggling->nocb_entry_rdp);
+               if (wake_state)
+                       swake_up_one(&rdp_toggling->nocb_state_wq);
        }
+
        my_rdp->nocb_gp_seq = -1;
        WARN_ON(signal_pending(current));
 }
@@ -966,16 +992,15 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
        swake_up_one(&rdp->nocb_cb_wq);
 
        raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+       // Queue this rdp to be added to or removed from the list that rcuog iterates over
+       WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
        if (rdp_gp->nocb_gp_sleep) {
                rdp_gp->nocb_gp_sleep = false;
                wake_gp = true;
        }
        raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
 
-       if (wake_gp)
-               wake_up_process(rdp_gp->nocb_gp_kthread);
-
-       return 0;
+       return wake_gp;
 }
 
 static long rcu_nocb_rdp_deoffload(void *arg)
@@ -983,9 +1008,15 @@ static long rcu_nocb_rdp_deoffload(void *arg)
        struct rcu_data *rdp = arg;
        struct rcu_segcblist *cblist = &rdp->cblist;
        unsigned long flags;
-       int ret;
+       int wake_gp;
+       struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
-       WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+       /*
+        * rcu_nocb_rdp_deoffload() may be called directly if the rcuog/o[p]
+        * kthread spawn failed, in which case rdp->cpu is not yet online.
+        */
+       WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
 
        pr_info("De-offloading %d\n", rdp->cpu);
 
@@ -1009,12 +1040,41 @@ static long rcu_nocb_rdp_deoffload(void *arg)
         */
        rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
        invoke_rcu_core();
-       ret = rdp_offload_toggle(rdp, false, flags);
-       swait_event_exclusive(rdp->nocb_state_wq,
-                             !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
-                                                       SEGCBLIST_KTHREAD_GP));
-       /* Stop nocb_gp_wait() from iterating over this structure. */
-       list_del_rcu(&rdp->nocb_entry_rdp);
+       wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+       mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+       if (rdp_gp->nocb_gp_kthread) {
+               if (wake_gp)
+                       wake_up_process(rdp_gp->nocb_gp_kthread);
+
+               /*
+                * If the rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB
+                * and just wait for SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+                */
+               if (!rdp->nocb_cb_kthread) {
+                       rcu_nocb_lock_irqsave(rdp, flags);
+                       rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+                       rcu_nocb_unlock_irqrestore(rdp, flags);
+               }
+
+               swait_event_exclusive(rdp->nocb_state_wq,
+                                       !rcu_segcblist_test_flags(cblist,
+                                         SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+       } else {
+               /*
+                * No kthread to clear the flags for us or to remove the rdp from the
+                * nocb list to iterate over. Do it here instead. Locking doesn't look
+                * strictly necessary, but we stick to paranoia in this rare path.
+                */
+               rcu_nocb_lock_irqsave(rdp, flags);
+               rcu_segcblist_clear_flags(&rdp->cblist,
+                               SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+               rcu_nocb_unlock_irqrestore(rdp, flags);
+
+               list_del(&rdp->nocb_entry_rdp);
+       }
+       mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
        /*
         * Lock one last time to acquire latest callback updates from kthreads
         * so we can later handle callbacks locally without locking.
@@ -1035,7 +1095,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
        WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 
 
-       return ret;
+       return 0;
 }
 
 int rcu_nocb_cpu_deoffload(int cpu)
@@ -1043,8 +1103,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
        int ret = 0;
 
-       mutex_lock(&rcu_state.barrier_mutex);
        cpus_read_lock();
+       mutex_lock(&rcu_state.barrier_mutex);
        if (rcu_rdp_is_offloaded(rdp)) {
                if (cpu_online(cpu)) {
                        ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
@@ -1055,8 +1115,8 @@ int rcu_nocb_cpu_deoffload(int cpu)
                        ret = -EINVAL;
                }
        }
-       cpus_read_unlock();
        mutex_unlock(&rcu_state.barrier_mutex);
+       cpus_read_unlock();
 
        return ret;
 }
@@ -1067,7 +1127,8 @@ static long rcu_nocb_rdp_offload(void *arg)
        struct rcu_data *rdp = arg;
        struct rcu_segcblist *cblist = &rdp->cblist;
        unsigned long flags;
-       int ret;
+       int wake_gp;
+       struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
 
        WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
        /*
@@ -1077,17 +1138,10 @@ static long rcu_nocb_rdp_offload(void *arg)
        if (!rdp->nocb_gp_rdp)
                return -EINVAL;
 
-       pr_info("Offloading %d\n", rdp->cpu);
+       if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+               return -EINVAL;
 
-       /*
-        * Cause future nocb_gp_wait() invocations to iterate over
-        * structure, resetting ->nocb_gp_sleep and waking up the related
-        * "rcuog".  Since nocb_gp_wait() in turn locks ->nocb_gp_lock
-        * before setting ->nocb_gp_sleep again, we are guaranteed to
-        * iterate this newly added structure before "rcuog" goes to
-        * sleep again.
-        */
-       list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp);
+       pr_info("Offloading %d\n", rdp->cpu);
 
        /*
         * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
@@ -1111,7 +1165,9 @@ static long rcu_nocb_rdp_offload(void *arg)
         *      WRITE flags               READ callbacks
         *      rcu_nocb_unlock()         rcu_nocb_unlock()
         */
-       ret = rdp_offload_toggle(rdp, true, flags);
+       wake_gp = rdp_offload_toggle(rdp, true, flags);
+       if (wake_gp)
+               wake_up_process(rdp_gp->nocb_gp_kthread);
        swait_event_exclusive(rdp->nocb_state_wq,
                              rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
                              rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
@@ -1124,7 +1180,7 @@ static long rcu_nocb_rdp_offload(void *arg)
        rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
        rcu_nocb_unlock_irqrestore(rdp, flags);
 
-       return ret;
+       return 0;
 }
 
 int rcu_nocb_cpu_offload(int cpu)
@@ -1132,8 +1188,8 @@ int rcu_nocb_cpu_offload(int cpu)
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
        int ret = 0;
 
-       mutex_lock(&rcu_state.barrier_mutex);
        cpus_read_lock();
+       mutex_lock(&rcu_state.barrier_mutex);
        if (!rcu_rdp_is_offloaded(rdp)) {
                if (cpu_online(cpu)) {
                        ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
@@ -1144,8 +1200,8 @@ int rcu_nocb_cpu_offload(int cpu)
                        ret = -EINVAL;
                }
        }
-       cpus_read_unlock();
        mutex_unlock(&rcu_state.barrier_mutex);
+       cpus_read_unlock();
 
        return ret;
 }
@@ -1155,11 +1211,21 @@ void __init rcu_init_nohz(void)
 {
        int cpu;
        bool need_rcu_nocb_mask = false;
+       bool offload_all = false;
        struct rcu_data *rdp;
 
+#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL)
+       if (!rcu_state.nocb_is_setup) {
+               need_rcu_nocb_mask = true;
+               offload_all = true;
+       }
+#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */
+
 #if defined(CONFIG_NO_HZ_FULL)
-       if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+       if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) {
                need_rcu_nocb_mask = true;
+               offload_all = false; /* NO_HZ_FULL has its own mask. */
+       }
 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
 
        if (need_rcu_nocb_mask) {
@@ -1180,6 +1246,9 @@ void __init rcu_init_nohz(void)
                cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
 
+       if (offload_all)
+               cpumask_setall(rcu_nocb_mask);
+
        if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
                pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
                cpumask_and(rcu_nocb_mask, cpu_possible_mask,
@@ -1246,7 +1315,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
                                "rcuog/%d", rdp_gp->cpu);
                if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
                        mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
-                       return;
+                       goto end;
                }
                WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
                if (kthread_prio)
@@ -1258,12 +1327,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
        t = kthread_run(rcu_nocb_cb_kthread, rdp,
                        "rcuo%c/%d", rcu_state.abbr, cpu);
        if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
-               return;
+               goto end;
 
-       if (kthread_prio)
+       if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
                sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
        WRITE_ONCE(rdp->nocb_cb_kthread, t);
        WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+       return;
+end:
+       mutex_lock(&rcu_state.barrier_mutex);
+       if (rcu_rdp_is_offloaded(rdp)) {
+               rcu_nocb_rdp_deoffload(rdp);
+               cpumask_clear_cpu(cpu, rcu_nocb_mask);
+       }
+       mutex_unlock(&rcu_state.barrier_mutex);
 }
 
 /* How many CB CPU IDs per GP kthread?  Default of -1 for sqrt(nr_cpu_ids). */
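For context on the (de-)offload path reworked above: runtime toggling goes through rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload(), both shown earlier in this file's hunks, and rcutorture exercises them in much the same way. A hedged sketch of a caller follows; the helper itself is hypothetical, and the declarations are assumed to come from linux/rcupdate.h.

#include <linux/rcupdate.h>
#include <linux/printk.h>

/*
 * Hypothetical helper: flip one CPU between offloaded and non-offloaded
 * callback handling.  Either call returns 0 on success or -EINVAL, for
 * example when the CPU is offline, as in the hunks above.
 */
static int toggle_nocb(int cpu, bool offload)
{
	int ret = offload ? rcu_nocb_cpu_offload(cpu)
			  : rcu_nocb_cpu_deoffload(cpu);

	if (ret)
		pr_info("CPU %d nocb %s failed: %d\n",
			cpu, offload ? "offload" : "deoffload", ret);
	return ret;
}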
index c8ba0fe..7ae1551 100644 (file)
@@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  * be quite short, for example, in the case of the call from
  * rcu_read_unlock_special().
  */
-static void
+static notrace void
 rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
 {
        bool empty_exp;
@@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
  * is disabled.  This function cannot be expected to understand these
  * nuances, so the caller must handle them.
  */
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
        return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
                READ_ONCE(t->rcu_read_unlock_special.s)) &&
@@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
  * evaluate safety in terms of interrupt, softirq, and preemption
  * disabling.
  */
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+static notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 {
        unsigned long flags;
 
@@ -899,8 +899,8 @@ void rcu_note_context_switch(bool preempt)
        this_cpu_write(rcu_data.rcu_urgent_qs, false);
        if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
                rcu_momentary_dyntick_idle();
-       rcu_tasks_qs(current, preempt);
 out:
+       rcu_tasks_qs(current, preempt);
        trace_rcu_utilization(TPS("End context switch"));
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  * Because there is no preemptible RCU, there can be no deferred quiescent
  * states.
  */
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
        return false;
 }
@@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 // period for a quiescent state from this CPU.  Note that requests from
 // tasks are handled when removing the task from the blocked-tasks list
 // below.
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+static notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 {
        struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
@@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
        WRITE_ONCE(rdp->rcuc_activity, jiffies);
 }
 
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+       return rdp->nocb_cb_kthread == current;
+#else
+       return false;
+#endif
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
+{
+       return rdp->rcu_cpu_kthread_task == current ||
+                       rcu_is_callbacks_nocb_kthread(rdp);
+}
+
 #ifdef CONFIG_RCU_BOOST
 
 /*
@@ -1140,7 +1159,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
            (rnp->gp_tasks != NULL &&
             rnp->boost_tasks == NULL &&
             rnp->qsmask == 0 &&
-            (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
+            (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+             IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
                if (rnp->exp_tasks == NULL)
                        WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
                raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1151,15 +1171,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        }
 }
 
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
-       return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
-}
-
 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
 
 /*
@@ -1242,11 +1253,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
 
-static bool rcu_is_callbacks_kthread(void)
-{
-       return false;
-}
-
 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
 {
 }
index da0bf6f..9568019 100644 (file)
@@ -4264,6 +4264,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
 }
 
 /**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU.  If the same task is running on that CPU throughout,
+ * the return value will be a pointer to that task's task_struct structure.
+ * If the CPU did any context switches even vaguely concurrently with the
+ * execution of this function, the return value will be a pointer to the
+ * task_struct structure of a randomly chosen task that was running on
+ * that CPU somewhere around the time that this function was executing.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee.  Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+       struct task_struct *t;
+
+       smp_mb(); /* Pairing determined by caller's synchronization design. */
+       t = rcu_dereference(cpu_curr(cpu));
+       smp_mb(); /* Pairing determined by caller's synchronization design. */
+       return t;
+}
+
+/**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
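A sketch of the calling pattern that the cpu_curr_snapshot() kerneldoc above implies, modeled on its use in rcu_tasks_trace_pregp_step() earlier in this series: keep the CPU online across the call and hold an RCU read-side critical section so the snapshotted task_struct cannot be freed while it is inspected. The scan function itself is hypothetical.

#include <linux/cpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical scan: report the task each online CPU appears to be running. */
static void scan_running_tasks(void)
{
	int cpu;
	struct task_struct *t;

	cpus_read_lock();		/* keep the scanned CPUs online */
	for_each_online_cpu(cpu) {
		rcu_read_lock();	/* keep the snapshotted task_struct around */
		t = cpu_curr_snapshot(cpu);
		pr_info("CPU %d appears to be running %s/%d\n",
			cpu, t->comm, t->pid);
		rcu_read_unlock();
	}
	cpus_read_unlock();
}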
index dd215f4..650810a 100644 (file)
@@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str)
        if (val)
                static_branch_enable(&csdlock_debug_enabled);
 
-       return 0;
+       return 1;
 }
-early_param("csdlock_debug", csdlock_debug);
+__setup("csdlock_debug=", csdlock_debug);
 
 static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
 static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
index f17000a..ed0ec7f 100755 (executable)
@@ -35,7 +35,7 @@ then
        exit 1
 fi
 
-# Remember where we started so that we can get back and the end.
+# Remember where we started so that we can get back at the end.
 curcommit="`git status | head -1 | awk '{ print $NF }'`"
 
 nfail=0
@@ -73,15 +73,10 @@ do
                # Test the specified commit.
                git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1
                echo git checkout return code: $? "(Commit $ntry: $i)"
-               kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1
+               kvm.sh --allcpus --duration 3 --trust-make --datestamp "$ds/$idir" > $resdir/$ds/$idir/kvm.sh.out 2>&1
                ret=$?
                echo kvm.sh return code $ret for commit $i from branch $gitbr
-
-               # Move the build products to their resting place.
-               runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`"
-               mv $runresdir $resdir/$ds/$idir
-               rrd="`echo $runresdir | sed -e 's,^.*/,,'`"
-               echo Run results: $resdir/$ds/$idir/$rrd
+               echo Run results: $resdir/$ds/$idir
                if test "$ret" -ne 0
                then
                        # Failure, so leave all evidence intact.
index 0ff59bd..9f0a5d5 100755 (executable)
@@ -262,6 +262,7 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log"
 # Wait for all remaining scenarios to complete and collect results.
 for i in $systems
 do
+       echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log"
        while checkremotefile "$i" "$resdir/$ds/remote.run"
        do
                sleep 30
index 263e16a..6c73481 100755 (executable)
@@ -164,7 +164,7 @@ do
                shift
                ;;
        --gdb)
-               TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG
+               TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG
                TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG
                TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG
                ;;
@@ -180,7 +180,7 @@ do
                shift
                ;;
        --kasan)
-               TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
+               TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
                if test -n "$torture_qemu_mem_default"
                then
                        TORTURE_QEMU_MEM=2G
@@ -192,7 +192,7 @@ do
                shift
                ;;
        --kcsan)
-               TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
+               TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG
                ;;
        --kmake-arg|--kmake-args)
                checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'