smp,csd: Throw an error if a CSD lock is stuck for too long

author Rik van Riel <riel@surriel.com>

Mon, 21 Aug 2023 20:04:09 +0000 (16:04 -0400)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 28 Nov 2023 17:19:36 +0000 (17:19 +0000)
author Rik van Riel <riel@surriel.com>
Mon, 21 Aug 2023 20:04:09 +0000 (16:04 -0400)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 28 Nov 2023 17:19:36 +0000 (17:19 +0000)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt

index 0a1731a0f0ef373421c0594886c065461ade1064..41644336e358727b5b9b184761e1d11b1332fcca 100644 (file)
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5858,6 +5858,13 @@
                         This feature may be more efficiently disabled
                         using the csdlock_debug- kernel parameter.
  
+       smp.panic_on_ipistall= [KNL]
+                       If a CSD-lock acquisition takes longer than
+                       the specified number of milliseconds, panic the
+                       system.  By default, let CSD-lock acquisitions
+                       take as long as they take.  Specifying 300000
+                       for this value provides a 5-minute timeout.
+
         smsc-ircc2.nopnp        [HW] Don't use PNP to discover SMC devices
         smsc-ircc2.ircc_cfg=    [HW] Device configuration I/O port
         smsc-ircc2.ircc_sir=    [HW] SIR base I/O port
diff --git a/kernel/smp.c b/kernel/smp.c

index 8455a53465af8c0fb3aeabd8e44e7b72ad647c9b..695eb13a276d26302f5d413740fed2592502200e 100644 (file)
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -170,6 +170,8 @@ static DEFINE_PER_CPU(void *, cur_csd_info);
  
  static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
  module_param(csd_lock_timeout, ulong, 0444);
+static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
+module_param(panic_on_ipistall, int, 0444);
  
  static atomic_t csd_bug_count = ATOMIC_INIT(0);
  
@@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
         }
  
         ts2 = sched_clock();
+       /* How long since we last checked for a stuck CSD lock. */
         ts_delta = ts2 - *ts1;
         if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
                 return false;
@@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
         else
                 cpux = cpu;
         cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
+       /* How long this CSD lock has been stuck. */
+       ts_delta = ts2 - ts0;
         pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
-                firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
+                firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
                  cpu, csd->func, csd->info);
+       /*
+        * If the CSD lock is still stuck after panic_on_ipistall ms, it is
+        * unlikely to become unstuck. Use a signed comparison to avoid
+        * triggering on underflows when the TSC is out of sync between sockets.
+        */
+       BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
         if (cpu_cur_csd && csd != cpu_cur_csd) {
                 pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
                          *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
author	Rik van Riel <riel@surriel.com>
	Mon, 21 Aug 2023 20:04:09 +0000 (16:04 -0400)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 28 Nov 2023 17:19:36 +0000 (17:19 +0000)
Documentation/admin-guide/kernel-parameters.txt		patch \| blob \| history
kernel/smp.c		patch \| blob \| history