x86/nmi: Print reasons why backtrace NMIs are ignored
authorPaul E. McKenney <paulmck@kernel.org>
Sat, 17 Dec 2022 00:52:01 +0000 (16:52 -0800)
committerPaul E. McKenney <paulmck@kernel.org>
Thu, 19 Jan 2023 23:55:12 +0000 (15:55 -0800)
Instrument nmi_trigger_cpumask_backtrace() to dump out diagnostics based
on evidence accumulated by exc_nmi().  These diagnostics are dumped for
CPUs that ignored an NMI backtrace request for more than 10 seconds.

[ paulmck: Apply Ingo Molnar feedback. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <x86@kernel.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
arch/x86/kernel/nmi.c
include/linux/nmi.h
lib/nmi_backtrace.c

index 701ff27..c315b18 100644 (file)
@@ -570,6 +570,79 @@ DEFINE_IDTENTRY_RAW(exc_nmi_noist)
 EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
 #endif
 
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/*                                                                     */
+/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler.          */
+/* | +------ cpu_is_offline(cpu)                                       */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls):  */
+/* | | |       NMI handler has been invoked.                           */
+/* | | |                                                               */
+/* V V V                                                               */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+       int cpu;
+       struct nmi_stats *nsp;
+
+       for_each_cpu(cpu, btp) {
+               nsp = per_cpu_ptr(&nmi_stats, cpu);
+               nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+               nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+               nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+               nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+       }
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+       int cpu;
+       int idx;
+       unsigned long nmi_seq;
+       unsigned long j = jiffies;
+       char *modp;
+       char *msgp;
+       char *msghp;
+       struct nmi_stats *nsp;
+
+       for_each_cpu(cpu, btp) {
+               nsp = per_cpu_ptr(&nmi_stats, cpu);
+               modp = "";
+               msghp = "";
+               nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+               if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+                       msgp = "CPU entered NMI handler function, but has not exited";
+               } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
+                       msgp = "CPU is handling NMIs";
+               } else {
+                       idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+                             (cpu_is_offline(cpu) << 1) |
+                             (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+                       msgp = nmi_check_stall_msg[idx];
+                       if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+                               modp = ", but OK because ignore_nmis was set";
+                       if (nmi_seq & ~0x1)
+                               msghp = " (CPU currently in NMI handler function)";
+                       else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+                               msghp = " (CPU exited one NMI handler function)";
+               }
+               pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
+                        __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+       }
+}
+
+#endif
+
 void stop_nmi(void)
 {
        ignore_nmis++;
index f700ff2..048c0b9 100644 (file)
@@ -214,4 +214,12 @@ int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *);
 #include <asm/nmi.h>
 #endif
 
+#ifdef CONFIG_NMI_CHECK_CPU
+void nmi_backtrace_stall_snap(const struct cpumask *btp);
+void nmi_backtrace_stall_check(const struct cpumask *btp);
+#else
+static inline void nmi_backtrace_stall_snap(const struct cpumask *btp) {}
+static inline void nmi_backtrace_stall_check(const struct cpumask *btp) {}
+#endif
+
 #endif
index d01aec6..5274bbb 100644 (file)
@@ -64,6 +64,7 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
        if (!cpumask_empty(to_cpumask(backtrace_mask))) {
                pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n",
                        this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask));
+               nmi_backtrace_stall_snap(to_cpumask(backtrace_mask));
                raise(to_cpumask(backtrace_mask));
        }
 
@@ -74,6 +75,7 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
                mdelay(1);
                touch_softlockup_watchdog();
        }
+       nmi_backtrace_stall_check(to_cpumask(backtrace_mask));
 
        /*
         * Force flush any remote buffers that might be stuck in IRQ context