powerpc/eeh: Add a debugfs interface to check if a driver supports recovery
author Oliver O'Halloran <oohall@gmail.com>
Tue, 3 Nov 2020 05:15:12 +0000 (16:15 +1100)
committer Michael Ellerman <mpe@ellerman.id.au>
Sun, 31 Jan 2021 11:35:47 +0000 (22:35 +1100)
If a PCI device's current driver implements the error handling callbacks
EEH can use them to recover the device after an error occurs. For devices
without the error handling callbacks we recover them by removing the device
and re-scanning it so the PCI core puts the device back into a known good
state.

Currently there's no way for userspace to determine if the driver supports
recovery or not which makes it difficult to write automated tests for EEH.
This patch addresses that by adding a debugfs interface for querying if
a specific device can be recovered or not.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20201103051512.919333-2-oohall@gmail.com
arch/powerpc/kernel/eeh.c

index f9182ff..cd60bc1 100644 (file)
@@ -1868,6 +1868,53 @@ static const struct file_operations eeh_dev_break_fops = {
        .read   = eeh_debugfs_dev_usage,
 };
 
+/*
+ * debugfs write handler for "eeh_dev_can_recover".
+ *
+ * Looks up a PCI device by handing the user buffer to
+ * eeh_debug_lookup_pdev() and reports whether the driver currently
+ * associated with that device provides the error handling callbacks
+ * required for recovery.
+ *
+ * Returns count when the driver can recover the device, -EOPNOTSUPP when
+ * it cannot (including when pci_dev_driver() returns NULL), or the error
+ * encoded in the pointer returned by eeh_debug_lookup_pdev() when the
+ * lookup fails.
+ */
+static ssize_t eeh_dev_can_recover(struct file *filp,
+                                  const char __user *user_buf,
+                                  size_t count, loff_t *ppos)
+{
+       struct pci_driver *drv;
+       struct pci_dev *pdev;
+       /* Signed: holds either the byte count or a negative errno. */
+       ssize_t ret;
+
+       pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+       if (IS_ERR(pdev))
+               return PTR_ERR(pdev);
+
+       /*
+        * In order for error recovery to work the driver needs to implement
+        * .error_detected(), so it can quiesce IO to the device, and
+        * .slot_reset() so it can re-initialise the device after a reset.
+        *
+        * Ideally they'd implement .resume() too, but some drivers which
+        * we need to support (notably IPR) don't so I guess we can tolerate
+        * that.
+        *
+        * .mmio_enabled() is mostly there as a work-around for devices which
+        * take forever to re-init after a hot reset. Implementing that is
+        * strictly optional.
+        */
+       drv = pci_dev_driver(pdev);
+       if (drv &&
+           drv->err_handler &&
+           drv->err_handler->error_detected &&
+           drv->err_handler->slot_reset) {
+               ret = count;
+       } else {
+               ret = -EOPNOTSUPP;
+       }
+
+       /*
+        * Drop the device reference (presumably taken by
+        * eeh_debug_lookup_pdev() -- confirm against that helper).
+        */
+       pci_dev_put(pdev);
+
+       return ret;
+}
+
+/*
+ * File operations for the "eeh_dev_can_recover" debugfs entry: writes are
+ * handled by eeh_dev_can_recover(), reads by eeh_debugfs_dev_usage().
+ */
+static const struct file_operations eeh_dev_can_recover_fops = {
+       .open   = simple_open,
+       .read   = eeh_debugfs_dev_usage,
+       .write  = eeh_dev_can_recover,
+       .llseek = no_llseek,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1892,6 +1939,9 @@ static int __init eeh_init_proc(void)
                debugfs_create_file_unsafe("eeh_force_recover", 0600,
                                powerpc_debugfs_root, NULL,
                                &eeh_force_recover_fops);
+               debugfs_create_file_unsafe("eeh_dev_can_recover", 0600,
+                               powerpc_debugfs_root, NULL,
+                               &eeh_dev_can_recover_fops);
                eeh_cache_debugfs_init();
 #endif
        }