intel_l3_parity: Support a daemonic mode
author Ben Widawsky <benjamin.widawsky@intel.com>
Tue, 10 Sep 2013 21:21:23 +0000 (14:21 -0700)
committer Ben Widawsky <benjamin.widawsky@intel.com>
Fri, 20 Sep 2013 16:42:07 +0000 (09:42 -0700)
v2: Add a comment explaining the dangers of directly accessing the DFT
register (Daniel)

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
tools/Makefile.am
tools/intel_l3_parity.c
tools/intel_l3_parity.h [new file with mode: 0644]
tools/intel_l3_udev_listener.c [new file with mode: 0644]

index 47bd5b3..19810cf 100644 (file)
@@ -39,7 +39,7 @@ dist_bin_SCRIPTS = intel_gpu_abrt
 
 AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/lib
 AM_CFLAGS = $(DRM_CFLAGS) $(PCIACCESS_CFLAGS) $(CWARNFLAGS) $(CAIRO_CFLAGS)
-LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS)
+LDADD = $(top_builddir)/lib/libintel_tools.la $(DRM_LIBS) $(PCIACCESS_LIBS) $(CAIRO_LIBS) $(LIBUDEV_LIBS)
 
 intel_dump_decode_SOURCES =    \
        intel_dump_decode.c
@@ -50,3 +50,7 @@ intel_error_decode_SOURCES =  \
 intel_bios_reader_SOURCES =    \
        intel_bios_reader.c     \
        intel_bios.h
+
+intel_l3_parity_SOURCES =      \
+       intel_l3_parity.c       \
+       intel_l3_udev_listener.c
index d2ad3c9..ead8fb5 100644 (file)
 #include "intel_chipset.h"
 #include "intel_gpu_tools.h"
 #include "drmtest.h"
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#if HAVE_UDEV
+#include <libudev.h>
+#include <syslog.h>
+#endif
+#include "intel_l3_parity.h"
 
 static unsigned int devid;
 /* L3 size is always a function of banks. The number of banks cannot be
@@ -157,7 +165,8 @@ static void usage(const char *name)
                "  -r, --row=[row]                      The row to act upon (default 0)\n"
                "  -b, --bank=[bank]                    The bank to act upon (default 0)\n"
                "  -s, --subbank=[subbank]              The subbank to act upon (default 0)\n"
-               "  -w, --slice=[slice]                  Which slice to act on (default: -1 [all])"
+               "  -w, --slice=[slice]                  Which slice to act on (default: -1 [all])\n"
+               "    , --daemon                         Run the listener (-L) as a daemon\n"
                " ACTIONS (only 1 may be specified at a time):\n"
                "  -h, --help                           Display this help\n"
                "  -H, --hw-info                                Display the current L3 properties\n"
@@ -166,7 +175,8 @@ static void usage(const char *name)
                "  -e, --enable                         Enable row, bank, subbank (undo -d)\n"
                "  -d, --disable=<row,bank,subbank>     Disable row, bank, subbank (inline arguments are deprecated. Please use -r, -b, -s instead\n"
                "  -i, --inject                         [HSW only] Cause hardware to inject a row errors\n"
-               "  -u, --uninject                       [HSW only] Turn off hardware error injectection (undo -i)\n",
+               "  -u, --uninject                       [HSW only] Turn off hardware error injectection (undo -i)\n"
+               "  -L, --listen                         Listen for uevent errors\n",
                name);
 }
 
@@ -179,6 +189,7 @@ int main(int argc, char *argv[])
        int fd[REAL_MAX_SLICES] = {0}, ret, i;
        int action = '0';
        int drm_fd = drm_open_any();
+       int daemonize = 0;
        devid = intel_get_drm_devid(drm_fd);
 
        if (intel_gen(devid) < 7 || IS_VALLEYVIEW(devid))
@@ -202,11 +213,18 @@ int main(int argc, char *argv[])
                assert(lseek(fd[i], 0, SEEK_SET) == 0);
        }
 
+       /* NB: It is potentially unsafe to read this register if the kernel is
+        * actively using this register range, or we're running multiple
+        * instances of this tool. Since neither of those cases should occur
+        * (and the tool should be root only) we can safely ignore this for
+        * now. Just be aware of this if for some reason a hang is reported
+        * when using this tool.
+        */
        dft = intel_register_read(0xb038);
 
        while (1) {
                int c, option_index = 0;
-               static struct option long_options[] = {
+               struct option long_options[] = {
                        { "help", no_argument, 0, 'h' },
                        { "list", no_argument, 0, 'l' },
                        { "clear-all", no_argument, 0, 'a' },
@@ -215,18 +233,23 @@ int main(int argc, char *argv[])
                        { "inject", no_argument, 0, 'i' },
                        { "uninject", no_argument, 0, 'u' },
                        { "hw-info", no_argument, 0, 'H' },
+                       { "listen", no_argument, 0, 'L' },
                        { "row", required_argument, 0, 'r' },
                        { "bank", required_argument, 0, 'b' },
                        { "subbank", required_argument, 0, 's' },
                        { "slice", required_argument, 0, 'w' },
+                       { "daemon", no_argument, &daemonize, 1 },
                        {0, 0, 0, 0}
                };
 
-               c = getopt_long(argc, argv, "hHr:b:s:w:aled::iu", long_options,
+               c = getopt_long(argc, argv, "hHr:b:s:w:aled::iuL", long_options,
                                &option_index);
                if (c == -1)
                        break;
 
+               if (c == 0)
+                       continue;
+
                switch (c) {
                        case '?':
                        case 'h':
@@ -274,6 +297,7 @@ int main(int argc, char *argv[])
                        case 'a':
                        case 'l':
                        case 'e':
+                       case 'L':
                                if (action != '0') {
                                        fprintf(stderr, "Only one action may be specified\n");
                                        exit(EXIT_FAILURE);
@@ -299,6 +323,20 @@ int main(int argc, char *argv[])
                        printf("warning: overwriting existing injections. This is very dangerous.\n");
        }
 
+       /* Daemon doesn't work like the other commands */
+       if (action == 'L') {
+               struct l3_parity par;
+               struct l3_location loc;
+               if (daemonize) {
+                       assert(daemon(0, 0) == 0);
+                       openlog(argv[0], LOG_CONS | LOG_PID, LOG_USER);
+               }
+               memset(&par, 0, sizeof(par));
+               assert(l3_uevent_setup(&par) == 0);
+               assert(l3_listen(&par, daemonize == 1, &loc) == 0);
+               exit(EXIT_SUCCESS);
+       }
+
        if (action == 'l')
                decode_dft(dft);
 
diff --git a/tools/intel_l3_parity.h b/tools/intel_l3_parity.h
new file mode 100644 (file)
index 0000000..65697c4
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef INTEL_L3_PARITY_H_
+#define INTEL_L3_PARITY_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/select.h>
+
+/* State needed to listen for L3 parity uevents through libudev. */
+struct l3_parity {
+       struct udev *udev;
+       struct udev_monitor *uevent_monitor;
+       int fd;         /* descriptor obtained from the udev monitor */
+       fd_set fdset;   /* descriptor set holding fd, handed to select() */
+};
+
+/* Location of an L3 parity error as reported by the uevent. */
+struct l3_location {
+       uint8_t slice;
+       uint16_t row;
+       uint8_t bank;
+       uint8_t subbank;
+};
+
+#if HAVE_UDEV
+/* Creates the udev monitor and fills in par. Returns 0 on success, negative on failure. */
+int l3_uevent_setup(struct l3_parity *par);
+/* Listens (blocks) for an l3 parity event. Returns the location of the error. */
+int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc);
+#define l3_uevent_teardown(par) {}
+#else
+/* Without udev support both entry points simply fail. The parameter lists
+ * mirror the prototypes above so that callers build either way. */
+#define l3_uevent_setup(par) (-1)
+#define l3_listen(par, daemon, loc) (-1)
+#endif
+
+#endif
diff --git a/tools/intel_l3_udev_listener.c b/tools/intel_l3_udev_listener.c
new file mode 100644 (file)
index 0000000..c50820c
--- /dev/null
@@ -0,0 +1,143 @@
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#if HAVE_UDEV
+/* _GNU_SOURCE (needed for asprintf) must be defined before the first
+ * system header is included, otherwise it has no effect. */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <libudev.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <syslog.h>
+#include <sys/select.h>
+#include "i915_drm.h"
+#include "intel_l3_parity.h"
+
+#ifndef I915_L3_PARITY_UEVENT
+#define I915_L3_PARITY_UEVENT "L3_PARITY_ERROR"
+#endif
+
+/* Creates a udev monitor for "drm"/"drm_minor" uevents and stores it,
+ * together with its descriptor, in par.
+ * Returns 0 on success and a negative value on failure, in which case
+ * nothing is left allocated and par is not modified. */
+int l3_uevent_setup(struct l3_parity *par)
+{
+       struct udev *udev;
+       struct udev_monitor *uevent_monitor;
+       int fd, ret = -1;
+
+       udev = udev_new();
+       if (!udev)
+               return -1;
+
+       uevent_monitor = udev_monitor_new_from_netlink(udev, "udev");
+       if (!uevent_monitor)
+               goto err_udev;
+
+       ret = udev_monitor_filter_add_match_subsystem_devtype(uevent_monitor, "drm", "drm_minor");
+       if (ret < 0)
+               goto err_monitor;
+
+       ret = udev_monitor_enable_receiving(uevent_monitor);
+       if (ret < 0)
+               goto err_monitor;
+
+       fd = udev_monitor_get_fd(uevent_monitor);
+       if (fd < 0) {
+               ret = fd;
+               goto err_monitor;
+       }
+
+       par->udev = udev;
+       par->fd = fd;
+       par->uevent_monitor = uevent_monitor;
+       FD_ZERO(&par->fdset);
+       FD_SET(fd, &par->fdset);
+       return 0;
+
+err_monitor:
+       udev_monitor_unref(uevent_monitor);
+err_udev:
+       udev_unref(udev);
+       return ret;
+}
+
+/* Returns the named uevent property as an integer, or 0 when the event
+ * does not carry that property. */
+static int l3_property_to_int(struct udev_device *dev, const char *name)
+{
+       const char *value = udev_device_get_property_value(dev, name);
+
+       return value ? atoi(value) : 0;
+}
+
+/* Blocks until an L3 parity uevent arrives, stores its location in loc and
+ * reports it: on stderr when daemon is false (then returns 0), via syslog
+ * when daemon is true (then keeps listening).
+ * Returns select()'s result if it is <= 0, and -1 if the device cannot be
+ * received or the message cannot be formatted. */
+int l3_listen(struct l3_parity *par, bool daemon, struct l3_location *loc)
+{
+       struct udev_device *udev_dev;
+       const char *parity_status;
+       char *err_msg;
+       int ret;
+
+       for (;;) {
+               /* select() overwrites the set, so re-arm it on every pass. */
+               FD_ZERO(&par->fdset);
+               FD_SET(par->fd, &par->fdset);
+
+               ret = select(par->fd + 1, &par->fdset, NULL, NULL, NULL);
+               /* Number of bits set is returned, must be >= 1 */
+               if (ret <= 0)
+                       return ret;
+
+               assert(FD_ISSET(par->fd, &par->fdset));
+
+               udev_dev = udev_monitor_receive_device(par->uevent_monitor);
+               if (!udev_dev)
+                       return -1;
+
+               /* Events lacking the parity property, or whose value does not
+                * start with "1", are skipped; drop the device reference first. */
+               parity_status = udev_device_get_property_value(udev_dev, I915_L3_PARITY_UEVENT);
+               if (!parity_status || strncmp(parity_status, "1", 1)) {
+                       udev_device_unref(udev_dev);
+                       continue;
+               }
+
+               loc->slice = l3_property_to_int(udev_dev, "SLICE");
+               loc->row = l3_property_to_int(udev_dev, "ROW");
+               loc->bank = l3_property_to_int(udev_dev, "BANK");
+               loc->subbank = l3_property_to_int(udev_dev, "SUBBANK");
+
+               udev_device_unref(udev_dev);
+
+               ret = asprintf(&err_msg, "Parity error detected on: %d,%d,%d,%d. "
+                               "Try to run intel_l3_parity -r %d -b %d -s %d -w %d -d",
+                               loc->slice, loc->row, loc->bank, loc->subbank,
+                               loc->row, loc->bank, loc->subbank, loc->slice);
+               /* err_msg is undefined when asprintf fails. */
+               if (ret < 0)
+                       return -1;
+
+               if (daemon)
+                       syslog(LOG_INFO, "%s\n", err_msg);
+               else
+                       fprintf(stderr, "%s\n", err_msg);
+               free(err_msg);
+
+               /* Only the daemon keeps listening after a report. */
+               if (!daemon)
+                       return 0;
+       }
+}
+#endif