first stab at S.M.A.R.T. support
author David Zeuthen <davidz@redhat.com>
Sun, 30 Mar 2008 06:43:38 +0000 (02:43 -0400)
committer David Zeuthen <davidz@redhat.com>
Sun, 30 Mar 2008 06:43:38 +0000 (02:43 -0400)
Yeah, yuck, screenscraping the output of smartctl(8) sucks. But there
is no library. Have to think about how to export more data such as the
attributes. Also need to make this work with SCSI drives, as the
smartctl(8) output depends on whether the drive is SCSI or ATA (!).

http://people.freedesktop.org/~david/gdu-smart-and-healthy.png
http://people.freedesktop.org/~david/gdu-smart-and-failing.png

src/Makefile.am
src/devkit-disks-device.c
src/devkit-disks-device.h
src/job-smart-selftest.c [new file with mode: 0644]
src/org.freedesktop.DeviceKit.Disks.Device.xml

index c8fdd6b..3d61d51 100644 (file)
@@ -69,7 +69,8 @@ libexec_PROGRAMS += devkit-disks-helper-erase                   \
                    devkit-disks-helper-create-partition        \
                    devkit-disks-helper-modify-partition        \
                    devkit-disks-helper-create-partition-table  \
-                   devkit-disks-helper-change-filesystem-label
+                   devkit-disks-helper-change-filesystem-label \
+                   devkit-disks-helper-smart-selftest
 
 libexec_SCRIPTS = devkit-disks-helper-change-luks-password
 
@@ -101,6 +102,10 @@ devkit_disks_helper_change_filesystem_label_SOURCES = job-shared.h job-change-fi
 devkit_disks_helper_change_filesystem_label_CPPFLAGS = $(AM_CPPFLAGS)
 devkit_disks_helper_change_filesystem_label_LDADD = $(GLIB_LIBS)
 
+devkit_disks_helper_smart_selftest_SOURCES = job-shared.h job-smart-selftest.c
+devkit_disks_helper_smart_selftest_CPPFLAGS = $(AM_CPPFLAGS)
+devkit_disks_helper_smart_selftest_LDADD = $(GLIB_LIBS)
+
 # TODO: move to udev
 udevhelperdir = /lib/udev
 udevhelper_PROGRAMS = part_id
index 08fd61b..c7cc1b6 100644 (file)
@@ -230,6 +230,9 @@ devkit_disks_device_error_get_type (void)
                         ENUM_ENTRY (DEVKIT_DISKS_DEVICE_ERROR_CRYPTO_ALREADY_UNLOCKED, "CryptoAlreadyUnlocked"),
                         ENUM_ENTRY (DEVKIT_DISKS_DEVICE_ERROR_CRYPTO_NOT_UNLOCKED, "CryptoNotUnlocked"),
                         ENUM_ENTRY (DEVKIT_DISKS_DEVICE_ERROR_IS_BUSY, "IsBusy"),
+                        ENUM_ENTRY (DEVKIT_DISKS_DEVICE_ERROR_NOT_DRIVE, "NotDrive"),
+                        ENUM_ENTRY (DEVKIT_DISKS_DEVICE_ERROR_NOT_SMART_CAPABLE, "NotSmartCapable"),
+
                         { 0, 0, 0 }
                 };
                 g_assert (DEVKIT_DISKS_DEVICE_NUM_ERRORS == G_N_ELEMENTS (values) - 1);
@@ -4743,6 +4746,297 @@ out:
 
 /*--------------------------------------------------------------------------------------------------------------*/
 
+static void
+retrieve_smart_data_completed_cb (DBusGMethodInvocation *context,
+                                  DevkitDisksDevice *device,
+                                  PolKitCaller *pk_caller,
+                                  gboolean job_was_cancelled,
+                                  int status,
+                                  const char *stderr,
+                                  const char *stdout,
+                                  gpointer user_data)
+{
+        int rc;
+        gboolean passed;
+        int n;
+        char **lines;
+        gboolean in_attributes;
+        int power_on_hours;
+        int temperature;
+
+        if (job_was_cancelled || stdout == NULL) {
+                if (job_was_cancelled) {
+                        throw_error (context,
+                                     DEVKIT_DISKS_DEVICE_ERROR_JOB_WAS_CANCELLED,
+                                     "Job was cancelled");
+                } else {
+                        throw_error (context,
+                                     DEVKIT_DISKS_DEVICE_ERROR_GENERAL,
+                                     "Error retrieving S.M.A.R.T. data: no output",
+                                     WEXITSTATUS (status), stderr);
+                }
+                goto out;
+        }
+
+        rc = WEXITSTATUS (status);
+
+        if ((rc & (0x02|0x04)) != 0) {
+                throw_error (context,
+                             DEVKIT_DISKS_DEVICE_ERROR_NOT_SMART_CAPABLE,
+                             "Device is not S.M.A.R.T. capable");
+                goto out;
+        }
+
+        passed = TRUE;
+        power_on_hours = 0;
+        temperature = 0;
+
+        if ((rc & 0x08) != 0)
+                passed = FALSE;
+
+        lines = g_strsplit (stdout, "\n", 0);
+
+        in_attributes = FALSE;
+        for (n = 0; lines[n] != NULL; n++) {
+                const char *line = (const char *) lines[n];
+                int id;
+                char name[256];
+                unsigned int flags;
+                int value;
+                int worst;
+                int threshold;
+                char type[256];
+                char updated[256];
+                char when_failed[256];
+                int raw_value;
+
+                /* We're looking at parsing this block of the output
+                 *
+                 * ID# ATTRIBUTE_NAME          FLAG     VALUE WORST THRESH TYPE      UPDATED  WHEN_FAILED RAW_VALUE
+                 *   1 Raw_Read_Error_Rate     0x000f   200   200   051    Pre-fail  Always       -       1284
+                 *   3 Spin_Up_Time            0x0003   225   215   021    Pre-fail  Always       -       5725
+                 *   4 Start_Stop_Count        0x0032   100   100   000    Old_age   Always       -       204
+                 *   5 Reallocated_Sector_Ct   0x0033   199   199   140    Pre-fail  Always       -       2
+                 *   7 Seek_Error_Rate         0x000f   127   127   051    Pre-fail  Always       -       65877
+                 *   9 Power_On_Hours          0x0032   096   096   000    Old_age   Always       -       3429
+                 *  10 Spin_Retry_Count        0x0013   100   100   051    Pre-fail  Always       -       0
+                 *  11 Calibration_Retry_Count 0x0012   100   100   051    Old_age   Always       -       0
+                 *  12 Power_Cycle_Count       0x0032   100   100   000    Old_age   Always       -       153
+                 * 190 Temperature_Celsius     0x0022   058   032   045    Old_age   Always   In_the_past 42
+                 * 194 Temperature_Celsius     0x0022   253   253   000    Old_age   Always       -       43
+                 * 196 Reallocated_Event_Count 0x0032   198   198   000    Old_age   Always       -       2
+                 * 197 Current_Pending_Sector  0x0012   191   191   000    Old_age   Always       -       762
+                 * 198 Offline_Uncorrectable   0x0010   200   200   000    Old_age   Offline      -       21
+                 * 199 UDMA_CRC_Error_Count    0x003e   200   200   000    Old_age   Always       -       40
+                 * 200 Multi_Zone_Error_Rate   0x0009   170   170   051    Pre-fail  Offline      -       1542
+                 *
+                 */
+
+                if (g_str_has_prefix (line, "ID# ATTRIBUTE_NAME ")) {
+                        in_attributes = TRUE;
+                        continue;
+                }
+
+                if (!in_attributes)
+                        continue;
+
+                if (strlen (line) == 0) {
+                        break;
+                }
+
+                if (strlen (line) >= 256) {
+                        g_warning ("Ignoring line '%s' (too long)", line);
+                        continue;
+                }
+
+                if (sscanf (line, "%d %s 0x%x %d %d %d %s %s %s %d",
+                            &id, name, &flags, &value, &worst, &threshold,
+                            type, updated, when_failed, &raw_value) == 10) {
+#if 0
+                        g_printerr ("           id=%d\n", id);
+                        g_printerr ("         name='%s'\n", name);
+                        g_printerr ("        flags=0x%x\n", flags);
+                        g_printerr ("        value=%d\n", value);
+                        g_printerr ("        worst=%d\n", worst);
+                        g_printerr ("    threshold=%d\n", threshold);
+                        g_printerr ("         type='%s'\n", type);
+                        g_printerr ("      updated='%s'\n", updated);
+                        g_printerr ("  when_failed='%s'\n", when_failed);
+                        g_printerr ("    raw_value=%d\n", raw_value);
+#endif
+
+                        if (id == 9) {
+                                power_on_hours = raw_value;
+                        } else if (id == 194) {
+                                temperature = raw_value;
+                        }
+                }
+
+        }
+        g_strfreev (lines);
+
+        dbus_g_method_return (context, passed, power_on_hours, temperature);
+out:
+        ;
+}
+
+gboolean
+devkit_disks_device_retrieve_smart_data (DevkitDisksDevice     *device,
+                                         DBusGMethodInvocation *context)
+{
+        int n;
+        char *argv[10];
+        GError *error;
+        PolKitCaller *pk_caller;
+
+        if ((pk_caller = devkit_disks_damon_local_get_caller_for_context (device->priv->daemon, context)) == NULL)
+                goto out;
+
+        if (!device->priv->info.device_is_drive) {
+                throw_error (context, DEVKIT_DISKS_DEVICE_ERROR_NOT_DRIVE,
+                             "Device is not a drive");
+                goto out;
+        }
+
+#if 0
+        if (!devkit_disks_damon_local_check_auth (device->priv->daemon,
+                                                  pk_caller,
+                                                  /* TODO: revisit auth */
+                                                  "org.freedesktop.devicekit.disks.erase",
+                                                  context)) {
+                goto out;
+        }
+#endif
+
+        n = 0;
+        argv[n++] = "smartctl";
+        argv[n++] = "--all";
+        argv[n++] = device->priv->info.device_file;
+        argv[n++] = NULL;
+
+        error = NULL;
+        if (!job_new (context,
+                      "RetrieveSmartData",
+                      FALSE,
+                      device,
+                      pk_caller,
+                      argv,
+                      NULL,
+                      retrieve_smart_data_completed_cb,
+                      NULL,
+                      NULL)) {
+                goto out;
+        }
+
+out:
+        if (pk_caller != NULL)
+                polkit_caller_unref (pk_caller);
+        return TRUE;
+}
+
+/*--------------------------------------------------------------------------------------------------------------*/
+
+/* Completion callback for the "RunSmartSelftest" job.
+ *
+ * Replies on @context with an empty method return when the helper
+ * exited with code 0 and the job was not cancelled; otherwise replies
+ * with an error, cancellation taking precedence over the exit code.
+ */
+static void
+run_smart_selftest_completed_cb (DBusGMethodInvocation *context,
+                                 DevkitDisksDevice *device,
+                                 PolKitCaller *pk_caller,
+                                 gboolean job_was_cancelled,
+                                 int status,
+                                 const char *stderr,
+                                 const char *stdout,
+                                 gpointer user_data)
+{
+        if (job_was_cancelled) {
+                throw_error (context,
+                             DEVKIT_DISKS_DEVICE_ERROR_JOB_WAS_CANCELLED,
+                             "Job was cancelled");
+                return;
+        }
+
+        if (WEXITSTATUS (status) != 0) {
+                throw_error (context,
+                             DEVKIT_DISKS_DEVICE_ERROR_GENERAL,
+                             "Error running self test: helper exited with exit code %d: %s",
+                             WEXITSTATUS (status), stderr);
+                return;
+        }
+
+        dbus_g_method_return (context);
+}
+
+gboolean
+devkit_disks_device_run_smart_selftest (DevkitDisksDevice     *device,
+                                        const char            *test,
+                                        gboolean               captive,
+                                        DBusGMethodInvocation *context)
+{
+        int n;
+        char *argv[10];
+        GError *error;
+        PolKitCaller *pk_caller;
+
+        if ((pk_caller = devkit_disks_damon_local_get_caller_for_context (device->priv->daemon, context)) == NULL)
+                goto out;
+
+        if (!device->priv->info.device_is_drive) {
+                throw_error (context, DEVKIT_DISKS_DEVICE_ERROR_NOT_DRIVE,
+                             "Device is not a drive");
+                goto out;
+        }
+
+        if (captive) {
+                if (devkit_disks_device_local_is_busy (device)) {
+                        throw_error (context, DEVKIT_DISKS_DEVICE_ERROR_IS_BUSY,
+                                     "Device is busy");
+                        goto out;
+                }
+
+                if (devkit_disks_device_local_partitions_are_busy (device)) {
+                        throw_error (context, DEVKIT_DISKS_DEVICE_ERROR_IS_BUSY,
+                                     "A partition on the device is busy");
+                        goto out;
+                }
+        }
+
+#if 0
+        if (!devkit_disks_damon_local_check_auth (device->priv->daemon,
+                                                  pk_caller,
+                                                  /* TODO: revisit auth */
+                                                  "org.freedesktop.devicekit.disks.erase",
+                                                  context)) {
+                goto out;
+        }
+#endif
+
+        n = 0;
+        argv[n++] = PACKAGE_LIBEXEC_DIR "/devkit-disks-helper-smart-selftest";
+        argv[n++] = device->priv->info.device_file;
+        argv[n++] = (char *) test;
+        argv[n++] = captive ? "1" : "0";
+        argv[n++] = NULL;
+
+        error = NULL;
+        if (!job_new (context,
+                      "RunSmartSelftest",
+                      TRUE,
+                      device,
+                      pk_caller,
+                      argv,
+                      NULL,
+                      run_smart_selftest_completed_cb,
+                      NULL,
+                      NULL)) {
+                goto out;
+        }
+
+out:
+        if (pk_caller != NULL)
+                polkit_caller_unref (pk_caller);
+        return TRUE;
+}
+
+/*--------------------------------------------------------------------------------------------------------------*/
+
 typedef struct {
         char                     *mount_path;
         ForceRemovalCompleteFunc  fr_callback;
@@ -5035,4 +5329,5 @@ pending:
         ;
 }
 
+
 /*--------------------------------------------------------------------------------------------------------------*/
index f6e34bb..3aeae6d 100644 (file)
@@ -71,6 +71,8 @@ typedef enum
         DEVKIT_DISKS_DEVICE_ERROR_CRYPTO_ALREADY_UNLOCKED,
         DEVKIT_DISKS_DEVICE_ERROR_CRYPTO_NOT_UNLOCKED,
         DEVKIT_DISKS_DEVICE_ERROR_IS_BUSY,
+        DEVKIT_DISKS_DEVICE_ERROR_NOT_DRIVE,
+        DEVKIT_DISKS_DEVICE_ERROR_NOT_SMART_CAPABLE,
         DEVKIT_DISKS_DEVICE_NUM_ERRORS
 } DevkitDisksDeviceError;
 
@@ -172,6 +174,14 @@ gboolean devkit_disks_device_change_filesystem_label (DevkitDisksDevice     *dev
                                                       const char            *new_label,
                                                       DBusGMethodInvocation *context);
 
+/* Handler for the RetrieveSmartData D-Bus method; the result is
+ * delivered on @context.
+ */
+gboolean devkit_disks_device_retrieve_smart_data (DevkitDisksDevice     *device,
+                                                  DBusGMethodInvocation *context);
+
+/* Handler for the RunSmartSelftest D-Bus method; @test names the self
+ * test to run and @captive selects a captive test. The result is
+ * delivered on @context.
+ */
+gboolean devkit_disks_device_run_smart_selftest (DevkitDisksDevice     *device,
+                                                 const char            *test,
+                                                 gboolean               captive,
+                                                 DBusGMethodInvocation *context);
+
 G_END_DECLS
 
 #endif /* __DEVKIT_DISKS_DEVICE_H__ */
diff --git a/src/job-smart-selftest.c b/src/job-smart-selftest.c
new file mode 100644 (file)
index 0000000..5009e7f
--- /dev/null
@@ -0,0 +1,263 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2008 David Zeuthen <david@fubar.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#define _LARGEFILE64_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#include <glib.h>
+
+#include "job-shared.h"
+
+/* device file to operate on; set from argv[1] in main() */
+static const char *device;
+
+/* Set from the SIGTERM handler and polled by the main loop. It is a
+ * volatile sig_atomic_t because that is the only kind of object a
+ * signal handler can portably write to.
+ */
+static volatile sig_atomic_t cancelled = FALSE;
+
+/* SIGTERM handler: only records that cancellation was requested. */
+static void
+sigterm_handler (int signum)
+{
+        cancelled = TRUE;
+}
+
+/* Runs 'smartctl -X <device>' to abort a self test on the drive.
+ *
+ * Returns: TRUE if smartctl could be spawned and exited with code 0,
+ *          FALSE otherwise (a diagnostic is printed on stderr).
+ */
+static gboolean
+abort_test (void)
+{
+        gboolean ret;
+        int exit_status;
+        char *standard_error;
+        char *command_line;
+        GError *error;
+
+        ret = FALSE;
+        standard_error = NULL;
+        error = NULL;
+
+        command_line = g_strdup_printf ("smartctl -X %s", device);
+
+        if (!g_spawn_command_line_sync (command_line,
+                                        NULL,
+                                        &standard_error,
+                                        &exit_status,
+                                        &error)) {
+                g_printerr ("cannot spawn '%s'\n", command_line);
+                goto out;
+        }
+        if (WEXITSTATUS (exit_status) != 0) {
+                g_printerr ("helper failed with:\n%s", standard_error);
+                goto out;
+        }
+
+        ret = TRUE;
+
+out:
+        /* single exit point so nothing is leaked on the failure paths */
+        g_clear_error (&error);
+        g_free (standard_error);
+        g_free (command_line);
+        return ret;
+}
+
+/* Helper that starts a S.M.A.R.T. self test and waits for it to finish.
+ *
+ * Usage: <helper> <device file> <test> <captive: "1" or "0">
+ *
+ * Starts the test with 'smartctl -t <test> [-C] <device>', then polls
+ * 'smartctl -c <device>' every five seconds, printing
+ * "progress: 0 1 <percentage> smartselftest" lines on stdout until the
+ * "Self-test execution status" line reports that the test is over. On
+ * SIGTERM, or when the status cannot be interpreted, the test is
+ * aborted via abort_test().
+ *
+ * Returns: 0 if the test ran to a known completion status, 1 otherwise.
+ */
+int
+main (int argc, char **argv)
+{
+        int ret;
+        int exit_status;
+        GError *error;
+        char *command_line;
+        char *quoted_test;
+        char *standard_output;
+        char *standard_error;
+        const char *test;
+        gboolean captive;
+        const char *result;
+
+        ret = 1;
+        error = NULL;
+        command_line = NULL;
+        quoted_test = NULL;
+        standard_output = NULL;
+        standard_error = NULL;
+        result = NULL;
+
+        if (argc != 4) {
+                g_printerr ("wrong usage\n");
+                goto out;
+        }
+        device = argv[1];
+        test = argv[2];
+        captive = (strcmp (argv[3], "1") == 0);
+
+        g_print ("device   = '%s'\n", device);
+        g_print ("test     = '%s'\n", test);
+        g_print ("captive  = %d\n", captive);
+
+        /* The test name comes from the caller and the command line is
+         * split into arguments again when it is spawned, so quote it:
+         * whitespace in the name must not turn into extra smartctl
+         * arguments.
+         */
+        quoted_test = g_shell_quote (test);
+        command_line = g_strdup_printf ("smartctl -t %s %s %s",
+                                        quoted_test,
+                                        captive ? "-C" : "",
+                                        device);
+
+        if (!g_spawn_command_line_sync (command_line,
+                                        NULL,
+                                        &standard_error,
+                                        &exit_status,
+                                        &error)) {
+                g_printerr ("cannot spawn '%s'\n", command_line);
+                goto out;
+        }
+        if (WEXITSTATUS (exit_status) != 0) {
+                g_printerr ("helper failed with:\n%s", standard_error);
+                goto out;
+        }
+        g_free (standard_error);
+        standard_error = NULL;
+
+        signal (SIGTERM, sigterm_handler);
+
+        /* ok, now poll every five secs via 'smartctl -c' until the test is done */
+
+        g_free (command_line);
+        command_line = g_strdup_printf ("smartctl -c %s", device);
+
+
+        /* progress at 0% initially */
+        g_print ("progress: 0 1 0 smartselftest\n");
+
+        while (TRUE) {
+                int exec_status = -1;
+                int percentage_done;
+                int n;
+                char **lines;
+
+                sleep (5);
+
+                if (cancelled) {
+                        g_printerr ("Abort test and exiting since we caught SIGTERM\n");
+                        abort_test ();
+                        goto out;
+                }
+
+                if (!g_spawn_command_line_sync (command_line,
+                                                &standard_output,
+                                                &standard_error,
+                                                &exit_status,
+                                                &error)) {
+                        g_printerr ("cannot spawn '%s'\n", command_line);
+                        goto out;
+                }
+                if (WEXITSTATUS (exit_status) != 0) {
+                        g_printerr ("helper failed with:\n%s", standard_error);
+                        goto out;
+                }
+                g_free (standard_error);
+                standard_error = NULL;
+
+                lines = g_strsplit (standard_output, "\n", 0);
+
+                /* the split copy is all we need; release the raw output
+                 * now so it is not leaked on every poll
+                 */
+                g_free (standard_output);
+                standard_output = NULL;
+
+                for (n = 0; lines[n] != NULL; n++) {
+                        const char *line = (const char *) lines[n];
+                        if (g_str_has_prefix (line, "Self-test execution status:")) {
+                                int m;
+
+                                /* the status is the first number on the line, followed by ')' */
+                                for (m = 0; line[m] != '\0'; m++) {
+                                        if (g_ascii_isdigit (line[m]))
+                                                break;
+                                }
+                                if (line[m] != '\0') {
+                                        char *endp;
+
+                                        exec_status = strtol (line + m, &endp, 10);
+                                        if (*endp == ')') {
+                                                /* good */
+                                        } else {
+                                                exec_status = -1;
+                                        }
+                                }
+                        }
+                }
+                g_strfreev (lines);
+
+                /* didn't manage to parse output */
+                if (exec_status == -1) {
+                        g_printerr ("Unexpected output polling drive for selftest completion\n");
+                        abort_test ();
+                        goto out;
+                }
+
+                /* see ataprint.cpp:ataPrintSelectiveSelfTestLog() in smartmontools */
+
+                if ((exec_status >> 4) == 15) {
+                        /* high nibble 15: still running, low nibble is tenths remaining */
+                        percentage_done = 100 - (exec_status & 0x0f) * 10;
+                        g_print ("progress: 0 1 %d smartselftest\n", percentage_done);
+                } else {
+                        switch ((exec_status)>>4){
+                        case  0:
+                                result = "Completed";
+                                break;
+                        case  1:
+                                result = "Aborted_by_host";
+                                break;
+                        case  2:
+                                result = "Interrupted";
+                                break;
+                        case  3:
+                                result = "Fatal_error";
+                                break;
+                        case  4:
+                                result = "Completed_unknown_failure";
+                                break;
+                        case  5:
+                                result = "Completed_electrical_failure";
+                                break;
+                        case  6:
+                                result = "Completed_servo/seek_failure";
+                                break;
+                        case  7:
+                                result = "Completed_read_failure";
+                                break;
+                        case  8:
+                                result = "Completed_handling_damage??";
+                                break;
+                        default:
+                                g_printerr ("Unexpected status %d polling drive for selftest completion\n",
+                                            exec_status);
+                                abort_test ();
+                                goto out;
+                        }
+                        goto test_complete;
+                }
+        }
+
+test_complete:
+        /* send the result of the test back to the daemon */
+        /* g_printerr ("job-smart-selftest: %s\n", result); */
+        ret = 0;
+
+out:
+        g_clear_error (&error);
+        g_free (standard_output);
+        g_free (standard_error);
+        g_free (quoted_test);
+        g_free (command_line);
+        return ret;
+}
index 54b8d57..cfbacf2 100644 (file)
       <arg name="options" direction="in" type="as"/>
     </method>
 
+    <!-- RetrieveSmartData:
+
+         Retrieves S.M.A.R.T. data from the drive.
+
+         TODO: We probably should return all attributes or something
+               more detailed. And at the same time try to abstract
+               the difference between SCSI and ATA.
+
+         Returns: Whether the drive is healthy, the number of hours
+                  it has been powered on and the temperature in Celsius.
+      -->
+    <method name="RetrieveSmartData">
+      <annotation name="org.freedesktop.DBus.GLib.Async" value=""/>
+
+      <arg name="is_healthy" direction="out" type="b"/>
+      <arg name="power_on_hours" direction="out" type="i"/>
+      <arg name="temperature" direction="out" type="i"/>
+    </method>
+
+
+    <!-- RunSmartSelftest:
+
+         @test:     The name of the test to run; supported values
+                    are 'short' (usually less than ten minutes)
+                    and 'long' (usually tens of minutes) and
+                    'conveyance' (usually a few minutes). See the
+                    smartctl(8) man page for details.
+
+         @captive:  If set to #TRUE then access to the drive is
+                    blocked for the duration of the test. The
+                    method will error out if the drive is busy,
+                    e.g. if partitions are mounted.
+
+         Runs a S.M.A.R.T. self test on the drive.
+
+         TODO: need to figure out whether we need a return code
+
+      -->
+    <method name="RunSmartSelftest">
+      <annotation name="org.freedesktop.DBus.GLib.Async" value=""/>
+
+      <arg name="test" direction="in" type="s"/>
+      <arg name="captive" direction="in" type="b"/>
+    </method>
+
+
     <!-- CreatePartitionTable:
          @scheme:   The type of partition table to create. Currently
                     the types 'none', 'mbr', 'gpt' and 'apm' are