EDAC: Expose per-DIMM error counts in sysfs
authorAaron Miller <aaronmiller@fb.com>
Thu, 3 Nov 2016 22:01:53 +0000 (15:01 -0700)
committerBorislav Petkov <bp@suse.de>
Thu, 19 Jan 2017 09:29:40 +0000 (10:29 +0100)
The old csrowX sysfs directories have per-csrow error counters, but the
new dimmX directories do not currently expose error counts.

EDAC already keeps these counts, add them to sysfs so per-DIMM counts
are still available when CONFIG_EDAC_LEGACY_SYSFS=n.

Signed-off-by: Aaron Miller <aaronmiller@fb.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20161103220153.3997328-1-aaronmiller@fb.com
Signed-off-by: Borislav Petkov <bp@suse.de>
Documentation/ABI/testing/sysfs-devices-edac
Documentation/admin-guide/ras.rst
drivers/edac/edac_mc_sysfs.c

index 6568e00..46ff929 100644 (file)
@@ -138,3 +138,20 @@ Contact:   Mauro Carvalho Chehab <m.chehab@samsung.com>
 Description:   This attribute file will display what type of memory is
                currently on this csrow. Normally, either buffered or
                unbuffered memory (for example, Unbuffered-DDR3).
+
+What:          /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ce_count
+Date:          October 2016
+Contact:       linux-edac@vger.kernel.org
+Description:   This attribute file displays the total count of correctable
+               errors that have occurred on this DIMM. This count is very important
+               to examine. CEs provide early indications that a DIMM is beginning
+               to fail. This count field should be monitored for non-zero values
+               and report such information to the system administrator.
+
+What:          /sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_ue_count
+Date:          October 2016
+Contact:       linux-edac@vger.kernel.org
+Description:   This attribute file displays the total count of uncorrectable
+               errors that have occurred on this DIMM. If panic_on_ue is set, this
+               counter will not have a chance to increment, since EDAC will panic the
+               system
index d71340e..9939348 100644 (file)
@@ -438,11 +438,13 @@ A typical EDAC system has the following structure under
        │   │   ├── ce_count
        │   │   ├── ce_noinfo_count
        │   │   ├── dimm0
+       │   │   │   ├── dimm_ce_count
        │   │   │   ├── dimm_dev_type
        │   │   │   ├── dimm_edac_mode
        │   │   │   ├── dimm_label
        │   │   │   ├── dimm_location
        │   │   │   ├── dimm_mem_type
+       │   │   │   ├── dimm_ue_count
        │   │   │   ├── size
        │   │   │   └── uevent
        │   │   ├── max_location
@@ -457,11 +459,13 @@ A typical EDAC system has the following structure under
        │   │   ├── ce_count
        │   │   ├── ce_noinfo_count
        │   │   ├── dimm0
+       │   │   │   ├── dimm_ce_count
        │   │   │   ├── dimm_dev_type
        │   │   │   ├── dimm_edac_mode
        │   │   │   ├── dimm_label
        │   │   │   ├── dimm_location
        │   │   │   ├── dimm_mem_type
+       │   │   │   ├── dimm_ue_count
        │   │   │   ├── size
        │   │   │   └── uevent
        │   │   ├── max_location
@@ -483,6 +487,22 @@ this ``X`` memory module:
        This attribute file displays, in count of megabytes, the memory
        that this csrow contains.
 
+- ``dimm_ue_count`` - Uncorrectable Errors count attribute file
+
+       This attribute file displays the total count of uncorrectable
+       errors that have occurred on this DIMM. If panic_on_ue is set
+       this counter will not have a chance to increment, since EDAC
+       will panic the system.
+
+- ``dimm_ce_count`` - Correctable Errors count attribute file
+
+       This attribute file displays the total count of correctable
+       errors that have occurred on this DIMM. This count is very
+       important to examine. CEs provide early indications that a
+       DIMM is beginning to fail. This count field should be
+       monitored for non-zero values and report such information
+       to the system administrator.
+
 - ``dimm_dev_type``  - Device type attribute file
 
        This attribute file will display what type of DRAM device is
index cc45b1d..445862d 100644 (file)
@@ -569,6 +569,40 @@ static ssize_t dimmdev_edac_mode_show(struct device *dev,
        return sprintf(data, "%s\n", edac_caps[dimm->edac_mode]);
 }
 
+static ssize_t dimmdev_ce_count_show(struct device *dev,
+                                     struct device_attribute *mattr,
+                                     char *data)
+{
+       struct dimm_info *dimm = to_dimm(dev);
+       u32 count;
+       int off;
+
+       off = EDAC_DIMM_OFF(dimm->mci->layers,
+                           dimm->mci->n_layers,
+                           dimm->location[0],
+                           dimm->location[1],
+                           dimm->location[2]);
+       count = dimm->mci->ce_per_layer[dimm->mci->n_layers-1][off];
+       return sprintf(data, "%u\n", count);
+}
+
+static ssize_t dimmdev_ue_count_show(struct device *dev,
+                                     struct device_attribute *mattr,
+                                     char *data)
+{
+       struct dimm_info *dimm = to_dimm(dev);
+       u32 count;
+       int off;
+
+       off = EDAC_DIMM_OFF(dimm->mci->layers,
+                           dimm->mci->n_layers,
+                           dimm->location[0],
+                           dimm->location[1],
+                           dimm->location[2]);
+       count = dimm->mci->ue_per_layer[dimm->mci->n_layers-1][off];
+       return sprintf(data, "%u\n", count);
+}
+
 /* dimm/rank attribute files */
 static DEVICE_ATTR(dimm_label, S_IRUGO | S_IWUSR,
                   dimmdev_label_show, dimmdev_label_store);
@@ -577,6 +611,8 @@ static DEVICE_ATTR(size, S_IRUGO, dimmdev_size_show, NULL);
 static DEVICE_ATTR(dimm_mem_type, S_IRUGO, dimmdev_mem_type_show, NULL);
 static DEVICE_ATTR(dimm_dev_type, S_IRUGO, dimmdev_dev_type_show, NULL);
 static DEVICE_ATTR(dimm_edac_mode, S_IRUGO, dimmdev_edac_mode_show, NULL);
+static DEVICE_ATTR(dimm_ce_count, S_IRUGO, dimmdev_ce_count_show, NULL);
+static DEVICE_ATTR(dimm_ue_count, S_IRUGO, dimmdev_ue_count_show, NULL);
 
 /* attributes of the dimm<id>/rank<id> object */
 static struct attribute *dimm_attrs[] = {
@@ -586,6 +622,8 @@ static struct attribute *dimm_attrs[] = {
        &dev_attr_dimm_mem_type.attr,
        &dev_attr_dimm_dev_type.attr,
        &dev_attr_dimm_edac_mode.attr,
+       &dev_attr_dimm_ce_count.attr,
+       &dev_attr_dimm_ue_count.attr,
        NULL,
 };