powerpc/papr_scm: Implement initial support for injecting smart errors
authorVaibhav Jain <vaibhav@linux.ibm.com>
Mon, 24 Jan 2022 20:22:04 +0000 (01:52 +0530)
committerMichael Ellerman <mpe@ellerman.id.au>
Wed, 16 Feb 2022 12:10:47 +0000 (23:10 +1100)
Presently PAPR doesn't support injecting smart errors on an
NVDIMM. This makes testing the NVDIMM health reporting functionality
difficult, as simulating NVDIMM health related events needs a hacked-up
qemu version.

To solve this problem this patch proposes simulating a certain set of
NVDIMM health related events in papr_scm, specifically the 'fatal' health
state and the 'dirty' shutdown state. These errors can be injected via the
user-space 'ndctl-inject-smart(1)' command. With the proposed patch and
corresponding ndctl patches the following command flow is expected:

$ sudo ndctl list -DH -d nmem0
...
      "health_state":"ok",
      "shutdown_state":"clean",
...
 # inject unsafe shutdown and fatal health error
$ sudo ndctl inject-smart nmem0 -Uf
...
      "health_state":"fatal",
      "shutdown_state":"dirty",
...
 # uninject all errors
$ sudo ndctl inject-smart nmem0 -N
...
      "health_state":"ok",
      "shutdown_state":"clean",
...

The patch adds a new member 'health_bitmap_inject_mask' inside struct
papr_scm_priv which is then bitwise ORed into the health bitmap fetched from
the hypervisor. The value of 'health_bitmap_inject_mask' is accessible from
sysfs at nmemX/papr/health_bitmap_inject.

A new PDSM named 'SMART_INJECT' is proposed that accepts the newly
introduced 'struct nd_papr_pdsm_smart_inject' as a payload that is
exchanged between libndctl and papr_scm to indicate the requested
smart-error states.

When processing the PDSM 'SMART_INJECT', papr_pdsm_smart_inject()
constructs a pair of 'inject_mask' and 'clear_mask' bitmaps from the payload
and bit-blts them into 'health_bitmap_inject_mask'. This ensures that, after
being fetched from the hypervisor, the health_bitmap reflects the requested
smart-error states.

Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220124202204.1488346-1-vaibhav@linux.ibm.com
Documentation/ABI/testing/sysfs-bus-papr-pmem
arch/powerpc/include/uapi/asm/papr_pdsm.h
arch/powerpc/platforms/pseries/papr_scm.c

index 95254ce..4ac0673 100644 (file)
@@ -61,3 +61,15 @@ Description:
                * "CchRHCnt" : Cache Read Hit Count
                * "CchWHCnt" : Cache Write Hit Count
                * "FastWCnt" : Fast Write Count
+
+What:          /sys/bus/nd/devices/nmemX/papr/health_bitmap_inject
+Date:          Jan, 2022
+KernelVersion: v5.17
+Contact:       linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, nvdimm@lists.linux.dev,
+Description:
+               (RO) Reports the health inject bitmap that is applied to the
+               bitmap received from PowerVM via the H_SCM_HEALTH Hcall. It is
+               used to forcibly set specific bits returned from the Hcall. These
+               bits are then used to simulate various health or shutdown states
+               for an nvdimm and are set by user-space tools like ndctl by
+               issuing a PAPR DSM.
+
index 82488b1..1743992 100644 (file)
@@ -116,6 +116,22 @@ struct nd_papr_pdsm_health {
        };
 };
 
+/*
+ * Flags for injecting specific smart errors. Each bit selects one error
+ * state that a PAPR_PDSM_SMART_INJECT request acts upon.
+ */
+#define PDSM_SMART_INJECT_HEALTH_FATAL         (1 << 0)
+#define PDSM_SMART_INJECT_BAD_SHUTDOWN         (1 << 1)
+
+/*
+ * Payload of the PAPR_PDSM_SMART_INJECT request.
+ * flags                  : On input, the PDSM_SMART_INJECT_* states to act
+ *                          upon; on return, the subset of those flags that
+ *                          the kernel handled.
+ * fatal_enable           : Used when PDSM_SMART_INJECT_HEALTH_FATAL is set in
+ *                          flags. Non-zero injects the fatal health state,
+ *                          zero clears a previously injected one.
+ * unsafe_shutdown_enable : Used when PDSM_SMART_INJECT_BAD_SHUTDOWN is set in
+ *                          flags. Non-zero injects the dirty shutdown state,
+ *                          zero clears a previously injected one.
+ * buf                    : Pads the payload to ND_PDSM_PAYLOAD_MAX_SIZE bytes.
+ */
+struct nd_papr_pdsm_smart_inject {
+       union {
+               struct {
+                       /* One or more of PDSM_SMART_INJECT_ */
+                       __u32 flags;
+                       __u8 fatal_enable;
+                       __u8 unsafe_shutdown_enable;
+               };
+               __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
+       };
+};
+
 /*
  * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel
  * via 'nd_cmd_pkg.nd_command' member of the ioctl struct
@@ -123,12 +139,14 @@ struct nd_papr_pdsm_health {
 enum papr_pdsm {
        PAPR_PDSM_MIN = 0x0,
        PAPR_PDSM_HEALTH,
+       PAPR_PDSM_SMART_INJECT, /* payload: struct nd_papr_pdsm_smart_inject */
        PAPR_PDSM_MAX,
 };
 
 /* Maximal union that can hold all possible payload types */
 union nd_pdsm_payload {
        struct nd_papr_pdsm_health health;
+       /* Payload view used by the PAPR_PDSM_SMART_INJECT request */
+       struct nd_papr_pdsm_smart_inject smart_inject;
        __u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
 } __packed;
 
index f48e87a..20aafd3 100644 (file)
@@ -120,6 +120,10 @@ struct papr_scm_priv {
 
        /* length of the stat buffer as expected by phyp */
        size_t stat_buffer_len;
+
+       /* The bits which needs to be overridden */
+       u64 health_bitmap_inject_mask;
+
 };
 
 static int papr_scm_pmem_flush(struct nd_region *nd_region,
@@ -347,19 +351,29 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
 static int __drc_pmem_query_health(struct papr_scm_priv *p)
 {
        unsigned long ret[PLPAR_HCALL_BUFSIZE];
+       u64 bitmap = 0;
        long rc;
 
        /* issue the hcall */
        rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index);
-       if (rc != H_SUCCESS) {
+       if (rc == H_SUCCESS)
+               bitmap = ret[0] & ret[1];
+       else if (rc == H_FUNCTION)
+               dev_info_once(&p->pdev->dev,
+                             "Hcall H_SCM_HEALTH not implemented, assuming empty health bitmap");
+       else {
+
                dev_err(&p->pdev->dev,
                        "Failed to query health information, Err:%ld\n", rc);
                return -ENXIO;
        }
 
        p->lasthealth_jiffies = jiffies;
-       p->health_bitmap = ret[0] & ret[1];
-
+       /* Allow injecting specific health bits via inject mask. */
+       if (p->health_bitmap_inject_mask)
+               bitmap = (bitmap & ~p->health_bitmap_inject_mask) |
+                       p->health_bitmap_inject_mask;
+       WRITE_ONCE(p->health_bitmap, bitmap);
        dev_dbg(&p->pdev->dev,
                "Queried dimm health info. Bitmap:0x%016lx Mask:0x%016lx\n",
                ret[0], ret[1]);
@@ -669,6 +683,56 @@ out:
        return rc;
 }
 
+/*
+ * Service the PAPR_PDSM_SMART_INJECT PDSM: update the per-dimm
+ * 'health_bitmap_inject_mask' according to the request in the payload.
+ *
+ * For each PDSM_SMART_INJECT_* bit set in payload->smart_inject.flags the
+ * matching PAPR_PMEM_* health bit is added to the inject mask when the
+ * corresponding '*_enable' field is non-zero, and removed from it otherwise.
+ * lasthealth_jiffies is reset so that the cached health bitmap is treated
+ * as stale.
+ *
+ * On return payload->smart_inject.flags holds only the flags this handler
+ * recognised. Returns the size of the smart-inject payload on success, or
+ * the error from mutex_lock_interruptible() if acquiring health_mutex was
+ * interrupted.
+ */
+static int papr_pdsm_smart_inject(struct papr_scm_priv *p,
+                                 union nd_pdsm_payload *payload)
+{
+       int rc;
+       u32 supported_flags = 0;
+       u64 inject_mask = 0, clear_mask = 0;
+       u64 mask;
+
+       /* Check for individual smart error flags and update inject/clear masks */
+       if (payload->smart_inject.flags & PDSM_SMART_INJECT_HEALTH_FATAL) {
+               supported_flags |= PDSM_SMART_INJECT_HEALTH_FATAL;
+               if (payload->smart_inject.fatal_enable)
+                       inject_mask |= PAPR_PMEM_HEALTH_FATAL;
+               else
+                       clear_mask |= PAPR_PMEM_HEALTH_FATAL;
+       }
+
+       if (payload->smart_inject.flags & PDSM_SMART_INJECT_BAD_SHUTDOWN) {
+               supported_flags |= PDSM_SMART_INJECT_BAD_SHUTDOWN;
+               if (payload->smart_inject.unsafe_shutdown_enable)
+                       inject_mask |= PAPR_PMEM_SHUTDOWN_DIRTY;
+               else
+                       clear_mask |= PAPR_PMEM_SHUTDOWN_DIRTY;
+       }
+
+       dev_dbg(&p->pdev->dev, "[Smart-inject] inject_mask=%#llx clear_mask=%#llx\n",
+               inject_mask, clear_mask);
+
+       /* Prevent concurrent access to dimm health bitmap related members */
+       rc = mutex_lock_interruptible(&p->health_mutex);
+       if (rc)
+               return rc;
+
+       /*
+        * Use inject/clear masks to set health_bitmap_inject_mask. The
+        * READ_ONCE/WRITE_ONCE pair matches the lockless READ_ONCE() done by
+        * the health_bitmap_inject sysfs attribute.
+        */
+       mask = READ_ONCE(p->health_bitmap_inject_mask);
+       mask = (mask & ~clear_mask) | inject_mask;
+       WRITE_ONCE(p->health_bitmap_inject_mask, mask);
+
+       /* Invalidate cached health bitmap */
+       p->lasthealth_jiffies = 0;
+
+       mutex_unlock(&p->health_mutex);
+
+       /* Return the supported flags back to userspace */
+       payload->smart_inject.flags = supported_flags;
+
+       /* Report the size of the payload type this PDSM actually fills in */
+       return sizeof(struct nd_papr_pdsm_smart_inject);
+}
+
 /*
  * 'struct pdsm_cmd_desc'
  * Identifies supported PDSMs' expected length of in/out payloads
@@ -702,6 +766,12 @@ static const struct pdsm_cmd_desc __pdsm_cmd_descriptors[] = {
                .size_out = sizeof(struct nd_papr_pdsm_health),
                .service = papr_pdsm_health,
        },
+
+       [PAPR_PDSM_SMART_INJECT] = {
+               .size_in = sizeof(struct nd_papr_pdsm_smart_inject),
+               .size_out = sizeof(struct nd_papr_pdsm_smart_inject),
+               .service = papr_pdsm_smart_inject,
+       },
        /* Empty */
        [PAPR_PDSM_MAX] = {
                .size_in = 0,
@@ -838,6 +908,19 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
        return 0;
 }
 
+/*
+ * sysfs 'show' callback for the nmemX/papr/health_bitmap_inject attribute.
+ *
+ * Writes the current value of papr_scm_priv.health_bitmap_inject_mask to
+ * @buf using the '%#llx' format followed by a newline, and returns the
+ * number of bytes emitted. The mask is sampled with READ_ONCE() and no
+ * lock is taken here.
+ */
+static ssize_t health_bitmap_inject_show(struct device *dev,
+                                        struct device_attribute *attr,
+                                        char *buf)
+{
+       struct nvdimm *dimm = to_nvdimm(dev);
+       struct papr_scm_priv *p = nvdimm_provider_data(dimm);
+
+       /* sysfs_emit() keeps the output within the page-sized sysfs buffer */
+       return sysfs_emit(buf, "%#llx\n",
+                         READ_ONCE(p->health_bitmap_inject_mask));
+}
+
+static DEVICE_ATTR_ADMIN_RO(health_bitmap_inject);
+
 static ssize_t perf_stats_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
 {
@@ -952,6 +1035,7 @@ static struct attribute *papr_nd_attributes[] = {
        &dev_attr_flags.attr,
        &dev_attr_perf_stats.attr,
        &dev_attr_dirty_shutdown.attr,
+       &dev_attr_health_bitmap_inject.attr,
        NULL,
 };