From 1cbeb1020edc2e9d92a0e253cf9ac470b18f63d7 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Fri, 14 Jan 2022 13:50:19 +0900 Subject: [PATCH] Imported Upstream version 0.8.0 --- .gitignore | 3 +- Makefile.inc | 4 +- kpartx/dasd.c | 7 +- libmpathcmd/mpath_cmd.c | 4 + libmpathcmd/mpath_cmd.h | 6 + libmpathpersist/mpath_persist.c | 7 +- libmpathpersist/mpath_pr_ioctl.c | 12 +- libmultipath/Makefile | 17 +- libmultipath/blacklist.c | 56 +- libmultipath/blacklist.h | 2 +- libmultipath/callout.c | 13 +- libmultipath/checkers.c | 9 +- libmultipath/checkers.h | 6 +- libmultipath/checkers/tur.c | 6 +- libmultipath/config.c | 3 + libmultipath/config.h | 9 + libmultipath/configure.c | 136 +++- libmultipath/configure.h | 22 + libmultipath/devmapper.c | 41 +- libmultipath/dict.c | 41 +- libmultipath/discovery.c | 34 +- libmultipath/dmparser.c | 6 +- libmultipath/foreign/Makefile | 7 +- libmultipath/foreign/nvme.c | 193 +++-- libmultipath/log_pthread.c | 3 + libmultipath/nvme-lib.c | 49 ++ libmultipath/nvme-lib.h | 39 + libmultipath/nvme/argconfig.h | 99 +++ libmultipath/nvme/json.h | 87 ++ libmultipath/nvme/linux/nvme.h | 1450 ++++++++++++++++++++++++++++++++++ libmultipath/nvme/linux/nvme_ioctl.h | 67 ++ libmultipath/nvme/nvme-ioctl.c | 869 ++++++++++++++++++++ libmultipath/nvme/nvme-ioctl.h | 139 ++++ libmultipath/nvme/nvme.h | 163 ++++ libmultipath/nvme/plugin.h | 36 + libmultipath/prio.c | 2 +- libmultipath/prio.h | 1 + libmultipath/prioritizers/Makefile | 5 + libmultipath/prioritizers/ana.c | 232 ++++++ libmultipath/propsel.c | 151 +++- libmultipath/propsel.h | 3 + libmultipath/structs.h | 31 +- libmultipath/structs_vec.c | 33 +- libmultipath/structs_vec.h | 3 +- libmultipath/sysfs.c | 5 - libmultipath/uevent.c | 6 +- libmultipath/util.c | 13 +- libmultipath/util.h | 19 + libmultipath/version.h | 4 +- multipath/main.c | 136 ++-- multipath/multipath.conf.5 | 143 +++- multipathd/cli.c | 2 + multipathd/cli.h | 6 + multipathd/cli_handlers.c | 8 +- multipathd/dmevents.c | 8 
+- multipathd/main.c | 192 +++-- tests/Makefile | 7 +- tests/blacklist.c | 7 +- tests/hwtable.c | 91 ++- tests/util.c | 98 +++ 60 files changed, 4453 insertions(+), 398 deletions(-) create mode 100644 libmultipath/nvme-lib.c create mode 100644 libmultipath/nvme-lib.h create mode 100644 libmultipath/nvme/argconfig.h create mode 100644 libmultipath/nvme/json.h create mode 100644 libmultipath/nvme/linux/nvme.h create mode 100644 libmultipath/nvme/linux/nvme_ioctl.h create mode 100644 libmultipath/nvme/nvme-ioctl.c create mode 100644 libmultipath/nvme/nvme-ioctl.h create mode 100644 libmultipath/nvme/nvme.h create mode 100644 libmultipath/nvme/plugin.h create mode 100644 libmultipath/prioritizers/ana.c diff --git a/.gitignore b/.gitignore index 35c59a7..9926756 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,5 @@ libdmmp/test/libdmmp_test libdmmp/test/libdmmp_speed_test tests/*-test tests/*.out - +libmultipath/nvme-ioctl.c +libmultipath/nvme-ioctl.h diff --git a/Makefile.inc b/Makefile.inc index a83f02c..fc728ca 100644 --- a/Makefile.inc +++ b/Makefile.inc @@ -9,9 +9,6 @@ # WITH_LOCAL_LIBDM = 1 # WITH_LOCAL_LIBSYSFS = 1 # -# Uncomment to disable RADOS support (e.g. if rados headers are missing). 
-# ENABLE_RADOS = 0 -# # Uncomment to disable libdmmp support # ENABLE_LIBDMMP = 0 # @@ -66,6 +63,7 @@ mpathpersistdir = $(TOPDIR)/libmpathpersist mpathcmddir = $(TOPDIR)/libmpathcmd thirdpartydir = $(TOPDIR)/third-party libdmmpdir = $(TOPDIR)/libdmmp +nvmedir = $(TOPDIR)/libmultipath/nvme includedir = $(prefix)/usr/include pkgconfdir = $(usrlibdir)/pkgconfig diff --git a/kpartx/dasd.c b/kpartx/dasd.c index 94ae81b..61b609a 100644 --- a/kpartx/dasd.c +++ b/kpartx/dasd.c @@ -137,7 +137,7 @@ read_dasd_pt(int fd, struct slice all, struct slice *sp, int ns) /* Not a DASD */ return -1; } else { - fd_dasd = fd; + fd_dasd = dup(fd); } if (ioctl(fd_dasd, BIODASDINFO, (unsigned long)&info) != 0) { @@ -190,7 +190,7 @@ read_dasd_pt(int fd, struct slice all, struct slice *sp, int ns) memcpy (&vlabel, data, sizeof(vlabel)); else { bzero(&vlabel,4); - memcpy (&vlabel.vollbl, data, sizeof(vlabel) - 4); + memcpy ((char *)&vlabel + 4, data, sizeof(vlabel) - 4); } vtoc_ebcdic_dec(vlabel.vollbl, type, 4); @@ -288,7 +288,6 @@ read_dasd_pt(int fd, struct slice all, struct slice *sp, int ns) out: if (data != NULL) free(data); - if (fd_dasd != -1 && fd_dasd != fd) - close(fd_dasd); + close(fd_dasd); return retval; } diff --git a/libmpathcmd/mpath_cmd.c b/libmpathcmd/mpath_cmd.c index 61e6a98..df4ca54 100644 --- a/libmpathcmd/mpath_cmd.c +++ b/libmpathcmd/mpath_cmd.c @@ -133,6 +133,10 @@ ssize_t mpath_recv_reply_len(int fd, unsigned int timeout) errno = EIO; return -1; } + if (len <= 0 || len >= MAX_REPLY_LEN) { + errno = ERANGE; + return -1; + } return len; } diff --git a/libmpathcmd/mpath_cmd.h b/libmpathcmd/mpath_cmd.h index df9d938..15aeb06 100644 --- a/libmpathcmd/mpath_cmd.h +++ b/libmpathcmd/mpath_cmd.h @@ -20,6 +20,12 @@ #ifndef LIB_MPATH_CMD_H #define LIB_MPATH_CMD_H +/* + * This should be sufficient for json output for >10000 maps, + * and >60000 paths. 
+ */ +#define MAX_REPLY_LEN (32 * 1024 * 1024) + #ifdef __cplusplus extern "C" { #endif diff --git a/libmpathpersist/mpath_persist.c b/libmpathpersist/mpath_persist.c index 2ffe56e..6505774 100644 --- a/libmpathpersist/mpath_persist.c +++ b/libmpathpersist/mpath_persist.c @@ -188,7 +188,7 @@ int mpath_persistent_reserve_in (int fd, int rq_servact, condlog(3, "alias = %s", alias); map_present = dm_map_present(alias); - if (map_present && !dm_is_mpath(alias)){ + if (map_present && dm_is_mpath(alias) != 1){ condlog( 0, "%s: not a multipath device.", alias); ret = MPATH_PR_DMMP_ERROR; goto out; @@ -283,7 +283,7 @@ int mpath_persistent_reserve_out ( int fd, int rq_servact, int rq_scope, condlog(3, "alias = %s", alias); map_present = dm_map_present(alias); - if (map_present && !dm_is_mpath(alias)){ + if (map_present && dm_is_mpath(alias) != 1){ condlog(3, "%s: not a multipath device.", alias); ret = MPATH_PR_DMMP_ERROR; goto out; @@ -889,7 +889,8 @@ int update_map_pr(struct multipath *mpp) if (!get_be64(mpp->reservation_key)) { /* Nothing to do. 
Assuming pr mgmt feature is disabled*/ - condlog(3, "%s: reservation_key not set in multipath.conf", mpp->alias); + condlog(4, "%s: reservation_key not set in multipath.conf", + mpp->alias); return MPATH_PR_SUCCESS; } diff --git a/libmpathpersist/mpath_pr_ioctl.c b/libmpathpersist/mpath_pr_ioctl.c index a222b1e..cf528fe 100644 --- a/libmpathpersist/mpath_pr_ioctl.c +++ b/libmpathpersist/mpath_pr_ioctl.c @@ -211,7 +211,8 @@ void mpath_format_readfullstatus(struct prin_resp *pr_buff, int len, int noisy) unsigned char *p; char *ppbuff; uint32_t additional_length; - + char tempbuff[MPATH_MAX_PARAM_LEN]; + struct prin_fulldescr fdesc; convert_be32_to_cpu(&pr_buff->prin_descriptor.prin_readfd.prgeneration); convert_be32_to_cpu(&pr_buff->prin_descriptor.prin_readfd.number_of_descriptor); @@ -223,9 +224,12 @@ void mpath_format_readfullstatus(struct prin_resp *pr_buff, int len, int noisy) } additional_length = pr_buff->prin_descriptor.prin_readfd.number_of_descriptor; + if (additional_length > MPATH_MAX_PARAM_LEN) { + condlog(3, "PRIN length %u exceeds max length %d", additional_length, + MPATH_MAX_PARAM_LEN); + return; + } - char tempbuff[MPATH_MAX_PARAM_LEN]; - struct prin_fulldescr fdesc; memset(&fdesc, 0, sizeof(struct prin_fulldescr)); memcpy( tempbuff, pr_buff->prin_descriptor.prin_readfd.private_buffer,MPATH_MAX_PARAM_LEN ); @@ -241,7 +245,7 @@ void mpath_format_readfullstatus(struct prin_resp *pr_buff, int len, int noisy) fdesc.rtpi = get_unaligned_be16(&p[18]); tid_len_len = get_unaligned_be32(&p[20]); - if (tid_len_len + 24 + k >= additional_length) { + if (tid_len_len + 24 + k > additional_length) { condlog(0, "%s: corrupt PRIN response: status descriptor end %d exceeds length %d", __func__, tid_len_len + k + 24, diff --git a/libmultipath/Makefile b/libmultipath/Makefile index 33f5269..a2be42e 100644 --- a/libmultipath/Makefile +++ b/libmultipath/Makefile @@ -7,7 +7,7 @@ SONAME = 0 DEVLIB = libmultipath.so LIBS = $(DEVLIB).$(SONAME) -CFLAGS += $(LIB_CFLAGS) 
-I$(mpathcmddir) -I$(mpathpersistdir) +CFLAGS += $(LIB_CFLAGS) -I$(mpathcmddir) -I$(mpathpersistdir) -I$(nvmedir) LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu -laio @@ -43,10 +43,21 @@ OBJS = memory.o parser.o vector.o devmapper.o callout.o \ switchgroup.o uxsock.o print.o alias.o log_pthread.o \ log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \ lock.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o \ - io_err_stat.o dm-generic.o generic.o foreign.o + io_err_stat.o dm-generic.o generic.o foreign.o nvme-lib.o all: $(LIBS) +nvme-lib.o: nvme-lib.c nvme-ioctl.c nvme-ioctl.h + $(CC) $(CFLAGS) -Wno-unused-function -c -o $@ $< + +make_static = $(shell sed '/^static/!s/^\([a-z]\{1,\} \)/static \1/' <$1 >$2) + +nvme-ioctl.c: nvme/nvme-ioctl.c + $(call make_static,$<,$@) + +nvme-ioctl.h: nvme/nvme-ioctl.h + $(call make_static,$<,$@) + $(LIBS): $(OBJS) $(CC) $(LDFLAGS) $(SHARED_FLAGS) -Wl,-soname=$@ -o $@ $(OBJS) $(LIBDEPS) $(LN) $@ $(DEVLIB) @@ -62,7 +73,7 @@ uninstall: $(RM) $(DESTDIR)$(syslibdir)/$(DEVLIB) clean: dep_clean - $(RM) core *.a *.o *.so *.so.* *.gz + $(RM) core *.a *.o *.so *.so.* *.gz nvme-ioctl.c nvme-ioctl.h include $(wildcard $(OBJS:.o=.d)) diff --git a/libmultipath/blacklist.c b/libmultipath/blacklist.c index 318ec03..e0d0279 100644 --- a/libmultipath/blacklist.c +++ b/libmultipath/blacklist.c @@ -192,7 +192,7 @@ setup_default_blist (struct config * conf) char * str; int i; - str = STRDUP("^(ram|raw|loop|fd|md|dm-|sr|scd|st|dcssblk)[0-9]"); + str = STRDUP("^(ram|zram|raw|loop|fd|md|dm-|sr|scd|st|dcssblk)[0-9]"); if (!str) return 1; if (store_ble(conf->blist_devnode, str, ORIGIN_DEFAULT)) @@ -232,24 +232,24 @@ setup_default_blist (struct config * conf) return 0; } -#define LOG_BLIST(M,S) \ +#define LOG_BLIST(M, S, lvl) \ if (vendor && product) \ - condlog(3, "%s: (%s:%s) %s %s", \ + condlog(lvl, "%s: (%s:%s) %s %s", \ dev, vendor, product, (M), (S)); \ else if (wwid && !dev) \ - condlog(3, "%s: %s %s", wwid, (M), 
(S)); \ + condlog(lvl, "%s: %s %s", wwid, (M), (S)); \ else if (wwid) \ - condlog(3, "%s: %s %s %s", dev, (M), wwid, (S)); \ + condlog(lvl, "%s: %s %s %s", dev, (M), wwid, (S)); \ else if (env) \ - condlog(3, "%s: %s %s %s", dev, (M), env, (S)); \ + condlog(lvl, "%s: %s %s %s", dev, (M), env, (S)); \ else if (protocol) \ - condlog(3, "%s: %s %s %s", dev, (M), protocol, (S)); \ + condlog(lvl, "%s: %s %s %s", dev, (M), protocol, (S)); \ else \ - condlog(3, "%s: %s %s", dev, (M), (S)) + condlog(lvl, "%s: %s %s", dev, (M), (S)) -void +static void log_filter (const char *dev, char *vendor, char *product, char *wwid, - const char *env, const char *protocol, int r) + const char *env, const char *protocol, int r, int lvl) { /* * Try to sort from most likely to least. @@ -258,37 +258,37 @@ log_filter (const char *dev, char *vendor, char *product, char *wwid, case MATCH_NOTHING: break; case MATCH_DEVICE_BLIST: - LOG_BLIST("vendor/product", "blacklisted"); + LOG_BLIST("vendor/product", "blacklisted", lvl); break; case MATCH_WWID_BLIST: - LOG_BLIST("wwid", "blacklisted"); + LOG_BLIST("wwid", "blacklisted", lvl); break; case MATCH_DEVNODE_BLIST: - LOG_BLIST("device node name", "blacklisted"); + LOG_BLIST("device node name", "blacklisted", lvl); break; case MATCH_PROPERTY_BLIST: - LOG_BLIST("udev property", "blacklisted"); + LOG_BLIST("udev property", "blacklisted", lvl); break; case MATCH_PROTOCOL_BLIST: - LOG_BLIST("protocol", "blacklisted"); + LOG_BLIST("protocol", "blacklisted", lvl); break; case MATCH_DEVICE_BLIST_EXCEPT: - LOG_BLIST("vendor/product", "whitelisted"); + LOG_BLIST("vendor/product", "whitelisted", lvl); break; case MATCH_WWID_BLIST_EXCEPT: - LOG_BLIST("wwid", "whitelisted"); + LOG_BLIST("wwid", "whitelisted", lvl); break; case MATCH_DEVNODE_BLIST_EXCEPT: - LOG_BLIST("device node name", "whitelisted"); + LOG_BLIST("device node name", "whitelisted", lvl); break; case MATCH_PROPERTY_BLIST_EXCEPT: - LOG_BLIST("udev property", "whitelisted"); + LOG_BLIST("udev 
property", "whitelisted", lvl); break; case MATCH_PROPERTY_BLIST_MISSING: - LOG_BLIST("blacklisted,", "udev property missing"); + LOG_BLIST("blacklisted,", "udev property missing", lvl); break; case MATCH_PROTOCOL_BLIST_EXCEPT: - LOG_BLIST("protocol", "whitelisted"); + LOG_BLIST("protocol", "whitelisted", lvl); break; } } @@ -306,7 +306,7 @@ filter_device (vector blist, vector elist, char * vendor, char * product, r = MATCH_DEVICE_BLIST; } - log_filter(dev, vendor, product, NULL, NULL, NULL, r); + log_filter(dev, vendor, product, NULL, NULL, NULL, r, 3); return r; } @@ -322,7 +322,7 @@ filter_devnode (vector blist, vector elist, char * dev) r = MATCH_DEVNODE_BLIST; } - log_filter(dev, NULL, NULL, NULL, NULL, NULL, r); + log_filter(dev, NULL, NULL, NULL, NULL, NULL, r, 3); return r; } @@ -338,7 +338,7 @@ filter_wwid (vector blist, vector elist, char * wwid, char * dev) r = MATCH_WWID_BLIST; } - log_filter(dev, NULL, NULL, wwid, NULL, NULL, r); + log_filter(dev, NULL, NULL, wwid, NULL, NULL, r, 3); return r; } @@ -357,7 +357,7 @@ filter_protocol(vector blist, vector elist, struct path * pp) r = MATCH_PROTOCOL_BLIST; } - log_filter(pp->dev, NULL, NULL, NULL, NULL, buf, r); + log_filter(pp->dev, NULL, NULL, NULL, NULL, buf, r, 3); return r; } @@ -366,7 +366,7 @@ filter_path (struct config * conf, struct path * pp) { int r; - r = filter_property(conf, pp->udev); + r = filter_property(conf, pp->udev, 3); if (r > 0) return r; r = filter_devnode(conf->blist_devnode, conf->elist_devnode, pp->dev); @@ -384,7 +384,7 @@ filter_path (struct config * conf, struct path * pp) } int -filter_property(struct config * conf, struct udev_device * udev) +filter_property(struct config *conf, struct udev_device *udev, int lvl) { const char *devname = udev_device_get_sysname(udev); struct udev_list_entry *list_entry; @@ -415,7 +415,7 @@ filter_property(struct config * conf, struct udev_device * udev) } } - log_filter(devname, NULL, NULL, NULL, env, NULL, r); + log_filter(devname, NULL, 
NULL, NULL, env, NULL, r, lvl); return r; } diff --git a/libmultipath/blacklist.h b/libmultipath/blacklist.h index 18903b6..4c8ec99 100644 --- a/libmultipath/blacklist.h +++ b/libmultipath/blacklist.h @@ -37,7 +37,7 @@ int filter_devnode (vector, vector, char *); int filter_wwid (vector, vector, char *, char *); int filter_device (vector, vector, char *, char *, char *); int filter_path (struct config *, struct path *); -int filter_property(struct config *, struct udev_device *); +int filter_property(struct config *, struct udev_device *, int); int filter_protocol(vector, vector, struct path *); int store_ble (vector, char *, int); int set_ble_device (vector, char *, char *, int); diff --git a/libmultipath/callout.c b/libmultipath/callout.c index d5ca27b..dac088c 100644 --- a/libmultipath/callout.c +++ b/libmultipath/callout.c @@ -68,19 +68,20 @@ int execute_program(char *path, char *value, int len) switch(pid) { case 0: /* child */ - close(STDOUT_FILENO); /* dup write side of pipe to STDOUT */ - if (dup(fds[1]) < 0) + if (dup2(fds[1], STDOUT_FILENO) < 0) { + condlog(1, "failed to dup2 stdout: %m"); return -1; + } + close(fds[0]); + close(fds[1]); /* Ignore writes to stderr */ null_fd = open("/dev/null", O_WRONLY); if (null_fd > 0) { - int err_fd __attribute__ ((unused)); - - close(STDERR_FILENO); - err_fd = dup(null_fd); + if (dup2(null_fd, STDERR_FILENO) < 0) + condlog(1, "failed to dup2 stderr: %m"); close(null_fd); } diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c index 848c4c3..f4fdcae 100644 --- a/libmultipath/checkers.c +++ b/libmultipath/checkers.c @@ -261,13 +261,6 @@ int checker_check (struct checker * c, int path_state) return r; } -int checker_selected(const struct checker *c) -{ - if (!c) - return 0; - return c->cls != NULL; -} - const char *checker_name(const struct checker *c) { if (!c || !c->cls) @@ -295,7 +288,7 @@ const char *checker_message(const struct checker *c) { int id; - if (!c || c->msgid < 0 || + if (!c || !c->cls || 
c->msgid < 0 || (c->msgid >= CHECKER_GENERIC_MSGTABLE_SIZE && c->msgid < CHECKER_FIRST_MSGID)) goto bad_id; diff --git a/libmultipath/checkers.h b/libmultipath/checkers.h index b2e8f9a..dab197f 100644 --- a/libmultipath/checkers.h +++ b/libmultipath/checkers.h @@ -129,6 +129,11 @@ struct checker { you want to stuff data in. */ }; +static inline int checker_selected(const struct checker *c) +{ + return c != NULL && c->cls != NULL; +} + const char *checker_state_name(int); int init_checkers(const char *); void cleanup_checkers (void); @@ -142,7 +147,6 @@ void checker_set_fd (struct checker *, int); void checker_enable (struct checker *); void checker_disable (struct checker *); int checker_check (struct checker *, int); -int checker_selected(const struct checker *); int checker_is_sync(const struct checker *); const char *checker_name (const struct checker *); /* diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c index 63b1962..6b08dbb 100644 --- a/libmultipath/checkers/tur.c +++ b/libmultipath/checkers/tur.c @@ -261,7 +261,7 @@ static void *tur_thread(void *ctx) tur_thread_cleanup_push(ct); rcu_register_thread(); - condlog(3, "%d:%d : tur checker starting up", major(ct->devt), + condlog(4, "%d:%d : tur checker starting up", major(ct->devt), minor(ct->devt)); tur_deep_sleep(ct); @@ -275,7 +275,7 @@ static void *tur_thread(void *ctx) pthread_cond_signal(&ct->active); pthread_mutex_unlock(&ct->lock); - condlog(3, "%d:%d : tur checker finished, state %s", major(ct->devt), + condlog(4, "%d:%d : tur checker finished, state %s", major(ct->devt), minor(ct->devt), checker_state_name(state)); running = uatomic_xchg(&ct->running, 0); @@ -415,7 +415,7 @@ int libcheck_check(struct checker * c) } pthread_mutex_unlock(&ct->lock); if (tur_status == PATH_PENDING) { - condlog(3, "%d:%d : tur checker still running", + condlog(4, "%d:%d : tur checker still running", major(ct->devt), minor(ct->devt)); } else { int running = uatomic_xchg(&ct->running, 0); diff --git 
a/libmultipath/config.c b/libmultipath/config.c index 5af7af5..24d71ae 100644 --- a/libmultipath/config.c +++ b/libmultipath/config.c @@ -369,6 +369,9 @@ merge_hwe (struct hwentry * dst, struct hwentry * src) merge_num(max_sectors_kb); merge_num(ghost_delay); merge_num(all_tg_pt); + merge_num(san_path_err_threshold); + merge_num(san_path_err_forget_rate); + merge_num(san_path_err_recovery_time); snprintf(id, sizeof(id), "%s/%s", dst->vendor, dst->product); reconcile_features_with_options(id, &dst->features, diff --git a/libmultipath/config.h b/libmultipath/config.h index 7d0cd9a..b938c26 100644 --- a/libmultipath/config.h +++ b/libmultipath/config.h @@ -76,6 +76,9 @@ struct hwentry { int deferred_remove; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_forget_rate; + int san_path_err_recovery_time; int marginal_path_err_sample_time; int marginal_path_err_rate_threshold; int marginal_path_err_recheck_gap_time; @@ -112,6 +115,9 @@ struct mpentry { int deferred_remove; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_forget_rate; + int san_path_err_recovery_time; int marginal_path_err_sample_time; int marginal_path_err_rate_threshold; int marginal_path_err_recheck_gap_time; @@ -162,6 +168,9 @@ struct config { int processed_main_config; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_forget_rate; + int san_path_err_recovery_time; int marginal_path_err_sample_time; int marginal_path_err_rate_threshold; int marginal_path_err_recheck_gap_time; diff --git a/libmultipath/configure.c b/libmultipath/configure.c index ed3e30f..af4d78d 100644 --- a/libmultipath/configure.c +++ b/libmultipath/configure.c @@ -44,6 +44,10 @@ #include "sysfs.h" #include "io_err_stat.h" +/* Time in ms to wait for pending checkers in setup_map() */ +#define WAIT_CHECKERS_PENDING_MS 10 +#define WAIT_ALL_CHECKERS_PENDING_MS 90 + /* group paths in pg by host 
adapter */ int group_by_host_adapter(struct pathgroup *pgp, vector adapters) @@ -257,12 +261,43 @@ int rr_optimize_path_order(struct pathgroup *pgp) return 0; } +static int wait_for_pending_paths(struct multipath *mpp, + struct config *conf, + int n_pending, int goal, int wait_ms) +{ + static const struct timespec millisec = + { .tv_sec = 0, .tv_nsec = 1000*1000 }; + int i, j; + struct path *pp; + struct pathgroup *pgp; + struct timespec ts; + + do { + vector_foreach_slot(mpp->pg, pgp, i) { + vector_foreach_slot(pgp->paths, pp, j) { + if (pp->state != PATH_PENDING) + continue; + pp->state = get_state(pp, conf, + 0, PATH_PENDING); + if (pp->state != PATH_PENDING && + --n_pending <= goal) + return 0; + } + } + ts = millisec; + while (nanosleep(&ts, &ts) != 0 && errno == EINTR) + /* nothing */; + } while (--wait_ms > 0); + + return n_pending; +} + int setup_map(struct multipath *mpp, char *params, int params_size, struct vectors *vecs) { struct pathgroup * pgp; struct config *conf; - int i; + int i, n_paths; /* * don't bother if devmap size is unknown @@ -313,6 +348,9 @@ int setup_map(struct multipath *mpp, char *params, int params_size, select_marginal_path_err_rate_threshold(conf, mpp); select_marginal_path_err_recheck_gap_time(conf, mpp); select_marginal_path_double_failed_time(conf, mpp); + select_san_path_err_threshold(conf, mpp); + select_san_path_err_forget_rate(conf, mpp); + select_san_path_err_recovery_time(conf, mpp); select_skip_kpartx(conf, mpp); select_max_sectors_kb(conf, mpp); select_ghost_delay(conf, mpp); @@ -321,12 +359,24 @@ int setup_map(struct multipath *mpp, char *params, int params_size, sysfs_set_scsi_tmo(mpp, conf->checkint); pthread_cleanup_pop(1); - if (mpp->marginal_path_double_failed_time > 0 && - mpp->marginal_path_err_sample_time > 0 && - mpp->marginal_path_err_recheck_gap_time > 0 && - mpp->marginal_path_err_rate_threshold >= 0) + if (marginal_path_check_enabled(mpp)) { + if (delay_check_enabled(mpp)) { + condlog(1, "%s: WARNING: both 
marginal_path and delay_checks error detection selected", + mpp->alias); + condlog(0, "%s: unexpected behavior may occur!", + mpp->alias); + } start_io_err_stat_thread(vecs); - /* + } + if (san_path_check_enabled(mpp) && delay_check_enabled(mpp)) { + condlog(1, "%s: WARNING: both san_path_err and delay_checks error detection selected", + mpp->alias); + condlog(0, "%s: unexpected behavior may occur!", + mpp->alias); + } + + n_paths = VECTOR_SIZE(mpp->paths); + /* * assign paths to path groups -- start with no groups and all paths * in mpp->paths */ @@ -340,6 +390,30 @@ int setup_map(struct multipath *mpp, char *params, int params_size, if (mpp->pgpolicyfn && mpp->pgpolicyfn(mpp)) return 1; + /* + * If async state detection is used, see if pending state checks + * have finished, to get nr_active right. We can't wait until the + * checkers time out, as that may take 30s or more, and we are + * holding the vecs lock. + */ + if (conf->force_sync == 0 && n_paths > 0) { + int n_pending = pathcount(mpp, PATH_PENDING); + + if (n_pending > 0) + n_pending = wait_for_pending_paths( + mpp, conf, n_pending, 0, + WAIT_CHECKERS_PENDING_MS); + /* ALL paths pending - wait some more, but be satisfied + with only some paths finished */ + if (n_pending == n_paths) + n_pending = wait_for_pending_paths( + mpp, conf, n_pending, + n_paths >= 4 ? 
2 : 1, + WAIT_ALL_CHECKERS_PENDING_MS); + if (n_pending > 0) + condlog(2, "%s: setting up map with %d/%d path checkers pending", + mpp->alias, n_pending, n_paths); + } mpp->nr_active = pathcount(mpp, PATH_UP) + pathcount(mpp, PATH_GHOST); /* @@ -788,15 +862,6 @@ fail: return 1; } -/* - * Return value: - */ -#define DOMAP_RETRY -1 -#define DOMAP_FAIL 0 -#define DOMAP_OK 1 -#define DOMAP_EXIST 2 -#define DOMAP_DRY 3 - int domap(struct multipath *mpp, char *params, int is_daemon) { int r = DOMAP_FAIL; @@ -976,7 +1041,7 @@ int check_daemon(void) if (recv_packet(fd, &reply, timeout) != 0) goto out; - if (strstr(reply, "shutdown")) + if (reply && strstr(reply, "shutdown")) goto out_free; ret = 1; @@ -998,8 +1063,8 @@ out: int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, int force_reload, enum mpath_cmds cmd) { - int r = 1; - int k, i; + int ret = CP_FAIL; + int k, i, r; int is_daemon = (cmd == CMD_NONE) ? 1 : 0; char params[PARAMS_SIZE]; struct multipath * mpp; @@ -1009,6 +1074,7 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, vector pathvec = vecs->pathvec; struct config *conf; int allow_queueing; + uint64_t *size_mismatch_seen; /* ignore refwwid if it's empty */ if (refwwid && !strlen(refwwid)) @@ -1019,6 +1085,14 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, pp1->mpp = NULL; } } + + if (VECTOR_SIZE(pathvec) == 0) + return CP_OK; + size_mismatch_seen = calloc((VECTOR_SIZE(pathvec) - 1) / 64 + 1, + sizeof(uint64_t)); + if (size_mismatch_seen == NULL) + return CP_FAIL; + vector_foreach_slot (pathvec, pp1, k) { int invalid; /* skip this path for some reason */ @@ -1038,8 +1112,8 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, continue; } - /* 2. if path already coalesced */ - if (pp1->mpp) + /* 2. if path already coalesced, or seen and discarded */ + if (pp1->mpp || is_bit_set_in_array(k, size_mismatch_seen)) continue; /* 3. 
if path has disappeared */ @@ -1088,9 +1162,10 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, * ouch, avoid feeding that to the DM */ condlog(0, "%s: size %llu, expected %llu. " - "Discard", pp2->dev_t, pp2->size, + "Discard", pp2->dev, pp2->size, mpp->size); mpp->action = ACT_REJECT; + set_bit_in_array(i, size_mismatch_seen); } } verify_paths(mpp, vecs); @@ -1119,8 +1194,10 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, "ignoring" : "removing"); remove_map(mpp, vecs, 0); continue; - } else /* if (r == DOMAP_RETRY) */ - return r; + } else /* if (r == DOMAP_RETRY && !is_daemon) */ { + ret = CP_RETRY; + goto out; + } } if (r == DOMAP_DRY) continue; @@ -1162,7 +1239,7 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, if (newmp) { if (mpp->action != ACT_REJECT) { if (!vector_alloc_slot(newmp)) - return 1; + goto out; vector_set_slot(newmp, mpp); } else @@ -1193,7 +1270,10 @@ int coalesce_paths (struct vectors * vecs, vector newmp, char * refwwid, condlog(2, "%s: remove (dead)", alias); } } - return 0; + ret = CP_OK; +out: + free(size_mismatch_seen); + return ret; } struct udev_device *get_udev_device(const char *dev, enum devtypes dev_type) @@ -1289,7 +1369,7 @@ int get_refwwid(enum mpath_cmds cmd, char *dev, enum devtypes dev_type, conf = get_multipath_config(); pthread_cleanup_push(put_multipath_config, conf); if (pp->udev && pp->uid_attribute && - filter_property(conf, pp->udev) > 0) + filter_property(conf, pp->udev, 3) > 0) invalid = 1; pthread_cleanup_pop(1); if (invalid) @@ -1329,7 +1409,7 @@ int get_refwwid(enum mpath_cmds cmd, char *dev, enum devtypes dev_type, conf = get_multipath_config(); pthread_cleanup_push(put_multipath_config, conf); if (pp->udev && pp->uid_attribute && - filter_property(conf, pp->udev) > 0) + filter_property(conf, pp->udev, 3) > 0) invalid = 1; pthread_cleanup_pop(1); if (invalid) @@ -1358,7 +1438,7 @@ int get_refwwid(enum mpath_cmds cmd, char *dev, enum 
devtypes dev_type, conf = get_multipath_config(); pthread_cleanup_push(put_multipath_config, conf); if (pp->udev && pp->uid_attribute && - filter_property(conf, pp->udev) > 0) + filter_property(conf, pp->udev, 3) > 0) invalid = 1; pthread_cleanup_pop(1); if (invalid) diff --git a/libmultipath/configure.h b/libmultipath/configure.h index 8b56d33..d750900 100644 --- a/libmultipath/configure.h +++ b/libmultipath/configure.h @@ -23,6 +23,28 @@ enum actions { ACT_IMPOSSIBLE, }; +/* + * Return value of domap() + * DAEMON_RETRY is only used for ACT_CREATE (see domap()). + */ +enum { + DOMAP_RETRY = -1, + DOMAP_FAIL = 0, + DOMAP_OK = 1, + DOMAP_EXIST = 2, + DOMAP_DRY = 3 +}; + +/* + * Return value of coalesce_paths() + * CP_RETRY is only used in non-daemon case (multipath). + */ +enum { + CP_OK = 0, + CP_FAIL, + CP_RETRY, +}; + #define FLUSH_ONE 1 #define FLUSH_ALL 2 diff --git a/libmultipath/devmapper.c b/libmultipath/devmapper.c index 0433b49..3294bd4 100644 --- a/libmultipath/devmapper.c +++ b/libmultipath/devmapper.c @@ -692,9 +692,15 @@ out: return r; } +/* + * returns: + * 1 : is multipath device + * 0 : is not multipath device + * -1 : error + */ int dm_is_mpath(const char *name) { - int r = 0; + int r = -1; struct dm_task *dmt; struct dm_info info; uint64_t start, length; @@ -703,33 +709,44 @@ int dm_is_mpath(const char *name) const char *uuid; if (!(dmt = libmp_dm_task_create(DM_DEVICE_TABLE))) - return 0; + goto out; if (!dm_task_set_name(dmt, name)) - goto out; + goto out_task; dm_task_no_open_count(dmt); if (!dm_task_run(dmt)) - goto out; + goto out_task; - if (!dm_task_get_info(dmt, &info) || !info.exists) - goto out; + if (!dm_task_get_info(dmt, &info)) + goto out_task; + + r = 0; + + if (!info.exists) + goto out_task; uuid = dm_task_get_uuid(dmt); if (!uuid || strncmp(uuid, UUID_PREFIX, UUID_PREFIX_LEN) != 0) - goto out; + goto out_task; /* Fetch 1st target */ - dm_get_next_target(dmt, NULL, &start, &length, &target_type, &params); + if (dm_get_next_target(dmt, 
NULL, &start, &length, &target_type, + &params) != NULL) + /* multiple targets */ + goto out_task; if (!target_type || strcmp(target_type, TGT_MPATH) != 0) - goto out; + goto out_task; r = 1; -out: +out_task: dm_task_destroy(dmt); +out: + if (r < 0) + condlog(2, "%s: dm command failed in %s", name, __FUNCTION__); return r; } @@ -823,7 +840,7 @@ int _dm_flush_map (const char * mapname, int need_sync, int deferred_remove, unsigned long long mapsize; char params[PARAMS_SIZE] = {0}; - if (!dm_is_mpath(mapname)) + if (dm_is_mpath(mapname) != 1) return 0; /* nothing to do */ /* if the device currently has no partitions, do not @@ -1087,7 +1104,7 @@ dm_get_maps (vector mp) } do { - if (!dm_is_mpath(names->name)) + if (dm_is_mpath(names->name) != 1) goto next; mpp = dm_get_multipath(names->name); diff --git a/libmultipath/dict.c b/libmultipath/dict.c index a81c051..eaad4f1 100644 --- a/libmultipath/dict.c +++ b/libmultipath/dict.c @@ -327,7 +327,7 @@ def_find_multipaths_handler(struct config *conf, vector strvec) int i; if (set_yes_no_undef(strvec, &conf->find_multipaths) == 0 && - conf->find_multipaths != YNU_UNDEF) + conf->find_multipaths != FIND_MULTIPATHS_UNDEF) return 0; buff = set_value(strvec); @@ -1217,6 +1217,33 @@ declare_hw_handler(delay_wait_checks, set_off_int_undef) declare_hw_snprint(delay_wait_checks, print_off_int_undef) declare_mp_handler(delay_wait_checks, set_off_int_undef) declare_mp_snprint(delay_wait_checks, print_off_int_undef) +declare_def_handler(san_path_err_threshold, set_off_int_undef) +declare_def_snprint_defint(san_path_err_threshold, print_off_int_undef, + DEFAULT_ERR_CHECKS) +declare_ovr_handler(san_path_err_threshold, set_off_int_undef) +declare_ovr_snprint(san_path_err_threshold, print_off_int_undef) +declare_hw_handler(san_path_err_threshold, set_off_int_undef) +declare_hw_snprint(san_path_err_threshold, print_off_int_undef) +declare_mp_handler(san_path_err_threshold, set_off_int_undef) +declare_mp_snprint(san_path_err_threshold, 
print_off_int_undef) +declare_def_handler(san_path_err_forget_rate, set_off_int_undef) +declare_def_snprint_defint(san_path_err_forget_rate, print_off_int_undef, + DEFAULT_ERR_CHECKS) +declare_ovr_handler(san_path_err_forget_rate, set_off_int_undef) +declare_ovr_snprint(san_path_err_forget_rate, print_off_int_undef) +declare_hw_handler(san_path_err_forget_rate, set_off_int_undef) +declare_hw_snprint(san_path_err_forget_rate, print_off_int_undef) +declare_mp_handler(san_path_err_forget_rate, set_off_int_undef) +declare_mp_snprint(san_path_err_forget_rate, print_off_int_undef) +declare_def_handler(san_path_err_recovery_time, set_off_int_undef) +declare_def_snprint_defint(san_path_err_recovery_time, print_off_int_undef, + DEFAULT_ERR_CHECKS) +declare_ovr_handler(san_path_err_recovery_time, set_off_int_undef) +declare_ovr_snprint(san_path_err_recovery_time, print_off_int_undef) +declare_hw_handler(san_path_err_recovery_time, set_off_int_undef) +declare_hw_snprint(san_path_err_recovery_time, print_off_int_undef) +declare_mp_handler(san_path_err_recovery_time, set_off_int_undef) +declare_mp_snprint(san_path_err_recovery_time, print_off_int_undef) declare_def_handler(marginal_path_err_sample_time, set_off_int_undef) declare_def_snprint_defint(marginal_path_err_sample_time, print_off_int_undef, DEFAULT_ERR_CHECKS) @@ -1620,6 +1647,9 @@ init_keywords(vector keywords) install_keyword("config_dir", &def_config_dir_handler, &snprint_def_config_dir); install_keyword("delay_watch_checks", &def_delay_watch_checks_handler, &snprint_def_delay_watch_checks); install_keyword("delay_wait_checks", &def_delay_wait_checks_handler, &snprint_def_delay_wait_checks); + install_keyword("san_path_err_threshold", &def_san_path_err_threshold_handler, &snprint_def_san_path_err_threshold); + install_keyword("san_path_err_forget_rate", &def_san_path_err_forget_rate_handler, &snprint_def_san_path_err_forget_rate); + install_keyword("san_path_err_recovery_time", 
&def_san_path_err_recovery_time_handler, &snprint_def_san_path_err_recovery_time); install_keyword("marginal_path_err_sample_time", &def_marginal_path_err_sample_time_handler, &snprint_def_marginal_path_err_sample_time); install_keyword("marginal_path_err_rate_threshold", &def_marginal_path_err_rate_threshold_handler, &snprint_def_marginal_path_err_rate_threshold); install_keyword("marginal_path_err_recheck_gap_time", &def_marginal_path_err_recheck_gap_time_handler, &snprint_def_marginal_path_err_recheck_gap_time); @@ -1714,6 +1744,9 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &hw_deferred_remove_handler, &snprint_hw_deferred_remove); install_keyword("delay_watch_checks", &hw_delay_watch_checks_handler, &snprint_hw_delay_watch_checks); install_keyword("delay_wait_checks", &hw_delay_wait_checks_handler, &snprint_hw_delay_wait_checks); + install_keyword("san_path_err_threshold", &hw_san_path_err_threshold_handler, &snprint_hw_san_path_err_threshold); + install_keyword("san_path_err_forget_rate", &hw_san_path_err_forget_rate_handler, &snprint_hw_san_path_err_forget_rate); + install_keyword("san_path_err_recovery_time", &hw_san_path_err_recovery_time_handler, &snprint_hw_san_path_err_recovery_time); install_keyword("marginal_path_err_sample_time", &hw_marginal_path_err_sample_time_handler, &snprint_hw_marginal_path_err_sample_time); install_keyword("marginal_path_err_rate_threshold", &hw_marginal_path_err_rate_threshold_handler, &snprint_hw_marginal_path_err_rate_threshold); install_keyword("marginal_path_err_recheck_gap_time", &hw_marginal_path_err_recheck_gap_time_handler, &snprint_hw_marginal_path_err_recheck_gap_time); @@ -1750,6 +1783,9 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &ovr_deferred_remove_handler, &snprint_ovr_deferred_remove); install_keyword("delay_watch_checks", &ovr_delay_watch_checks_handler, &snprint_ovr_delay_watch_checks); install_keyword("delay_wait_checks", &ovr_delay_wait_checks_handler, 
&snprint_ovr_delay_wait_checks); + install_keyword("san_path_err_threshold", &ovr_san_path_err_threshold_handler, &snprint_ovr_san_path_err_threshold); + install_keyword("san_path_err_forget_rate", &ovr_san_path_err_forget_rate_handler, &snprint_ovr_san_path_err_forget_rate); + install_keyword("san_path_err_recovery_time", &ovr_san_path_err_recovery_time_handler, &snprint_ovr_san_path_err_recovery_time); install_keyword("marginal_path_err_sample_time", &ovr_marginal_path_err_sample_time_handler, &snprint_ovr_marginal_path_err_sample_time); install_keyword("marginal_path_err_rate_threshold", &ovr_marginal_path_err_rate_threshold_handler, &snprint_ovr_marginal_path_err_rate_threshold); install_keyword("marginal_path_err_recheck_gap_time", &ovr_marginal_path_err_recheck_gap_time_handler, &snprint_ovr_marginal_path_err_recheck_gap_time); @@ -1785,6 +1821,9 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &mp_deferred_remove_handler, &snprint_mp_deferred_remove); install_keyword("delay_watch_checks", &mp_delay_watch_checks_handler, &snprint_mp_delay_watch_checks); install_keyword("delay_wait_checks", &mp_delay_wait_checks_handler, &snprint_mp_delay_wait_checks); + install_keyword("san_path_err_threshold", &mp_san_path_err_threshold_handler, &snprint_mp_san_path_err_threshold); + install_keyword("san_path_err_forget_rate", &mp_san_path_err_forget_rate_handler, &snprint_mp_san_path_err_forget_rate); + install_keyword("san_path_err_recovery_time", &mp_san_path_err_recovery_time_handler, &snprint_mp_san_path_err_recovery_time); install_keyword("marginal_path_err_sample_time", &mp_marginal_path_err_sample_time_handler, &snprint_mp_marginal_path_err_sample_time); install_keyword("marginal_path_err_rate_threshold", &mp_marginal_path_err_rate_threshold_handler, &snprint_mp_marginal_path_err_rate_threshold); install_keyword("marginal_path_err_recheck_gap_time", &mp_marginal_path_err_recheck_gap_time_handler, &snprint_mp_marginal_path_err_recheck_gap_time); 
diff --git a/libmultipath/discovery.c b/libmultipath/discovery.c index 63558ad..10bd8cd 100644 --- a/libmultipath/discovery.c +++ b/libmultipath/discovery.c @@ -520,7 +520,7 @@ sysfs_get_asymmetric_access_state(struct path *pp, char *buff, int buflen) /* Parse error, ignore */ return 0; } - return preferred; + return !!preferred; } static void @@ -711,7 +711,7 @@ sysfs_set_scsi_tmo (struct multipath *mpp, int checkint) int dev_loss_tmo = mpp->dev_loss; if (mpp->no_path_retry > 0) { - uint64_t no_path_retry_tmo = mpp->no_path_retry * checkint; + uint64_t no_path_retry_tmo = (uint64_t)mpp->no_path_retry * checkint; if (no_path_retry_tmo > MAX_DEV_LOSS_TMO) no_path_retry_tmo = MAX_DEV_LOSS_TMO; @@ -1106,7 +1106,9 @@ get_vpd_sgio (int fd, int pg, char * str, int maxlen) memset(buff, 0x0, 4096); if (sgio_get_vpd(buff, 4096, fd, pg) < 0) { - condlog(3, "failed to issue vpd inquiry for pg%02x", + int lvl = pg == 0x80 || pg == 0x83 ? 3 : 4; + + condlog(lvl, "failed to issue vpd inquiry for pg%02x", pg); return -errno; } @@ -1382,7 +1384,7 @@ common_sysfs_pathinfo (struct path * pp) devt = udev_device_get_devnum(pp->udev); snprintf(pp->dev_t, BLK_DEV_SIZE, "%d:%d", major(devt), minor(devt)); - condlog(3, "%s: dev_t = %s", pp->dev, pp->dev_t); + condlog(4, "%s: dev_t = %s", pp->dev, pp->dev_t); if (sysfs_get_size(pp, &pp->size)) return PATHINFO_FAILED; @@ -1433,7 +1435,7 @@ path_offline (struct path * pp) } - condlog(3, "%s: path state = %s", pp->dev, buff); + condlog(4, "%s: path state = %s", pp->dev, buff); if (pp->bus == SYSFS_BUS_SCSI) { if (!strncmp(buff, "offline", 7)) { @@ -1552,8 +1554,6 @@ get_state (struct path * pp, struct config *conf, int daemon, int oldstate) struct checker * c = &pp->checker; int state; - condlog(3, "%s: get_state", pp->dev); - if (!checker_selected(c)) { if (daemon) { if (pathinfo(pp, conf, DI_SYSFS) != PATHINFO_OK) { @@ -1601,6 +1601,7 @@ get_prio (struct path * pp) struct prio * p; struct config *conf; int checker_timeout; + int old_prio; 
if (!pp) return 0; @@ -1621,13 +1622,14 @@ get_prio (struct path * pp) conf = get_multipath_config(); checker_timeout = conf->checker_timeout; put_multipath_config(conf); + old_prio = pp->priority; pp->priority = prio_getprio(p, pp, checker_timeout); if (pp->priority < 0) { condlog(3, "%s: %s prio error", pp->dev, prio_name(p)); pp->priority = PRIO_UNDEF; return 1; } - condlog(3, "%s: %s prio = %u", + condlog((old_prio == pp->priority ? 4 : 3), "%s: %s prio = %u", pp->dev, prio_name(p), pp->priority); return 0; } @@ -1865,11 +1867,11 @@ int pathinfo(struct path *pp, struct config *conf, int mask) udev_device_get_sysattr_value(pp->udev, "hidden"); if (hidden && !strcmp(hidden, "1")) { - condlog(3, "%s: hidden", pp->dev); + condlog(4, "%s: hidden", pp->dev); return PATHINFO_SKIPPED; } if (is_claimed_by_foreign(pp->udev) || - filter_property(conf, pp->udev) > 0) + filter_property(conf, pp->udev, 4) > 0) return PATHINFO_SKIPPED; } @@ -1878,7 +1880,7 @@ int pathinfo(struct path *pp, struct config *conf, int mask) pp->dev) > 0) return PATHINFO_SKIPPED; - condlog(3, "%s: mask = 0x%x", pp->dev, mask); + condlog(4, "%s: mask = 0x%x", pp->dev, mask); /* * Sanity check: we need the device number to @@ -1964,8 +1966,12 @@ int pathinfo(struct path *pp, struct config *conf, int mask) if ((mask & DI_WWID) && !strlen(pp->wwid)) { get_uid(pp, path_state, pp->udev); if (!strlen(pp->wwid)) { - pp->initialized = INIT_MISSING_UDEV; - pp->tick = conf->retrigger_delay; + if (pp->bus == SYSFS_BUS_UNDEF) + return PATHINFO_SKIPPED; + if (pp->initialized != INIT_FAILED) { + pp->initialized = INIT_MISSING_UDEV; + pp->tick = conf->retrigger_delay; + } return PATHINFO_OK; } else @@ -1998,7 +2004,7 @@ blank: * Recoverable error, for example faulty or offline path */ pp->chkrstate = pp->state = PATH_DOWN; - if (pp->initialized == INIT_FAILED) + if (pp->initialized == INIT_NEW || pp->initialized == INIT_FAILED) memset(pp->wwid, 0, WWID_SIZE); return PATHINFO_OK; diff --git 
a/libmultipath/dmparser.c b/libmultipath/dmparser.c index 620f507..ac13ec0 100644 --- a/libmultipath/dmparser.c +++ b/libmultipath/dmparser.c @@ -117,7 +117,7 @@ assemble_map (struct multipath * mp, char * params, int len) } FREE(f); - condlog(3, "%s: assembled map [%s]", mp->alias, params); + condlog(4, "%s: assembled map [%s]", mp->alias, params); return 0; err: @@ -145,7 +145,7 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp, p = params; - condlog(3, "%s: disassemble map [%s]", mpp->alias, params); + condlog(4, "%s: disassemble map [%s]", mpp->alias, params); /* * features @@ -410,7 +410,7 @@ int disassemble_status(char *params, struct multipath *mpp) p = params; - condlog(3, "%s: disassemble status [%s]", mpp->alias, params); + condlog(4, "%s: disassemble status [%s]", mpp->alias, params); /* * features diff --git a/libmultipath/foreign/Makefile b/libmultipath/foreign/Makefile index fe98ddf..fae58a0 100644 --- a/libmultipath/foreign/Makefile +++ b/libmultipath/foreign/Makefile @@ -1,13 +1,12 @@ # # Copyright (C) 2003 Christophe Varoqui, # +TOPDIR=../.. include ../../Makefile.inc -CFLAGS += $(LIB_CFLAGS) -I.. +CFLAGS += $(LIB_CFLAGS) -I.. -I$(nvmedir) -# If you add or remove a checker also update multipath/multipath.conf.5 -LIBS= \ - libforeign-nvme.so +LIBS = libforeign-nvme.so all: $(LIBS) diff --git a/libmultipath/foreign/nvme.c b/libmultipath/foreign/nvme.c index c753a74..7e654ec 100644 --- a/libmultipath/foreign/nvme.c +++ b/libmultipath/foreign/nvme.c @@ -15,6 +15,8 @@ along with this program. If not, see . 
*/ +#include "nvme-lib.h" +#include #include #include #include @@ -27,6 +29,7 @@ #include #include #include +#include #include "util.h" #include "vector.h" #include "generic.h" @@ -40,17 +43,22 @@ static const char N_A[] = "n/a"; const char *THIS; struct nvme_map; +struct nvme_pathgroup { + struct gen_pathgroup gen; + struct _vector pathvec; +}; + struct nvme_path { struct gen_path gen; struct udev_device *udev; struct udev_device *ctl; struct nvme_map *map; bool seen; -}; - -struct nvme_pathgroup { - struct gen_pathgroup gen; - vector pathvec; + /* + * The kernel works in failover mode. + * Each path has a separate path group. + */ + struct nvme_pathgroup pg; }; struct nvme_map { @@ -58,12 +66,9 @@ struct nvme_map { struct udev_device *udev; struct udev_device *subsys; dev_t devt; - /* Just one static pathgroup for NVMe for now */ - struct nvme_pathgroup pg; - struct gen_pathgroup *gpg; struct _vector pgvec; - vector pathvec; int nr_live; + int ana_supported; }; #define NAME_LEN 64 /* buffer length for temp attributes */ @@ -76,29 +81,33 @@ struct nvme_map { #define const_gen_path_to_nvme(g) ((const struct nvme_path*)(g)) #define gen_path_to_nvme(g) ((struct nvme_path*)(g)) #define nvme_path_to_gen(n) &((n)->gen) +#define nvme_pg_to_path(x) (VECTOR_SLOT(&((x)->pathvec), 0)) +#define nvme_path_to_pg(x) &((x)->pg) static void cleanup_nvme_path(struct nvme_path *path) { condlog(5, "%s: %p %p", __func__, path, path->udev); if (path->udev) udev_device_unref(path->udev); + vector_reset(&path->pg.pathvec); + /* ctl is implicitly referenced by udev, no need to unref */ free(path); } static void cleanup_nvme_map(struct nvme_map *map) { - if (map->pathvec) { - struct nvme_path *path; - int i; + struct nvme_pathgroup *pg; + struct nvme_path *path; + int i; - vector_foreach_slot_backwards(map->pathvec, path, i) { - condlog(5, "%s: %d %p", __func__, i, path); - cleanup_nvme_path(path); - vector_del_slot(map->pathvec, i); - } + vector_foreach_slot_backwards(&map->pgvec, pg, i) 
{ + path = nvme_pg_to_path(pg); + condlog(5, "%s: %d %p", __func__, i, path); + cleanup_nvme_path(path); + vector_del_slot(&map->pgvec, i); } - vector_free(map->pathvec); + vector_reset(&map->pgvec); if (map->udev) udev_device_unref(map->udev); /* subsys is implicitly referenced by udev, no need to unref */ @@ -139,10 +148,11 @@ static int snprint_nvme_map(const struct gen_multipath *gmp, return snprintf(buff, len, "%s", udev_device_get_sysname(nvm->udev)); case 'n': - return snprintf(buff, len, "%s:NQN:%s", - udev_device_get_sysname(nvm->subsys), + return snprintf(buff, len, "%s:nsid.%s", udev_device_get_sysattr_value(nvm->subsys, - "subsysnqn")); + "subsysnqn"), + udev_device_get_sysattr_value(nvm->udev, + "nsid")); case 'w': return snprintf(buff, len, "%s", udev_device_get_sysattr_value(nvm->udev, @@ -178,11 +188,14 @@ static int snprint_nvme_map(const struct gen_multipath *gmp, return snprintf(buff, len, "%s", "rw"); case 'G': return snprintf(buff, len, "%s", THIS); + case 'h': + if (nvm->ana_supported == YNU_YES) + return snprintf(buff, len, "ANA"); default: - return snprintf(buff, len, N_A); break; } - return 0; + + return snprintf(buff, len, N_A); } static const struct _vector* @@ -190,7 +203,7 @@ nvme_pg_get_paths(const struct gen_pathgroup *gpg) { const struct nvme_pathgroup *gp = const_gen_pg_to_nvme(gpg); /* This is all used under the lock, no need to copy */ - return gp->pathvec; + return &gp->pathvec; } static void @@ -199,12 +212,6 @@ nvme_pg_rel_paths(const struct gen_pathgroup *gpg, const struct _vector *v) /* empty */ } -static int snprint_nvme_pg(const struct gen_pathgroup *gmp, - char *buff, int len, char wildcard) -{ - return snprintf(buff, len, N_A); -} - static int snprint_hcil(const struct nvme_path *np, char *buf, int len) { unsigned int nvmeid, ctlid, nsid; @@ -242,8 +249,27 @@ static int snprint_nvme_path(const struct gen_path *gp, devt = udev_device_get_devnum(np->udev); return snprintf(buff, len, "%u:%u", major(devt), minor(devt)); case 
'o': - sysfs_attr_get_value(np->ctl, "state", fld, sizeof(fld)); - return snprintf(buff, len, "%s", fld); + if (sysfs_attr_get_value(np->ctl, "state", + fld, sizeof(fld)) > 0) + return snprintf(buff, len, "%s", fld); + break; + case 'T': + if (sysfs_attr_get_value(np->udev, "ana_state", fld, + sizeof(fld)) > 0) + return snprintf(buff, len, "%s", fld); + break; + case 'p': + if (sysfs_attr_get_value(np->udev, "ana_state", fld, + sizeof(fld)) > 0) { + rstrip(fld); + if (!strcmp(fld, "optimized")) + return snprintf(buff, len, "%d", 50); + else if (!strcmp(fld, "non-optimized")) + return snprintf(buff, len, "%d", 10); + else + return snprintf(buff, len, "%d", 0); + } + break; case 's': snprintf(fld, sizeof(fld), "%s", udev_device_get_sysattr_value(np->ctl, @@ -281,12 +307,30 @@ static int snprint_nvme_path(const struct gen_path *gp, udev_device_get_sysname(pci)); /* fall through */ default: - return snprintf(buff, len, "%s", N_A); break; } + return snprintf(buff, len, "%s", N_A); return 0; } +static int snprint_nvme_pg(const struct gen_pathgroup *gmp, + char *buff, int len, char wildcard) +{ + const struct nvme_pathgroup *pg = const_gen_pg_to_nvme(gmp); + const struct nvme_path *path = nvme_pg_to_path(pg); + + switch (wildcard) { + case 't': + return snprint_nvme_path(nvme_path_to_gen(path), + buff, len, 'T'); + case 'p': + return snprint_nvme_path(nvme_path_to_gen(path), + buff, len, 'p'); + default: + return snprintf(buff, len, N_A); + } +} + static int nvme_style(const struct gen_multipath* gm, char *buf, int len, int verbosity) { @@ -432,7 +476,7 @@ static struct nvme_map *_find_nvme_map_by_devt(const struct context *ctx, static struct nvme_path * _find_path_by_syspath(struct nvme_map *map, const char *syspath) { - struct nvme_path *path; + struct nvme_pathgroup *pg; char real[PATH_MAX]; const char *ppath; int i; @@ -443,7 +487,9 @@ _find_path_by_syspath(struct nvme_map *map, const char *syspath) ppath = syspath; } - vector_foreach_slot(map->pathvec, path, i) { + 
vector_foreach_slot(&map->pgvec, pg, i) { + struct nvme_path *path = nvme_pg_to_path(pg); + if (!strcmp(ppath, udev_device_get_syspath(path->udev))) return path; @@ -531,20 +577,57 @@ out: return blkdev; } +static void test_ana_support(struct nvme_map *map, struct udev_device *ctl) +{ + const char *dev_t; + char sys_path[64]; + long fd; + int rc; + + if (map->ana_supported != YNU_UNDEF) + return; + + dev_t = udev_device_get_sysattr_value(ctl, "dev"); + if (snprintf(sys_path, sizeof(sys_path), "/dev/char/%s", dev_t) + >= sizeof(sys_path)) + return; + + fd = open(sys_path, O_RDONLY); + if (fd == -1) { + condlog(2, "%s: error opening %s", __func__, sys_path); + return; + } + + pthread_cleanup_push(close_fd, (void *)fd); + rc = nvme_id_ctrl_ana(fd, NULL); + if (rc < 0) + condlog(2, "%s: error in nvme_id_ctrl: %s", __func__, + strerror(errno)); + else { + map->ana_supported = (rc == 1 ? YNU_YES : YNU_NO); + condlog(3, "%s: NVMe ctrl %s: ANA %s supported", __func__, dev_t, + rc == 1 ? "is" : "is not"); + } + pthread_cleanup_pop(1); +} + static void _find_controllers(struct context *ctx, struct nvme_map *map) { char pathbuf[PATH_MAX], realbuf[PATH_MAX]; struct dirent **di = NULL; struct scandir_result sr; struct udev_device *subsys; + struct nvme_pathgroup *pg; struct nvme_path *path; int r, i, n; if (map == NULL || map->udev == NULL) return; - vector_foreach_slot(map->pathvec, path, i) + vector_foreach_slot(&map->pgvec, pg, i) { + path = nvme_pg_to_path(pg); path->seen = false; + } subsys = udev_device_get_parent_with_subsystem_devtype(map->udev, "nvme-subsystem", @@ -606,7 +689,8 @@ static void _find_controllers(struct context *ctx, struct nvme_map *map) if (udev == NULL) continue; - path = _find_path_by_syspath(map, udev_device_get_syspath(udev)); + path = _find_path_by_syspath(map, + udev_device_get_syspath(udev)); if (path != NULL) { path->seen = true; condlog(4, "%s: %s already known", @@ -630,24 +714,32 @@ static void _find_controllers(struct context *ctx, struct 
nvme_map *map) cleanup_nvme_path(path); continue; } + test_ana_support(map, path->ctl); - if (vector_alloc_slot(map->pathvec) == NULL) { + path->pg.gen.ops = &nvme_pg_ops; + if (vector_alloc_slot(&path->pg.pathvec) == NULL) { cleanup_nvme_path(path); continue; } + vector_set_slot(&path->pg.pathvec, path); + if (vector_alloc_slot(&map->pgvec) == NULL) { + cleanup_nvme_path(path); + continue; + } + vector_set_slot(&map->pgvec, &path->pg); condlog(3, "%s: %s: new path %s added to %s", __func__, THIS, udev_device_get_sysname(udev), udev_device_get_sysname(map->udev)); - vector_set_slot(map->pathvec, path); } pthread_cleanup_pop(1); map->nr_live = 0; - vector_foreach_slot_backwards(map->pathvec, path, i) { + vector_foreach_slot_backwards(&map->pgvec, pg, i) { + path = nvme_pg_to_path(pg); if (!path->seen) { condlog(1, "path %d not found in %s any more", i, udev_device_get_sysname(map->udev)); - vector_del_slot(map->pathvec, i); + vector_del_slot(&map->pgvec, i); cleanup_nvme_path(path); } else { static const char live_state[] = "live"; @@ -661,7 +753,7 @@ static void _find_controllers(struct context *ctx, struct nvme_map *map) } condlog(3, "%s: %s: map %s has %d/%d live paths", __func__, THIS, udev_device_get_sysname(map->udev), map->nr_live, - VECTOR_SIZE(map->pathvec)); + VECTOR_SIZE(&map->pgvec)); } static int _add_map(struct context *ctx, struct udev_device *ud, @@ -686,19 +778,6 @@ static int _add_map(struct context *ctx, struct udev_device *ud, map->subsys = subsys; map->gen.ops = &nvme_map_ops; - map->pathvec = vector_alloc(); - if (map->pathvec == NULL) { - cleanup_nvme_map(map); - return FOREIGN_ERR; - } - - map->pg.gen.ops = &nvme_pg_ops; - map->pg.pathvec = map->pathvec; - map->gpg = nvme_pg_to_gen(&map->pg); - - map->pgvec.allocated = 1; - map->pgvec.slot = (void**)&map->gpg; - if (vector_alloc_slot(ctx->mpvec) == NULL) { cleanup_nvme_map(map); return FOREIGN_ERR; @@ -842,8 +921,8 @@ const struct _vector * get_paths(const struct context *ctx) condlog(5, "%s 
called for \"%s\"", __func__, THIS); vector_foreach_slot(ctx->mpvec, gm, i) { const struct nvme_map *nm = const_gen_mp_to_nvme(gm); - paths = vector_convert(paths, nm->pathvec, - struct gen_path, identity); + paths = vector_convert(paths, &nm->pgvec, + struct nvme_pathgroup, nvme_pg_to_path); } return paths; } diff --git a/libmultipath/log_pthread.c b/libmultipath/log_pthread.c index bb35dfc..be57bb1 100644 --- a/libmultipath/log_pthread.c +++ b/libmultipath/log_pthread.c @@ -25,6 +25,9 @@ static int log_messages_pending; void log_safe (int prio, const char * fmt, va_list ap) { + if (prio > LOG_DEBUG) + prio = LOG_DEBUG; + if (log_thr == (pthread_t)0) { vsyslog(prio, fmt, ap); return; diff --git a/libmultipath/nvme-lib.c b/libmultipath/nvme-lib.c new file mode 100644 index 0000000..f30e769 --- /dev/null +++ b/libmultipath/nvme-lib.c @@ -0,0 +1,49 @@ +#include +/* avoid inclusion of standard API */ +#define _NVME_LIB_C 1 +#include "nvme-lib.h" +#include "nvme-ioctl.c" +#include "debug.h" + +int log_nvme_errcode(int err, const char *dev, const char *msg) +{ + if (err > 0) + condlog(3, "%s: %s: NVMe status %d", dev, msg, err); + else if (err < 0) + condlog(3, "%s: %s: %s", dev, msg, strerror(errno)); + return err; +} + +int libmp_nvme_get_nsid(int fd) +{ + return nvme_get_nsid(fd); +} + +int libmp_nvme_identify_ctrl(int fd, struct nvme_id_ctrl *ctrl) +{ + return nvme_identify_ctrl(fd, ctrl); +} + +int libmp_nvme_identify_ns(int fd, __u32 nsid, bool present, + struct nvme_id_ns *ns) +{ + return nvme_identify_ns(fd, nsid, present, ns); +} + +int libmp_nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo) +{ + return nvme_ana_log(fd, ana_log, ana_log_len, rgo); +} + +int nvme_id_ctrl_ana(int fd, struct nvme_id_ctrl *ctrl) +{ + int rc; + struct nvme_id_ctrl c; + + rc = nvme_identify_ctrl(fd, &c); + if (rc < 0) + return rc; + if (ctrl) + *ctrl = c; + return c.cmic & (1 << 3) ? 
1 : 0; +} diff --git a/libmultipath/nvme-lib.h b/libmultipath/nvme-lib.h new file mode 100644 index 0000000..448dd99 --- /dev/null +++ b/libmultipath/nvme-lib.h @@ -0,0 +1,39 @@ +#ifndef NVME_LIB_H +#define NVME_LIB_H + +#include "nvme.h" + +int log_nvme_errcode(int err, const char *dev, const char *msg); +int libmp_nvme_get_nsid(int fd); +int libmp_nvme_identify_ctrl(int fd, struct nvme_id_ctrl *ctrl); +int libmp_nvme_identify_ns(int fd, __u32 nsid, bool present, + struct nvme_id_ns *ns); +int libmp_nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo); +/* + * Identify controller, and return true if ANA is supported + * ctrl will be filled in if controller is identified, even w/o ANA + * ctrl may be NULL + */ +int nvme_id_ctrl_ana(int fd, struct nvme_id_ctrl *ctrl); + +#ifndef _NVME_LIB_C +/* + * In all files except nvme-lib.c, the nvme functions can be called + * by their usual name. + */ +#define nvme_get_nsid libmp_nvme_get_nsid +#define nvme_identify_ctrl libmp_nvme_identify_ctrl +#define nvme_identify_ns libmp_nvme_identify_ns +#define nvme_ana_log libmp_nvme_ana_log +/* + * Undefine these to avoid clashes with libmultipath's byteorder.h + */ +#undef cpu_to_le16 +#undef cpu_to_le32 +#undef cpu_to_le64 +#undef le16_to_cpu +#undef le32_to_cpu +#undef le64_to_cpu +#endif + +#endif /* NVME_LIB_H */ diff --git a/libmultipath/nvme/argconfig.h b/libmultipath/nvme/argconfig.h new file mode 100644 index 0000000..adb192b --- /dev/null +++ b/libmultipath/nvme/argconfig.h @@ -0,0 +1,99 @@ +//////////////////////////////////////////////////////////////////////// +// +// Copyright 2014 PMC-Sierra, Inc. +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 2 +// of the License, or (at your option) any later version. 
+// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +// +//////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////// +// +// Author: Logan Gunthorpe +// Logan Gunthorpe +// +// Date: Oct 23 2014 +// +// Description: +// Header file for argconfig.c +// +//////////////////////////////////////////////////////////////////////// + +#ifndef argconfig_H +#define argconfig_H + +#include +#include +#include + +enum argconfig_types { + CFG_NONE, + CFG_STRING, + CFG_INT, + CFG_SIZE, + CFG_LONG, + CFG_LONG_SUFFIX, + CFG_DOUBLE, + CFG_BOOL, + CFG_BYTE, + CFG_SHORT, + CFG_POSITIVE, + CFG_INCREMENT, + CFG_SUBOPTS, + CFG_FILE_A, + CFG_FILE_W, + CFG_FILE_R, + CFG_FILE_AP, + CFG_FILE_WP, + CFG_FILE_RP, +}; + +struct argconfig_commandline_options { + const char *option; + const char short_option; + const char *meta; + enum argconfig_types config_type; + void *default_value; + int argument_type; + const char *help; +}; + +#define CFG_MAX_SUBOPTS 500 +#define MAX_HELP_FUNC 20 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void argconfig_help_func(); +void argconfig_append_usage(const char *str); +void argconfig_print_help(const char *program_desc, + const struct argconfig_commandline_options *options); +int argconfig_parse(int argc, char *argv[], const char *program_desc, + const struct argconfig_commandline_options *options, + void *config_out, size_t config_size); +int argconfig_parse_subopt_string(char *string, char **options, + size_t max_options); +unsigned 
argconfig_parse_comma_sep_array(char *string, int *ret, + unsigned max_length); +unsigned argconfig_parse_comma_sep_array_long(char *string, + unsigned long long *ret, + unsigned max_length); +void argconfig_register_help_func(argconfig_help_func * f); + +void print_word_wrapped(const char *s, int indent, int start); +#ifdef __cplusplus +} +#endif +#endif diff --git a/libmultipath/nvme/json.h b/libmultipath/nvme/json.h new file mode 100644 index 0000000..c4ea531 --- /dev/null +++ b/libmultipath/nvme/json.h @@ -0,0 +1,87 @@ +#ifndef __JSON__H +#define __JSON__H + +struct json_object; +struct json_array; +struct json_pair; + +#define JSON_TYPE_STRING 0 +#define JSON_TYPE_INTEGER 1 +#define JSON_TYPE_FLOAT 2 +#define JSON_TYPE_OBJECT 3 +#define JSON_TYPE_ARRAY 4 +#define JSON_TYPE_UINT 5 +#define JSON_PARENT_TYPE_PAIR 0 +#define JSON_PARENT_TYPE_ARRAY 1 +struct json_value { + int type; + union { + long long integer_number; + unsigned long long uint_number; + long double float_number; + char *string; + struct json_object *object; + struct json_array *array; + }; + int parent_type; + union { + struct json_pair *parent_pair; + struct json_array *parent_array; + }; +}; + +struct json_array { + struct json_value **values; + int value_cnt; + struct json_value *parent; +}; + +struct json_object { + struct json_pair **pairs; + int pair_cnt; + struct json_value *parent; +}; + +struct json_pair { + char *name; + struct json_value *value; + struct json_object *parent; +}; + +struct json_object *json_create_object(void); +struct json_array *json_create_array(void); + +void json_free_object(struct json_object *obj); + +int json_object_add_value_type(struct json_object *obj, const char *name, int type, ...); +#define json_object_add_value_int(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (long long) (val)) +#define json_object_add_value_uint(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_UINT, (unsigned long long) (val)) 
+#define json_object_add_value_float(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_FLOAT, (val)) +#define json_object_add_value_string(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_STRING, (val)) +#define json_object_add_value_object(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_OBJECT, (val)) +#define json_object_add_value_array(obj, name, val) \ + json_object_add_value_type((obj), name, JSON_TYPE_ARRAY, (val)) +int json_array_add_value_type(struct json_array *array, int type, ...); +#define json_array_add_value_int(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_INTEGER, (val)) +#define json_array_add_value_uint(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_UINT, (val)) +#define json_array_add_value_float(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_FLOAT, (val)) +#define json_array_add_value_string(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_STRING, (val)) +#define json_array_add_value_object(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_OBJECT, (val)) +#define json_array_add_value_array(obj, val) \ + json_array_add_value_type((obj), JSON_TYPE_ARRAY, (val)) + +#define json_array_last_value_object(obj) \ + (obj->values[obj->value_cnt - 1]->object) + +void json_print_object(struct json_object *obj, void *); +#endif diff --git a/libmultipath/nvme/linux/nvme.h b/libmultipath/nvme/linux/nvme.h new file mode 100644 index 0000000..68000eb --- /dev/null +++ b/libmultipath/nvme/linux/nvme.h @@ -0,0 +1,1450 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_NVME_H +#define _LINUX_NVME_H + +#include +#include + +/* NQN names in commands fields specified one size */ +#define NVMF_NQN_FIELD_LEN 256 + +/* However the max length of a qualified name is another size */ +#define NVMF_NQN_SIZE 223 + +#define NVMF_TRSVCID_SIZE 32 +#define NVMF_TRADDR_SIZE 256 +#define NVMF_TSAS_SIZE 256 + +#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" + +#define NVME_RDMA_IP_PORT 4420 + +#define NVME_NSID_ALL 0xffffffff + +enum nvme_subsys_type { + NVME_NQN_DISC = 1, /* Discovery type target subsystem */ + NVME_NQN_NVME = 2, /* NVME type target subsystem */ +}; + +/* Address Family codes for Discovery Log Page entry ADRFAM field */ +enum { + NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ + NVMF_ADDR_FAMILY_IP4 = 1, /* IP4 */ + NVMF_ADDR_FAMILY_IP6 = 2, /* IP6 */ + NVMF_ADDR_FAMILY_IB = 3, /* InfiniBand */ + NVMF_ADDR_FAMILY_FC = 4, /* Fibre Channel */ +}; + +/* Transport Type codes for Discovery Log Page entry TRTYPE field */ +enum { + NVMF_TRTYPE_RDMA = 1, /* RDMA */ + NVMF_TRTYPE_FC = 2, /* Fibre Channel */ + NVMF_TRTYPE_TCP = 3, /* TCP */ + NVMF_TRTYPE_LOOP = 254, /* Reserved for host usage */ + NVMF_TRTYPE_MAX, +}; + +/* Transport Requirements codes for Discovery Log Page entry TREQ field */ +enum { + NVMF_TREQ_NOT_SPECIFIED = 0, /* Not specified */ + NVMF_TREQ_REQUIRED = 1, /* Required */ + NVMF_TREQ_NOT_REQUIRED = 2, /* Not Required */ + NVMF_TREQ_DISABLE_SQFLOW = (1 << 2), /* SQ flow control disable supported */ +}; + +/* RDMA QP Service Type codes for Discovery Log Page entry TSAS + * RDMA_QPTYPE field + */ +enum { + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ +}; + +/* RDMA Provider 
Type codes for Discovery Log Page entry TSAS + * RDMA_PRTYPE field + */ +enum { + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ +}; + +/* RDMA Connection Management Service Type codes for Discovery Log Page + * entry TSAS RDMA_CMS field + */ +enum { + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ +}; + +/* TCP port security type for Discovery Log Page entry TSAS + */ +enum { + NVMF_TCP_SECTYPE_NONE = 0, /* No Security */ + NVMF_TCP_SECTYPE_TLS = 1, /* Transport Layer Security */ +}; + +#define NVME_AQ_DEPTH 32 +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLK_MQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) + +/* + * Subtract one to leave an empty queue entry for 'Full Queue' condition. See + * NVM-Express 1.2 specification, section 4.1.2. + */ +#define NVME_AQ_MQ_TAG_DEPTH (NVME_AQ_BLK_MQ_DEPTH - 1) + +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ + NVME_REG_BPINFO = 0x0040, /* Boot Partition Information */ + NVME_REG_BPRSEL = 0x0044, /* Boot Partition Read Select */ + NVME_REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer Location */ + NVME_REG_DBS = 0x1000, /* SQ 0 Tail Doorbell */ +}; + +#define NVME_CAP_MQES(cap) ((cap) & 0xffff) 
+#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) +#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) +#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) + +#define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) +#define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) +#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) +#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) + +#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) +#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) +#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) +#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) +#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +/* + * Submission and Completion Queue Entry Sizes for the NVM command set. + * (In bytes and specified as a power of two (2^n)). + */ +#define NVME_NVM_IOSQES 6 +#define NVME_NVM_IOCQES 4 + +enum { + NVME_CC_ENABLE = 1 << 0, + NVME_CC_CSS_NVM = 0 << 4, + NVME_CC_EN_SHIFT = 0, + NVME_CC_CSS_SHIFT = 4, + NVME_CC_MPS_SHIFT = 7, + NVME_CC_AMS_SHIFT = 11, + NVME_CC_SHN_SHIFT = 14, + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, + NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CSTS_RDY = 1 << 0, + NVME_CSTS_CFS = 1 << 1, + NVME_CSTS_NSSRO = 1 << 4, + NVME_CSTS_PP = 1 << 5, + NVME_CSTS_SHST_NORMAL = 0 << 2, + NVME_CSTS_SHST_OCCUR = 1 << 2, + NVME_CSTS_SHST_CMPLT = 2 << 2, + NVME_CSTS_SHST_MASK = 3 << 2, +}; + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u8 rsvd2; + __u8 flags; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ 
+ __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __le16 idle_power; + __u8 idle_scale; + __u8 rsvd19; + __le16 active_power; + __u8 active_work_scale; + __u8 rsvd23[9]; +}; + +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 cmic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __le16 rrls; + __u8 rsvd102[154]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __le16 edstt; + __u8 dsto; + __u8 fwug; + __le16 kas; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __le32 hmminds; + __le16 hmmaxd; + __le16 nsetidmax; + __u8 rsvd340[2]; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __u8 rsvd352[160]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 nvscc; + __u8 nwpc; + __le16 acwu; + __u8 rsvd534[2]; + __le32 sgls; + __le32 mnan; + __u8 rsvd544[224]; + char subnqn[256]; + __u8 rsvd1024[768]; + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 ctrattr; + __u8 msdbd; + __u8 rsvd1804[244]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, + NVME_CTRL_VWC_PRESENT = 1 << 0, + NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DIRECTIVES = 1 << 5, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, + NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, + NVME_CTRL_CTRATT_128_ID = 1 << 
0, + NVME_CTRL_CTRATT_NON_OP_PSP = 1 << 1, + NVME_CTRL_CTRATT_NVM_SETS = 1 << 2, + NVME_CTRL_CTRATT_READ_RECV_LVLS = 1 << 3, + NVME_CTRL_CTRATT_ENDURANCE_GROUPS = 1 << 4, + NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 nmic; + __u8 rescap; + __u8 fpi; + __u8 dlfeat; + __le16 nawun; + __le16 nawupf; + __le16 nacwu; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __le16 noiob; + __u8 nvmcap[16]; + __u8 rsvd64[28]; + __le32 anagrpid; + __u8 rsvd96[3]; + __u8 nsattr; + __le16 nvmsetid; + __le16 endgid; + __u8 nguid[16]; + __u8 eui64[8]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_NVMSET_LIST = 0x04, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CTRL_NS_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, +}; + +enum { + NVME_DIR_IDENTIFY = 0x00, + NVME_DIR_STREAMS = 0x01, + NVME_DIR_SND_ID_OP_ENABLE = 0x01, + NVME_DIR_SND_ST_OP_REL_ID = 0x01, + NVME_DIR_SND_ST_OP_REL_RSC = 0x02, + NVME_DIR_RCV_ID_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_PARAM = 0x01, + NVME_DIR_RCV_ST_OP_STATUS = 0x02, + NVME_DIR_RCV_ST_OP_RESOURCE = 0x03, + NVME_DIR_ENDIR = 0x01, +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 
= 3, +}; + +struct nvme_ns_id_desc { + __u8 nidt; + __u8 nidl; + __le16 reserved; +}; + +#define NVME_NIDT_EUI64_LEN 8 +#define NVME_NIDT_NGUID_LEN 16 +#define NVME_NIDT_UUID_LEN 16 + +enum { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, +}; + +#define NVME_MAX_NVMSET 31 + +struct nvme_nvmset_attr_entry { + __le16 id; + __le16 endurance_group_id; + __u8 rsvd4[4]; + __le32 random_4k_read_typical; + __le32 opt_write_size; + __u8 total_nvmset_cap[16]; + __u8 unalloc_nvmset_cap[16]; + __u8 rsvd48[80]; +}; + +struct nvme_id_nvmset { + __u8 nid; + __u8 rsvd1[127]; + struct nvme_nvmset_attr_entry ent[NVME_MAX_NVMSET]; +}; + +/* Derived from 1.3a Figure 101: Get Log Page – Telemetry Host + * -Initiated Log (Log Identifier 07h) + */ +struct nvme_telemetry_log_page_hdr { + __u8 lpi; /* Log page identifier */ + __u8 rsvd[4]; + __u8 iee_oui[3]; + __u16 dalb1; /* Data area 1 last block */ + __u16 dalb2; /* Data area 2 last block */ + __u16 dalb3; /* Data area 3 last block */ + __u8 rsvd1[368]; /* TODO verify */ + __u8 ctrlavail; /* Controller initiated data avail?*/ + __u8 ctrldgn; /* Controller initiated telemetry Data Gen # */ + __u8 rsnident[128]; + /* We'll have to double fetch so we can get the header, + * parse dalb1->3 determine how much size we need for the + * log then alloc below. Or just do a secondary non-struct + * allocation. 
+ */ + __u8 telemetry_dataarea[0]; +}; + +struct nvme_endurance_group_log { + __u32 rsvd0; + __u8 avl_spare_threshold; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 endurance_estimate[16]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 media_units_written[16]; + __u8 rsvd96[416]; +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __le32 warning_temp_time; + __le32 critical_comp_time; + __le16 temp_sensor[8]; + __le32 thm_temp1_trans_count; + __le32 thm_temp2_trans_count; + __le32 thm_temp1_total_time; + __le32 thm_temp2_total_time; + __u8 rsvd232[280]; +}; + +struct nvme_self_test_res { + __u8 device_self_test_status; + __u8 segment_num; + __u8 valid_diagnostic_info; + __u8 rsvd; + __le64 power_on_hours; + __le32 nsid; + __le64 failing_lba; + __u8 status_code_type; + __u8 status_code; + __u8 vendor_specific[2]; +} __attribute__((packed)); + +struct nvme_self_test_log { + __u8 crnt_dev_selftest_oprn; + __u8 crnt_dev_selftest_compln; + __u8 rsvd[2]; + struct nvme_self_test_res result[20]; +} __attribute__((packed)); + +struct nvme_fw_slot_info_log { + __u8 afi; + __u8 rsvd1[7]; + __le64 frs[7]; + __u8 rsvd64[448]; +}; + +/* NVMe Namespace Write Protect State */ +enum { + NVME_NS_NO_WRITE_PROTECT = 0, + NVME_NS_WRITE_PROTECT, + NVME_NS_WRITE_PROTECT_POWER_CYCLE, + NVME_NS_WRITE_PROTECT_PERMANENT, +}; + +#define NVME_MAX_CHANGED_NAMESPACES 1024 + +struct nvme_changed_ns_list_log { + __le32 log[NVME_MAX_CHANGED_NAMESPACES]; +}; + +enum { + NVME_CMD_EFFECTS_CSUPP = 1 << 0, + NVME_CMD_EFFECTS_LBCC = 1 << 1, + NVME_CMD_EFFECTS_NCC = 1 << 2, + NVME_CMD_EFFECTS_NIC = 1 << 3, + 
NVME_CMD_EFFECTS_CCC = 1 << 4, + NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, +}; + +struct nvme_effects_log { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + +enum nvme_ana_state { + NVME_ANA_OPTIMIZED = 0x01, + NVME_ANA_NONOPTIMIZED = 0x02, + NVME_ANA_INACCESSIBLE = 0x03, + NVME_ANA_PERSISTENT_LOSS = 0x04, + NVME_ANA_CHANGE = 0x0f, +}; + +struct nvme_ana_group_desc { + __le32 grpid; + __le32 nnsids; + __le64 chgcnt; + __u8 state; + __u8 rsvd17[15]; + __le32 nsids[]; +}; + +/* flag for the log specific field of the ANA log */ +#define NVME_ANA_LOG_RGO (1 << 0) + +struct nvme_ana_rsp_hdr { + __le64 chgcnt; + __le16 ngrps; + __le16 rsvd10[3]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +enum { + NVME_AER_ERROR = 0, + NVME_AER_SMART = 1, + NVME_AER_CSS = 6, + NVME_AER_VS = 7, + NVME_AER_NOTICE_NS_CHANGED = 0x0002, + NVME_AER_NOTICE_ANA = 0x0003, + NVME_AER_NOTICE_FW_ACT_STARTING = 0x0102, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +struct nvme_plm_config { + __u16 enable_event; + __u8 rsvd2[30]; + __u64 dtwin_reads_thresh; + __u64 dtwin_writes_thresh; + __u64 dtwin_time_thresh; + __u8 rsvd56[456]; +}; + +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + +struct nvme_reservation_status_ext { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[14]; + 
__u8 resv24[40]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 rkey; + __u8 hostid[16]; + __u8 resv32[32]; + } regctl_eds[]; +}; + +enum nvme_async_event_type { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, + nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, +}; + +/* + * Descriptor subtype - lower 4 bits of nvme_(keyed_)sgl_desc identifier + * + * @NVME_SGL_FMT_ADDRESS: absolute address of the data block + * @NVME_SGL_FMT_OFFSET: relative offset of the in-capsule data block + * @NVME_SGL_FMT_TRANSPORT_A: transport defined format, value 0xA + * @NVME_SGL_FMT_INVALIDATE: RDMA transport specific remote invalidation + * request subtype + */ +enum { + NVME_SGL_FMT_ADDRESS = 0x00, + NVME_SGL_FMT_OFFSET = 0x01, + NVME_SGL_FMT_TRANSPORT_A = 0x0A, + NVME_SGL_FMT_INVALIDATE = 0x0f, +}; + +/* + * Descriptor type - upper 4 bits of nvme_(keyed_)sgl_desc identifier + * + * For struct nvme_sgl_desc: + * @NVME_SGL_FMT_DATA_DESC: data block descriptor + * @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor + * @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor + * + * For struct nvme_keyed_sgl_desc: + * @NVME_KEY_SGL_FMT_DATA_DESC: keyed data block descriptor + * + * Transport-specific SGL types: + * @NVME_TRANSPORT_SGL_DATA_DESC: Transport SGL data block descriptor + */ +enum { + NVME_SGL_FMT_DATA_DESC = 0x00, + NVME_SGL_FMT_SEG_DESC = 0x02, + NVME_SGL_FMT_LAST_SEG_DESC = 0x03, + NVME_KEY_SGL_FMT_DATA_DESC = 0x04, + NVME_TRANSPORT_SGL_DATA_DESC = 0x05, +}; + +struct nvme_sgl_desc { + __le64 addr; + __le32 length; + __u8 rsvd[3]; + __u8 type; +}; + +struct nvme_keyed_sgl_desc { + __le64 addr; + __u8 length[3]; + __u8 
key[4]; + __u8 type; +}; + +union nvme_data_ptr { + struct { + __le64 prp1; + __le64 prp2; + }; + struct nvme_sgl_desc sgl; + struct nvme_keyed_sgl_desc ksgl; +}; + +/* + * Lowest two bits of our flags field (FUSE field in the spec): + * + * @NVME_CMD_FUSE_FIRST: Fused Operation, first command + * @NVME_CMD_FUSE_SECOND: Fused Operation, second command + * + * Highest two bits in our flags field (PSDT field in the spec): + * + * @NVME_CMD_SGL_METABUF: Use SGLs for this transfer, + * If used, MPTR contains addr of single physical buffer (byte aligned). + * @NVME_CMD_SGL_METASEG: Use SGLs for this transfer, + * If used, MPTR contains an address of an SGL segment containing + * exactly 1 SGL descriptor (qword aligned). + */ +enum { + NVME_CMD_FUSE_FIRST = (1 << 0), + NVME_CMD_FUSE_SECOND = (1 << 1), + + NVME_CMD_SGL_METABUF = (1 << 6), + NVME_CMD_SGL_METASEG = (1 << 7), + NVME_CMD_SGL_ALL = NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DEAC = 1 << 9, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + 
NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_DTYPE_STREAMS = 1 << 4, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +#define NVME_DSM_MAX_RANGES 256 + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +struct nvme_write_zeroes_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +/* Features */ + +struct nvme_feat_auto_pst { + __le64 entries[32]; +}; + +enum { + NVME_HOST_MEM_ENABLE = (1 << 0), + NVME_HOST_MEM_RETURN = (1 << 1), +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_ns_mgmt = 0x0d, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_dev_self_test = 0x14, + nvme_admin_ns_attach = 0x15, + nvme_admin_keep_alive = 0x18, + nvme_admin_directive_send = 0x19, + nvme_admin_directive_recv = 0x1a, + nvme_admin_virtual_mgmt = 0x1c, + nvme_admin_nvme_mi_send = 0x1d, + nvme_admin_nvme_mi_recv = 0x1e, + nvme_admin_dbbuf = 0x7C, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, + nvme_admin_sanitize_nvm = 0x84, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + 
NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_HOST_MEM_BUF = 0x0d, + NVME_FEAT_TIMESTAMP = 0x0e, + NVME_FEAT_KATO = 0x0f, + NVME_FEAT_HCTM = 0X10, + NVME_FEAT_NOPSC = 0X11, + NVME_FEAT_RRL = 0x12, + NVME_FEAT_PLM_CONFIG = 0x13, + NVME_FEAT_PLM_WINDOW = 0x14, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, + NVME_FEAT_WRITE_PROTECT = 0x84, + NVME_LOG_ERROR = 0x01, + NVME_LOG_SMART = 0x02, + NVME_LOG_FW_SLOT = 0x03, + NVME_LOG_CHANGED_NS = 0x04, + NVME_LOG_CMD_EFFECTS = 0x05, + NVME_LOG_DEVICE_SELF_TEST = 0x06, + NVME_LOG_TELEMETRY_HOST = 0x07, + NVME_LOG_TELEMETRY_CTRL = 0x08, + NVME_LOG_ENDURANCE_GROUP = 0x09, + NVME_LOG_ANA = 0x0c, + NVME_LOG_DISC = 0x70, + NVME_LOG_RESERVATION = 0x80, + NVME_LOG_SANITIZE = 0x81, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +enum { + NVME_NO_LOG_LSP = 0x0, + NVME_NO_LOG_LPO = 0x0, + NVME_LOG_ANA_LSP_RGO = 0x1, + NVME_TELEM_LSP_CREATE = 0x1, +}; + +/* Sanitize and Sanitize Monitor/Log */ +enum { + /* Sanitize */ + NVME_SANITIZE_NO_DEALLOC = 0x00000200, + NVME_SANITIZE_OIPBP = 0x00000100, + NVME_SANITIZE_OWPASS_SHIFT = 0x00000004, + NVME_SANITIZE_AUSE = 0x00000008, + NVME_SANITIZE_ACT_CRYPTO_ERASE = 0x00000004, + NVME_SANITIZE_ACT_OVERWRITE = 0x00000003, + NVME_SANITIZE_ACT_BLOCK_ERASE = 0x00000002, + NVME_SANITIZE_ACT_EXIT = 0x00000001, + + /* Sanitize Monitor/Log */ + NVME_SANITIZE_LOG_DATA_LEN = 0x0014, 
+ NVME_SANITIZE_LOG_GLOBAL_DATA_ERASED = 0x0100, + NVME_SANITIZE_LOG_NUM_CMPLTED_PASS_MASK = 0x00F8, + NVME_SANITIZE_LOG_STATUS_MASK = 0x0007, + NVME_SANITIZE_LOG_NEVER_SANITIZED = 0x0000, + NVME_SANITIZE_LOG_COMPLETED_SUCCESS = 0x0001, + NVME_SANITIZE_LOG_IN_PROGESS = 0x0002, + NVME_SANITIZE_LOG_COMPLETED_FAILED = 0x0003, +}; + +enum { + /* Self-test log Validation bits */ + NVME_SELF_TEST_VALID_NSID = 1 << 0, + NVME_SELF_TEST_VALID_FLBA = 1 << 1, + NVME_SELF_TEST_VALID_SCT = 1 << 2, + NVME_SELF_TEST_VALID_SC = 1 << 3, + NVME_SELF_TEST_REPORTS = 20, +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 cns; + __u8 rsvd3; + __le16 ctrlid; + __u32 rsvd11[5]; +}; + +#define NVME_IDENTIFY_DATA_SIZE 4096 + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 fid; + __le32 dword11; + __le32 dword12; + __le32 dword13; + __le32 dword14; + __le32 dword15; +}; + +struct nvme_host_mem_buf_desc { + __le64 addr; + __le32 size; + __u32 rsvd; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + union nvme_data_ptr dptr; + __le32 numd; 
+ __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_get_log_page_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __u8 lid; + __u8 lsp; + __le16 numdl; + __le16 numdu; + __u16 rsvd11; + __le32 lpol; + __le32 lpou; + __u32 rsvd14[2]; +}; + +struct nvme_directive_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + union nvme_data_ptr dptr; + __le32 numd; + __u8 doper; + __u8 dtype; + __le16 dspec; + __u8 endir; + __u8 tdtype; + __u16 rsvd15; + + __u32 rsvd16[3]; +}; + +/* Sanitize Log Page */ +struct nvme_sanitize_log_page { + __le16 progress; + __le16 status; + __le32 cdw10_info; + __le32 est_ovrwrt_time; + __le32 est_blk_erase_time; + __le32 est_crypto_erase_time; +}; + +/* + * Fabrics subcommands. + */ +enum nvmf_fabrics_opcode { + nvme_fabrics_command = 0x7f, +}; + +enum nvmf_capsule_command { + nvme_fabrics_type_property_set = 0x00, + nvme_fabrics_type_connect = 0x01, + nvme_fabrics_type_property_get = 0x04, +}; + +struct nvmf_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 ts[24]; +}; + +/* + * The legal cntlid range a NVMe Target will provide. + * Note that cntlid of value 0 is considered illegal in the fabrics world. + * Devices based on earlier specs did not have the subsystem concept; + * therefore, those devices had their cntlid value set to 0 as a result. 
+ */ +#define NVME_CNTLID_MIN 1 +#define NVME_CNTLID_MAX 0xffef +#define NVME_CNTLID_DYNAMIC 0xffff + +#define MAX_DISC_LOGS 255 + +/* Discovery log page entry */ +struct nvmf_disc_rsp_page_entry { + __u8 trtype; + __u8 adrfam; + __u8 subtype; + __u8 treq; + __le16 portid; + __le16 cntlid; + __le16 asqsz; + __u8 resv8[22]; + char trsvcid[NVMF_TRSVCID_SIZE]; + __u8 resv64[192]; + char subnqn[NVMF_NQN_FIELD_LEN]; + char traddr[NVMF_TRADDR_SIZE]; + union tsas { + char common[NVMF_TSAS_SIZE]; + struct rdma { + __u8 qptype; + __u8 prtype; + __u8 cms; + __u8 resv3[5]; + __u16 pkey; + __u8 resv10[246]; + } rdma; + struct tcp { + __u8 sectype; + } tcp; + } tsas; +}; + +/* Discovery log page header */ +struct nvmf_disc_rsp_page_hdr { + __le64 genctr; + __le64 numrec; + __le16 recfmt; + __u8 resv14[1006]; + struct nvmf_disc_rsp_page_entry entries[0]; +}; + +struct nvmf_connect_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __le16 recfmt; + __le16 qid; + __le16 sqsize; + __u8 cattr; + __u8 resv3; + __le32 kato; + __u8 resv4[12]; +}; + +struct nvmf_connect_data { + uuid_t hostid; + __le16 cntlid; + char resv4[238]; + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; + char resv5[256]; +}; + +struct nvmf_property_set_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __le64 value; + __u8 resv4[8]; +}; + +struct nvmf_property_get_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[35]; + __u8 attrib; + __u8 resv3[3]; + __le32 offset; + __u8 resv4[16]; +}; + +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __u32 rsvd12[6]; +}; + +struct streams_directive_params { + __le16 msl; + __le16 nssa; + __le16 nsso; + __u8 rsvd[10]; + __le32 sws; + __le16 sgs; + __le16 nsa; + __le16 nso; + __u8 
rsvd2[6]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_abort_cmd abort; + struct nvme_get_log_page_command get_log_page; + struct nvmf_common_command fabrics; + struct nvmf_connect_command connect; + struct nvmf_property_set_command prop_set; + struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; + struct nvme_directive_cmd directive; + }; +}; + +static inline bool nvme_is_write(struct nvme_command *cmd) +{ + /* + * What a mess... + * + * Why can't we simply have a Fabrics In and Fabrics out command? + */ + if (unlikely(cmd->common.opcode == nvme_fabrics_command)) + return cmd->fabrics.fctype & 1; + return cmd->common.opcode & 1; +} + +enum { + /* + * Generic Command Status: + */ + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, + + NVME_SC_SGL_INVALID_OFFSET = 0x16, + NVME_SC_SGL_INVALID_SUBTYPE = 0x17, + + NVME_SC_SANITIZE_FAILED = 0x1C, + NVME_SC_SANITIZE_IN_PROGRESS = 0x1D, + + NVME_SC_NS_WRITE_PROTECTED = 0x20, + + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, + + /* + * Command 
Specific Status: + */ + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FW_NEEDS_CONV_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_SUBSYS_RESET = 0x110, + NVME_SC_FW_NEEDS_RESET = 0x111, + NVME_SC_FW_NEEDS_MAX_TIME = 0x112, + NVME_SC_FW_ACIVATE_PROHIBITED = 0x113, + NVME_SC_OVERLAPPING_RANGE = 0x114, + NVME_SC_NS_INSUFFICENT_CAP = 0x115, + NVME_SC_NS_ID_UNAVAILABLE = 0x116, + NVME_SC_NS_ALREADY_ATTACHED = 0x118, + NVME_SC_NS_IS_PRIVATE = 0x119, + NVME_SC_NS_NOT_ATTACHED = 0x11a, + NVME_SC_THIN_PROV_NOT_SUPP = 0x11b, + NVME_SC_CTRL_LIST_INVALID = 0x11c, + NVME_SC_BP_WRITE_PROHIBITED = 0x11e, + + /* + * I/O Command Set Specific - NVM commands: + */ + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, + NVME_SC_ONCS_NOT_SUPPORTED = 0x183, + + /* + * I/O Command Set Specific - Fabrics commands: + */ + NVME_SC_CONNECT_FORMAT = 0x180, + NVME_SC_CONNECT_CTRL_BUSY = 0x181, + NVME_SC_CONNECT_INVALID_PARAM = 0x182, + NVME_SC_CONNECT_RESTART_DISC = 0x183, + NVME_SC_CONNECT_INVALID_HOST = 0x184, + + NVME_SC_DISCOVERY_RESTART = 0x190, + NVME_SC_AUTH_REQUIRED = 0x191, + + /* + * Media and Data Integrity Errors: + */ + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, + NVME_SC_UNWRITTEN_BLOCK = 0x287, + + /* + * Path-related Errors: + */ + NVME_SC_ANA_PERSISTENT_LOSS = 0x301, + NVME_SC_ANA_INACCESSIBLE = 0x302, + 
NVME_SC_ANA_TRANSITION = 0x303, + + NVME_SC_DNR = 0x4000, +}; + +struct nvme_completion { + /* + * Used by Admin and Fabrics commands to return data: + */ + union nvme_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +#define NVME_VS(major, minor, tertiary) \ + (((major) << 16) | ((minor) << 8) | (tertiary)) + +#define NVME_MAJOR(ver) ((ver) >> 16) +#define NVME_MINOR(ver) (((ver) >> 8) & 0xff) +#define NVME_TERTIARY(ver) ((ver) & 0xff) + +#endif /* _LINUX_NVME_H */ diff --git a/libmultipath/nvme/linux/nvme_ioctl.h b/libmultipath/nvme/linux/nvme_ioctl.h new file mode 100644 index 0000000..d25a532 --- /dev/null +++ b/libmultipath/nvme/linux/nvme_ioctl.h @@ -0,0 +1,67 @@ +/* + * Definitions for the NVM Express ioctl interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _UAPI_LINUX_NVME_IOCTL_H +#define _UAPI_LINUX_NVME_IOCTL_H + +#include +#include + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_passthru_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define nvme_admin_cmd nvme_passthru_cmd + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) +#define NVME_IOCTL_RESET _IO('N', 0x44) +#define NVME_IOCTL_SUBSYS_RESET _IO('N', 0x45) +#define NVME_IOCTL_RESCAN _IO('N', 0x46) + +#endif /* _UAPI_LINUX_NVME_IOCTL_H */ diff --git a/libmultipath/nvme/nvme-ioctl.c b/libmultipath/nvme/nvme-ioctl.c new file mode 100644 index 0000000..70a16ce --- /dev/null +++ b/libmultipath/nvme/nvme-ioctl.c @@ -0,0 +1,869 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme-ioctl.h" + +static int nvme_verify_chr(int fd) +{ + static struct stat nvme_stat; + int err = fstat(fd, &nvme_stat); + + if (err < 0) { + perror("fstat"); + return errno; + } + if (!S_ISCHR(nvme_stat.st_mode)) { + fprintf(stderr, + "Error: requesting reset on non-controller handle\n"); + return ENOTBLK; + } + return 0; +} + +int nvme_subsystem_reset(int fd) +{ + int ret; + + ret = nvme_verify_chr(fd); + if (ret) + return ret; + return ioctl(fd, NVME_IOCTL_SUBSYS_RESET); +} + +int nvme_reset_controller(int fd) +{ + int ret; + + ret = 
nvme_verify_chr(fd); + if (ret) + return ret; + return ioctl(fd, NVME_IOCTL_RESET); +} + +int nvme_ns_rescan(int fd) +{ + int ret; + + ret = nvme_verify_chr(fd); + if (ret) + return ret; + return ioctl(fd, NVME_IOCTL_RESCAN); +} + +int nvme_get_nsid(int fd) +{ + static struct stat nvme_stat; + int err = fstat(fd, &nvme_stat); + + if (err < 0) + return -errno; + + if (!S_ISBLK(nvme_stat.st_mode)) { + fprintf(stderr, + "Error: requesting namespace-id from non-block device\n"); + errno = ENOTBLK; + return -errno; + } + return ioctl(fd, NVME_IOCTL_ID); +} + +int nvme_submit_passthru(int fd, unsigned long ioctl_cmd, + struct nvme_passthru_cmd *cmd) +{ + return ioctl(fd, ioctl_cmd, cmd); +} + +static int nvme_submit_admin_passthru(int fd, struct nvme_passthru_cmd *cmd) +{ + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, cmd); +} + +static int nvme_submit_io_passthru(int fd, struct nvme_passthru_cmd *cmd) +{ + return ioctl(fd, NVME_IOCTL_IO_CMD, cmd); +} + +int nvme_passthru(int fd, unsigned long ioctl_cmd, __u8 opcode, + __u8 flags, __u16 rsvd, + __u32 nsid, __u32 cdw2, __u32 cdw3, __u32 cdw10, __u32 cdw11, + __u32 cdw12, __u32 cdw13, __u32 cdw14, __u32 cdw15, + __u32 data_len, void *data, __u32 metadata_len, + void *metadata, __u32 timeout_ms, __u32 *result) +{ + struct nvme_passthru_cmd cmd = { + .opcode = opcode, + .flags = flags, + .rsvd1 = rsvd, + .nsid = nsid, + .cdw2 = cdw2, + .cdw3 = cdw3, + .metadata = (__u64)(uintptr_t) metadata, + .addr = (__u64)(uintptr_t) data, + .metadata_len = metadata_len, + .data_len = data_len, + .cdw10 = cdw10, + .cdw11 = cdw11, + .cdw12 = cdw12, + .cdw13 = cdw13, + .cdw14 = cdw14, + .cdw15 = cdw15, + .timeout_ms = timeout_ms, + .result = 0, + }; + int err; + + err = nvme_submit_passthru(fd, ioctl_cmd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_io(int fd, __u8 opcode, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + 
struct nvme_user_io io = { + .opcode = opcode, + .flags = 0, + .control = control, + .nblocks = nblocks, + .rsvd = 0, + .metadata = (__u64)(uintptr_t) metadata, + .addr = (__u64)(uintptr_t) data, + .slba = slba, + .dsmgmt = dsmgmt, + .reftag = reftag, + .appmask = appmask, + .apptag = apptag, + }; + return ioctl(fd, NVME_IOCTL_SUBMIT_IO, &io); +} + +int nvme_read(int fd, __u64 slba, __u16 nblocks, __u16 control, __u32 dsmgmt, + __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + return nvme_io(fd, nvme_cmd_read, slba, nblocks, control, dsmgmt, + reftag, apptag, appmask, data, metadata); +} + +int nvme_write(int fd, __u64 slba, __u16 nblocks, __u16 control, __u32 dsmgmt, + __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + return nvme_io(fd, nvme_cmd_write, slba, nblocks, control, dsmgmt, + reftag, apptag, appmask, data, metadata); +} + +int nvme_compare(int fd, __u64 slba, __u16 nblocks, __u16 control, __u32 dsmgmt, + __u32 reftag, __u16 apptag, __u16 appmask, void *data, + void *metadata) +{ + return nvme_io(fd, nvme_cmd_compare, slba, nblocks, control, dsmgmt, + reftag, apptag, appmask, data, metadata); +} + +int nvme_passthru_io(int fd, __u8 opcode, __u8 flags, __u16 rsvd, + __u32 nsid, __u32 cdw2, __u32 cdw3, __u32 cdw10, + __u32 cdw11, __u32 cdw12, __u32 cdw13, __u32 cdw14, + __u32 cdw15, __u32 data_len, void *data, + __u32 metadata_len, void *metadata, __u32 timeout_ms) +{ + return nvme_passthru(fd, NVME_IOCTL_IO_CMD, opcode, flags, rsvd, nsid, + cdw2, cdw3, cdw10, cdw11, cdw12, cdw13, cdw14, + cdw15, data_len, data, metadata_len, metadata, + timeout_ms, NULL); +} + +int nvme_write_zeros(int fd, __u32 nsid, __u64 slba, __u16 nlb, + __u16 control, __u32 reftag, __u16 apptag, __u16 appmask) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_write_zeroes, + .nsid = nsid, + .cdw10 = slba & 0xffffffff, + .cdw11 = slba >> 32, + .cdw12 = nlb | (control << 16), + .cdw14 = reftag, + .cdw15 = apptag | 
(appmask << 16), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_write_uncorrectable(int fd, __u32 nsid, __u64 slba, __u16 nlb) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_write_uncor, + .nsid = nsid, + .cdw10 = slba & 0xffffffff, + .cdw11 = slba >> 32, + .cdw12 = nlb, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_flush(int fd, __u32 nsid) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_flush, + .nsid = nsid, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_dsm(int fd, __u32 nsid, __u32 cdw11, struct nvme_dsm_range *dsm, + __u16 nr_ranges) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_dsm, + .nsid = nsid, + .addr = (__u64)(uintptr_t) dsm, + .data_len = nr_ranges * sizeof(*dsm), + .cdw10 = nr_ranges - 1, + .cdw11 = cdw11, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +struct nvme_dsm_range *nvme_setup_dsm_range(__u32 *ctx_attrs, __u32 *llbas, + __u64 *slbas, __u16 nr_ranges) +{ + int i; + struct nvme_dsm_range *dsm = malloc(nr_ranges * sizeof(*dsm)); + + if (!dsm) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + return NULL; + } + for (i = 0; i < nr_ranges; i++) { + dsm[i].cattr = cpu_to_le32(ctx_attrs[i]); + dsm[i].nlb = cpu_to_le32(llbas[i]); + dsm[i].slba = cpu_to_le64(slbas[i]); + } + return dsm; +} + +int nvme_resv_acquire(int fd, __u32 nsid, __u8 rtype, __u8 racqa, + bool iekey, __u64 crkey, __u64 nrkey) +{ + __le64 payload[2] = { cpu_to_le64(crkey), cpu_to_le64(nrkey) }; + __u32 cdw10 = (racqa & 0x7) | (iekey ? 
1 << 3 : 0) | rtype << 8; + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_acquire, + .nsid = nsid, + .cdw10 = cdw10, + .addr = (__u64)(uintptr_t) (payload), + .data_len = sizeof(payload), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_resv_register(int fd, __u32 nsid, __u8 rrega, __u8 cptpl, + bool iekey, __u64 crkey, __u64 nrkey) +{ + __le64 payload[2] = { cpu_to_le64(crkey), cpu_to_le64(nrkey) }; + __u32 cdw10 = (rrega & 0x7) | (iekey ? 1 << 3 : 0) | (__u32)cptpl << 30; /* widen first: cptpl promotes to int and 2 << 30 overflows it */ + + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_register, + .nsid = nsid, + .cdw10 = cdw10, + .addr = (__u64)(uintptr_t) (payload), + .data_len = sizeof(payload), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_resv_release(int fd, __u32 nsid, __u8 rtype, __u8 rrela, + bool iekey, __u64 crkey) +{ + __le64 payload[1] = { cpu_to_le64(crkey) }; + __u32 cdw10 = (rrela & 0x7) | (iekey ? 1 << 3 : 0) | rtype << 8; + + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_release, + .nsid = nsid, + .cdw10 = cdw10, + .addr = (__u64)(uintptr_t) (payload), + .data_len = sizeof(payload), + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_resv_report(int fd, __u32 nsid, __u32 numd, __u32 cdw11, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_resv_report, + .nsid = nsid, + .cdw10 = numd, + .cdw11 = cdw11, + .addr = (__u64)(uintptr_t) data, + .data_len = (numd + 1) << 2, + }; + + return nvme_submit_io_passthru(fd, &cmd); +} + +int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11, void *data) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_identify, + .nsid = nsid, + .addr = (__u64)(uintptr_t) data, + .data_len = NVME_IDENTIFY_DATA_SIZE, + .cdw10 = cdw10, + .cdw11 = cdw11, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data) +{ + return nvme_identify13(fd, nsid, cdw10, 0, data); +} + +int nvme_identify_ctrl(int fd, void 
*data) +{ + return nvme_identify(fd, 0, 1, data); +} + +int nvme_identify_ns(int fd, __u32 nsid, bool present, void *data) +{ + int cns = present ? NVME_ID_CNS_NS_PRESENT : NVME_ID_CNS_NS; + + return nvme_identify(fd, nsid, cns, data); +} + +int nvme_identify_ns_list(int fd, __u32 nsid, bool all, void *data) +{ + int cns = all ? NVME_ID_CNS_NS_PRESENT_LIST : NVME_ID_CNS_NS_ACTIVE_LIST; + + return nvme_identify(fd, nsid, cns, data); +} + +int nvme_identify_ctrl_list(int fd, __u32 nsid, __u16 cntid, void *data) +{ + int cns = nsid ? NVME_ID_CNS_CTRL_NS_LIST : NVME_ID_CNS_CTRL_LIST; + + return nvme_identify(fd, nsid, ((__u32)cntid << 16) | cns, data); +} + +int nvme_identify_ns_descs(int fd, __u32 nsid, void *data) +{ + + return nvme_identify(fd, nsid, NVME_ID_CNS_NS_DESC_LIST, data); +} + +int nvme_identify_nvmset(int fd, __u16 nvmset_id, void *data) +{ + return nvme_identify13(fd, 0, NVME_ID_CNS_NVMSET_LIST, nvmset_id, data); +} + +int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64 lpo, + __u16 lsi, bool rae, __u32 data_len, void *data) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_get_log_page, + .nsid = nsid, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + }; + __u32 numd = (data_len >> 2) - 1; + __u16 numdu = numd >> 16, numdl = numd & 0xffff; + + cmd.cdw10 = log_id | ((__u32)numdl << 16) | (rae ? 1 << 15 : 0); /* widen before shifting: a __u16 promotes to int */ + if (lsp) + cmd.cdw10 |= lsp << 8; + + cmd.cdw11 = numdu | ((__u32)lsi << 16); + cmd.cdw12 = lpo; + cmd.cdw13 = (lpo >> 32); + + return nvme_submit_admin_passthru(fd, &cmd); + +} + +int nvme_get_log(int fd, __u32 nsid, __u8 log_id, bool rae, + __u32 data_len, void *data) +{ + void *ptr = data; + __u32 offset = 0, xfer_len = data_len; + int ret; + + /* + * 4k is the smallest possible transfer unit, so by + * restricting ourselves to 4k transfers we avoid having + * to check the MDTS value of the controller. 
+ */ + do { + xfer_len = data_len - offset; + if (xfer_len > 4096) + xfer_len = 4096; + + ret = nvme_get_log13(fd, nsid, log_id, NVME_NO_LOG_LSP, + offset, 0, rae, xfer_len, ptr); + if (ret) + return ret; + + offset += xfer_len; + ptr += xfer_len; + } while (offset < data_len); + + return 0; +} + +int nvme_get_telemetry_log(int fd, void *lp, int generate_report, + int ctrl_init, size_t log_page_size, __u64 offset) +{ + if (ctrl_init) + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_TELEMETRY_CTRL, + NVME_NO_LOG_LSP, offset, + 0, 1, log_page_size, lp); + if (generate_report) + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_TELEMETRY_HOST, + NVME_TELEM_LSP_CREATE, offset, + 0, 1, log_page_size, lp); + else + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_TELEMETRY_HOST, + NVME_NO_LOG_LSP, offset, + 0, 1, log_page_size, lp); +} + +int nvme_fw_log(int fd, struct nvme_firmware_log_page *fw_log) +{ + return nvme_get_log(fd, NVME_NSID_ALL, NVME_LOG_FW_SLOT, true, + sizeof(*fw_log), fw_log); +} + +int nvme_changed_ns_list_log(int fd, struct nvme_changed_ns_list_log *changed_ns_list_log) +{ + return nvme_get_log(fd, 0, NVME_LOG_CHANGED_NS, true, + sizeof(changed_ns_list_log->log), + changed_ns_list_log->log); +} + +int nvme_error_log(int fd, int entries, struct nvme_error_log_page *err_log) +{ + return nvme_get_log(fd, NVME_NSID_ALL, NVME_LOG_ERROR, false, + entries * sizeof(*err_log), err_log); +} + +int nvme_endurance_log(int fd, __u16 group_id, struct nvme_endurance_group_log *endurance_log) +{ + return nvme_get_log13(fd, 0, NVME_LOG_ENDURANCE_GROUP, 0, 0, group_id, 0, + sizeof(*endurance_log), endurance_log); +} + +int nvme_smart_log(int fd, __u32 nsid, struct nvme_smart_log *smart_log) +{ + return nvme_get_log(fd, nsid, NVME_LOG_SMART, false, + sizeof(*smart_log), smart_log); +} + +int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo) +{ + __u64 lpo = 0; + + return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_ANA, rgo, lpo, 0, + true, ana_log_len, 
ana_log); +} + +int nvme_self_test_log(int fd, struct nvme_self_test_log *self_test_log) +{ + return nvme_get_log(fd, NVME_NSID_ALL, NVME_LOG_DEVICE_SELF_TEST, false, + sizeof(*self_test_log), self_test_log); +} + +int nvme_effects_log(int fd, struct nvme_effects_log_page *effects_log) +{ + return nvme_get_log(fd, 0, NVME_LOG_CMD_EFFECTS, false, + sizeof(*effects_log), effects_log); +} + +int nvme_discovery_log(int fd, struct nvmf_disc_rsp_page_hdr *log, __u32 size) +{ + return nvme_get_log(fd, 0, NVME_LOG_DISC, false, size, log); +} + +int nvme_sanitize_log(int fd, struct nvme_sanitize_log_page *sanitize_log) +{ + return nvme_get_log(fd, 0, NVME_LOG_SANITIZE, false, + sizeof(*sanitize_log), sanitize_log); +} + +int nvme_feature(int fd, __u8 opcode, __u32 nsid, __u32 cdw10, __u32 cdw11, + __u32 cdw12, __u32 data_len, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = opcode, + .nsid = nsid, + .cdw10 = cdw10, + .cdw11 = cdw11, + .cdw12 = cdw12, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_set_feature(int fd, __u32 nsid, __u8 fid, __u32 value, __u32 cdw12, + bool save, __u32 data_len, void *data, __u32 *result) +{ + __u32 cdw10 = fid | (save ? 
1U << 31 : 0); /* unsigned literal: 1 << 31 overflows a signed int */ + + return nvme_feature(fd, nvme_admin_set_features, nsid, cdw10, value, + cdw12, data_len, data, result); +} + +static int nvme_property(int fd, __u8 fctype, __le32 off, __le64 *value, __u8 attrib) +{ + int err; + struct nvme_admin_cmd cmd = { + .opcode = nvme_fabrics_command, + .cdw10 = attrib, + .cdw11 = off, + }; + + if (!value) { + errno = EINVAL; + return -errno; + } + + if (fctype == nvme_fabrics_type_property_get){ + cmd.nsid = nvme_fabrics_type_property_get; + } else if(fctype == nvme_fabrics_type_property_set) { + cmd.nsid = nvme_fabrics_type_property_set; + cmd.cdw12 = *value; + } else { + errno = EINVAL; + return -errno; + } + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && fctype == nvme_fabrics_type_property_get) + *value = cpu_to_le64(cmd.result); + return err; +} + +static int get_property_helper(int fd, int offset, void *value, int *advance) +{ + __le64 value64; + int err = -EINVAL; + + switch (offset) { + case NVME_REG_CAP: + case NVME_REG_ASQ: + case NVME_REG_ACQ: + *advance = 8; + break; + default: + *advance = 4; + } + + if (!value) + return err; + + err = nvme_property(fd, nvme_fabrics_type_property_get, + cpu_to_le32(offset), &value64, (*advance == 8)); + + if (!err) { + if (*advance == 8) + *((uint64_t *)value) = le64_to_cpu(value64); + else + *((uint32_t *)value) = le32_to_cpu(value64); + } + + return err; +} + +int nvme_get_property(int fd, int offset, uint64_t *value) +{ + int advance; + return get_property_helper(fd, offset, value, &advance); +} + +int nvme_get_properties(int fd, void **pbar) +{ + int offset, advance; + int err, ret = -EINVAL; + int size = getpagesize(); + + *pbar = malloc(size); + if (!*pbar) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + return -ENOMEM; + } + + memset(*pbar, 0xff, size); + for (offset = NVME_REG_CAP; offset <= NVME_REG_CMBSZ; offset += advance) { + err = get_property_helper(fd, offset, *pbar + offset, &advance); + if (!err) + ret = 0; + } + + return ret; +} + +int 
nvme_set_property(int fd, int offset, int value) +{ + __le64 val = cpu_to_le64(value); + __le32 off = cpu_to_le32(offset); + bool is64bit; + + switch (offset) { /* compare the host-order offset; 'off' is little-endian and only goes on the wire */ + case NVME_REG_CAP: + case NVME_REG_ASQ: + case NVME_REG_ACQ: + is64bit = true; + break; + default: + is64bit = false; + } + + return nvme_property(fd, nvme_fabrics_type_property_set, + off, &val, is64bit ? 1: 0); +} + +int nvme_get_feature(int fd, __u32 nsid, __u8 fid, __u8 sel, __u32 cdw11, + __u32 data_len, void *data, __u32 *result) +{ + __u32 cdw10 = fid | sel << 8; + + return nvme_feature(fd, nvme_admin_get_features, nsid, cdw10, cdw11, + 0, data_len, data, result); +} + +int nvme_format(int fd, __u32 nsid, __u8 lbaf, __u8 ses, __u8 pi, + __u8 pil, __u8 ms, __u32 timeout) +{ + __u32 cdw10 = lbaf | ms << 4 | pi << 5 | pil << 8 | ses << 9; + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_format_nvm, + .nsid = nsid, + .cdw10 = cdw10, + .timeout_ms = timeout, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_ns_create(int fd, __u64 nsze, __u64 ncap, __u8 flbas, + __u8 dps, __u8 nmic, __u32 *result) +{ + struct nvme_id_ns ns = { + .nsze = cpu_to_le64(nsze), + .ncap = cpu_to_le64(ncap), + .flbas = flbas, + .dps = dps, + .nmic = nmic, + }; + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_ns_mgmt, + .addr = (__u64)(uintptr_t) ((void *)&ns), + .cdw10 = 0, + .data_len = 0x1000, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_ns_delete(int fd, __u32 nsid) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_ns_mgmt, + .nsid = nsid, + .cdw10 = 1, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_ns_attachment(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist, + bool attach) +{ + int i; + __u8 buf[0x1000]; + struct nvme_controller_list *cntlist = + (struct nvme_controller_list *)buf; + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_ns_attach, + .nsid = nsid, + 
.addr = (__u64)(uintptr_t) cntlist, + .cdw10 = attach ? 0 : 1, + .data_len = 0x1000, + }; + + memset(buf, 0, sizeof(buf)); + cntlist->num = cpu_to_le16(num_ctrls); + for (i = 0; i < num_ctrls; i++) + cntlist->identifier[i] = cpu_to_le16(ctrlist[i]); + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_ns_attach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist) +{ + return nvme_ns_attachment(fd, nsid, num_ctrls, ctrlist, true); +} + +int nvme_ns_detach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist) +{ + return nvme_ns_attachment(fd, nsid, num_ctrls, ctrlist, false); +} + +int nvme_fw_download(int fd, __u32 offset, __u32 data_len, void *data) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_download_fw, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .cdw10 = (data_len >> 2) - 1, + .cdw11 = offset >> 2, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_fw_commit(int fd, __u8 slot, __u8 action, __u8 bpid) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_activate_fw, + .cdw10 = (bpid << 31) | (action << 3) | slot, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_sec_send(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 tl, __u32 data_len, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_security_send, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .nsid = nsid, + .cdw10 = secp << 24 | spsp << 8 | nssf, + .cdw11 = tl, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_sec_recv(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 al, __u32 data_len, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_security_recv, + .nsid = nsid, + .cdw10 = secp << 24 | spsp << 8 | nssf, + .cdw11 = al, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + }; + int err; + + err = 
nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_dir_send(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_directive_send, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .nsid = nsid, + .cdw10 = data_len? (data_len >> 2) - 1 : 0, + .cdw11 = (__u32)dspec << 16 | dtype << 8 | doper, /* widen first: a __u16 promotes to int */ + .cdw12 = dw12, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_dir_recv(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_directive_recv, + .addr = (__u64)(uintptr_t) data, + .data_len = data_len, + .nsid = nsid, + .cdw10 = data_len? (data_len >> 2) - 1 : 0, + .cdw11 = (__u32)dspec << 16 | dtype << 8 | doper, /* widen first: a __u16 promotes to int */ + .cdw12 = dw12, + }; + int err; + + err = nvme_submit_admin_passthru(fd, &cmd); + if (!err && result) + *result = cmd.result; + return err; +} + +int nvme_sanitize(int fd, __u8 sanact, __u8 ause, __u8 owpass, __u8 oipbp, + __u8 no_dealloc, __u32 ovrpat) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_sanitize_nvm, + .cdw10 = no_dealloc << 9 | oipbp << 8 | + owpass << NVME_SANITIZE_OWPASS_SHIFT | + ause << 3 | sanact, + .cdw11 = ovrpat, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} + +int nvme_self_test_start(int fd, __u32 nsid, __u32 cdw10) +{ + struct nvme_admin_cmd cmd = { + .opcode = nvme_admin_dev_self_test, + .nsid = nsid, + .cdw10 = cdw10, + }; + + return nvme_submit_admin_passthru(fd, &cmd); +} diff --git a/libmultipath/nvme/nvme-ioctl.h b/libmultipath/nvme/nvme-ioctl.h new file mode 100644 index 0000000..3fb740c --- /dev/null +++ b/libmultipath/nvme/nvme-ioctl.h @@ -0,0 +1,139 @@ +#ifndef _NVME_LIB_H +#define _NVME_LIB_H + +#include +#include +#include 
"linux/nvme_ioctl.h" +#include "nvme.h" + +int nvme_get_nsid(int fd); + +/* Generic passthrough */ +int nvme_submit_passthru(int fd, unsigned long ioctl_cmd, + struct nvme_passthru_cmd *cmd); + +int nvme_passthru(int fd, unsigned long ioctl_cmd, __u8 opcode, __u8 flags, + __u16 rsvd, __u32 nsid, __u32 cdw2, __u32 cdw3, + __u32 cdw10, __u32 cdw11, __u32 cdw12, + __u32 cdw13, __u32 cdw14, __u32 cdw15, + __u32 data_len, void *data, __u32 metadata_len, + void *metadata, __u32 timeout_ms, __u32 *result); + +/* NVME_SUBMIT_IO */ +int nvme_io(int fd, __u8 opcode, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +int nvme_read(int fd, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +int nvme_write(int fd, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +int nvme_compare(int fd, __u64 slba, __u16 nblocks, __u16 control, + __u32 dsmgmt, __u32 reftag, __u16 apptag, + __u16 appmask, void *data, void *metadata); + +/* NVME_IO_CMD */ +int nvme_passthru_io(int fd, __u8 opcode, __u8 flags, __u16 rsvd, + __u32 nsid, __u32 cdw2, __u32 cdw3, + __u32 cdw10, __u32 cdw11, __u32 cdw12, + __u32 cdw13, __u32 cdw14, __u32 cdw15, + __u32 data_len, void *data, __u32 metadata_len, + void *metadata, __u32 timeout); + +int nvme_write_zeros(int fd, __u32 nsid, __u64 slba, __u16 nlb, + __u16 control, __u32 reftag, __u16 apptag, __u16 appmask); + +int nvme_write_uncorrectable(int fd, __u32 nsid, __u64 slba, __u16 nlb); + +int nvme_flush(int fd, __u32 nsid); + +int nvme_dsm(int fd, __u32 nsid, __u32 cdw11, struct nvme_dsm_range *dsm, + __u16 nr_ranges); +struct nvme_dsm_range *nvme_setup_dsm_range(__u32 *ctx_attrs, + __u32 *llbas, __u64 *slbas, + __u16 nr_ranges); + +int nvme_resv_acquire(int fd, __u32 nsid, __u8 rtype, __u8 racqa, + bool iekey, 
__u64 crkey, __u64 nrkey); +int nvme_resv_register(int fd, __u32 nsid, __u8 rrega, __u8 cptpl, + bool iekey, __u64 crkey, __u64 nrkey); +int nvme_resv_release(int fd, __u32 nsid, __u8 rtype, __u8 rrela, + bool iekey, __u64 crkey); +int nvme_resv_report(int fd, __u32 nsid, __u32 numd, __u32 cdw11, void *data); + +int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11, void *data); +int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data); +int nvme_identify_ctrl(int fd, void *data); +int nvme_identify_ns(int fd, __u32 nsid, bool present, void *data); +int nvme_identify_ns_list(int fd, __u32 nsid, bool all, void *data); +int nvme_identify_ctrl_list(int fd, __u32 nsid, __u16 cntid, void *data); +int nvme_identify_ns_descs(int fd, __u32 nsid, void *data); +int nvme_identify_nvmset(int fd, __u16 nvmset_id, void *data); +int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64 lpo, + __u16 group_id, bool rae, __u32 data_len, void *data); +int nvme_get_log(int fd, __u32 nsid, __u8 log_id, bool rae, + __u32 data_len, void *data); + + +int nvme_get_telemetry_log(int fd, void *lp, int generate_report, + int ctrl_gen, size_t log_page_size, __u64 offset); +int nvme_fw_log(int fd, struct nvme_firmware_log_page *fw_log); +int nvme_changed_ns_list_log(int fd, + struct nvme_changed_ns_list_log *changed_ns_list_log); +int nvme_error_log(int fd, int entries, struct nvme_error_log_page *err_log); +int nvme_smart_log(int fd, __u32 nsid, struct nvme_smart_log *smart_log); +int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo); +int nvme_effects_log(int fd, struct nvme_effects_log_page *effects_log); +int nvme_discovery_log(int fd, struct nvmf_disc_rsp_page_hdr *log, __u32 size); +int nvme_sanitize_log(int fd, struct nvme_sanitize_log_page *sanitize_log); +int nvme_endurance_log(int fd, __u16 group_id, + struct nvme_endurance_group_log *endurance_log); + +int nvme_feature(int fd, __u8 opcode, __u32 nsid, __u32 cdw10, + __u32 cdw11, __u32 cdw12, 
__u32 data_len, void *data, + __u32 *result); +int nvme_set_feature(int fd, __u32 nsid, __u8 fid, __u32 value, __u32 cdw12, + bool save, __u32 data_len, void *data, __u32 *result); +int nvme_get_feature(int fd, __u32 nsid, __u8 fid, __u8 sel, + __u32 cdw11, __u32 data_len, void *data, __u32 *result); + +int nvme_format(int fd, __u32 nsid, __u8 lbaf, __u8 ses, __u8 pi, + __u8 pil, __u8 ms, __u32 timeout); + +int nvme_ns_create(int fd, __u64 nsze, __u64 ncap, __u8 flbas, + __u8 dps, __u8 nmic, __u32 *result); +int nvme_ns_delete(int fd, __u32 nsid); + +int nvme_ns_attachment(int fd, __u32 nsid, __u16 num_ctrls, + __u16 *ctrlist, bool attach); +int nvme_ns_attach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist); +int nvme_ns_detach_ctrls(int fd, __u32 nsid, __u16 num_ctrls, __u16 *ctrlist); + +int nvme_fw_download(int fd, __u32 offset, __u32 data_len, void *data); +int nvme_fw_commit(int fd, __u8 slot, __u8 action, __u8 bpid); + +int nvme_sec_send(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 tl, __u32 data_len, void *data, __u32 *result); +int nvme_sec_recv(int fd, __u32 nsid, __u8 nssf, __u16 spsp, + __u8 secp, __u32 al, __u32 data_len, void *data, __u32 *result); + +int nvme_subsystem_reset(int fd); +int nvme_reset_controller(int fd); +int nvme_ns_rescan(int fd); + +int nvme_dir_send(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result); +int nvme_dir_recv(int fd, __u32 nsid, __u16 dspec, __u8 dtype, __u8 doper, + __u32 data_len, __u32 dw12, void *data, __u32 *result); +int nvme_get_properties(int fd, void **pbar); +int nvme_set_property(int fd, int offset, int value); +int nvme_get_property(int fd, int offset, uint64_t *value); +int nvme_sanitize(int fd, __u8 sanact, __u8 ause, __u8 owpass, __u8 oipbp, + __u8 no_dealloc, __u32 ovrpat); +int nvme_self_test_start(int fd, __u32 nsid, __u32 cdw10); +int nvme_self_test_log(int fd, struct nvme_self_test_log *self_test_log); +#endif /* 
_NVME_LIB_H */ diff --git a/libmultipath/nvme/nvme.h b/libmultipath/nvme/nvme.h new file mode 100644 index 0000000..685d179 --- /dev/null +++ b/libmultipath/nvme/nvme.h @@ -0,0 +1,163 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVME_H +#define _NVME_H + +#include +#include +#include +#include "plugin.h" +#include "json.h" + +#define unlikely(x) (x) /* parenthesised so the argument cannot rebind to neighbouring operators */ + +#ifdef LIBUUID +#include +#else +typedef struct { + uint8_t b[16]; +} uuid_t; +#endif + +#include "linux/nvme.h" + +struct nvme_effects_log_page { + __le32 acs[256]; + __le32 iocs[256]; + __u8 resv[2048]; +}; + +struct nvme_error_log_page { + __u64 error_count; + __u16 sqid; + __u16 cmdid; + __u16 status_field; + __u16 parm_error_location; + __u64 lba; + __u32 nsid; + __u8 vs; + __u8 resv[3]; + __u64 cs; + __u8 resv2[24]; +}; + +struct nvme_firmware_log_page { + __u8 afi; + __u8 resv[7]; + __u64 frs[7]; + __u8 resv2[448]; +}; + +/* idle and active power scales occupy the last 2 bits of the field */ +#define POWER_SCALE(s) ((s) >> 6) + +struct nvme_host_mem_buffer { + __u32 hsize; + __u32 hmdlal; + __u32 hmdlau; + __u32 hmdlec; + __u8 rsvd16[4080]; +}; + +struct nvme_auto_pst { + __u32 data; + __u32 rsvd32; +}; + +struct nvme_timestamp { + __u8 timestamp[6]; + __u8 attr; + __u8 rsvd; +}; + +struct nvme_controller_list { + __le16 num; + __le16 identifier[]; +}; + +struct nvme_bar_cap { + __u16 mqes; + __u8 ams_cqr; + __u8 to; + __u16 bps_css_nssrs_dstrd; + __u8 mpsmax_mpsmin; + __u8 reserved; +}; + 
+#ifdef __CHECKER__ +#define __force __attribute__((force)) +#else +#define __force +#endif + +#define cpu_to_le16(x) \ + ((__force __le16)htole16(x)) +#define cpu_to_le32(x) \ + ((__force __le32)htole32(x)) +#define cpu_to_le64(x) \ + ((__force __le64)htole64(x)) + +#define le16_to_cpu(x) \ + le16toh((__force __u16)(x)) +#define le32_to_cpu(x) \ + le32toh((__force __u32)(x)) +#define le64_to_cpu(x) \ + le64toh((__force __u64)(x)) + +#define MAX_LIST_ITEMS 256 +struct list_item { + char node[1024]; + struct nvme_id_ctrl ctrl; + int nsid; + struct nvme_id_ns ns; + unsigned block; +}; + +struct ctrl_list_item { + char *name; + char *address; + char *transport; + char *state; + char *ana_state; +}; + +struct subsys_list_item { + char *name; + char *subsysnqn; + int nctrls; + struct ctrl_list_item *ctrls; +}; + +enum { + NORMAL, + JSON, + BINARY, +}; + +void register_extension(struct plugin *plugin); + +#include "argconfig.h" +int parse_and_open(int argc, char **argv, const char *desc, + const struct argconfig_commandline_options *clo, void *cfg, size_t size); + +extern const char *devicename; + +int __id_ctrl(int argc, char **argv, struct command *cmd, struct plugin *plugin, void (*vs)(__u8 *vs, struct json_object *root)); +int validate_output_format(char *format); + +struct subsys_list_item *get_subsys_list(int *subcnt, char *subsysnqn, __u32 nsid); +void free_subsys_list(struct subsys_list_item *slist, int n); +char *nvme_char_from_block(char *block); +#endif /* _NVME_H */ diff --git a/libmultipath/nvme/plugin.h b/libmultipath/nvme/plugin.h new file mode 100644 index 0000000..91079fb --- /dev/null +++ b/libmultipath/nvme/plugin.h @@ -0,0 +1,36 @@ +#ifndef PLUGIN_H +#define PLUGIN_H + +#include + +struct program { + const char *name; + const char *version; + const char *usage; + const char *desc; + const char *more; + struct command **commands; + struct plugin *extensions; +}; + +struct plugin { + const char *name; + const char *desc; + struct command **commands; + 
struct program *parent; + struct plugin *next; + struct plugin *tail; +}; + +struct command { + char *name; + char *help; + int (*fn)(int argc, char **argv, struct command *command, struct plugin *plugin); + char *alias; +}; + +void usage(struct plugin *plugin); +void general_help(struct plugin *plugin); +int handle_plugin(int argc, char **argv, struct plugin *plugin); + +#endif diff --git a/libmultipath/prio.c b/libmultipath/prio.c index 17acfd0..0590218 100644 --- a/libmultipath/prio.c +++ b/libmultipath/prio.c @@ -42,7 +42,7 @@ void free_prio (struct prio * p) return; p->refcount--; if (p->refcount) { - condlog(3, "%s prioritizer refcount %d", + condlog(4, "%s prioritizer refcount %d", p->name, p->refcount); return; } diff --git a/libmultipath/prio.h b/libmultipath/prio.h index aa587cc..599d1d8 100644 --- a/libmultipath/prio.h +++ b/libmultipath/prio.h @@ -30,6 +30,7 @@ struct path; #define PRIO_WEIGHTED_PATH "weightedpath" #define PRIO_SYSFS "sysfs" #define PRIO_PATH_LATENCY "path_latency" +#define PRIO_ANA "ana" /* * Value used to mark the fact prio was not defined diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile index ab7bc07..4d80c20 100644 --- a/libmultipath/prioritizers/Makefile +++ b/libmultipath/prioritizers/Makefile @@ -21,6 +21,11 @@ LIBS = \ libpriopath_latency.so \ libpriosysfs.so +ifneq ($(call check_file,/usr/include/linux/nvme_ioctl.h),0) + LIBS += libprioana.so + CFLAGS += -I../nvme +endif + all: $(LIBS) libprioalua.so: alua.o alua_rtpg.o diff --git a/libmultipath/prioritizers/ana.c b/libmultipath/prioritizers/ana.c new file mode 100644 index 0000000..990d935 --- /dev/null +++ b/libmultipath/prioritizers/ana.c @@ -0,0 +1,232 @@ +/* + * (C) Copyright HUAWEI Technology Corp. 2017 All Rights Reserved. + * + * ana.c + * Version 1.00 + * + * Tool to make use of a NVMe-feature called Asymmetric Namespace Access. + * It determines the ANA state of a device and prints a priority value to stdout. 
+ * + * Author(s): Cheng Jike + * Li Jie + * + * This file is released under the GPL version 2, or any later version. + */ +#include +#include +#include +#include +#include +#include + +#include "debug.h" +#include "nvme-lib.h" +#include "prio.h" +#include "util.h" +#include "structs.h" + +enum { + ANA_ERR_GETCTRL_FAILED = 1, + ANA_ERR_NOT_NVME, + ANA_ERR_NOT_SUPPORTED, + ANA_ERR_GETANAS_OVERFLOW, + ANA_ERR_GETANAS_NOTFOUND, + ANA_ERR_GETANALOG_FAILED, + ANA_ERR_GETNSID_FAILED, + ANA_ERR_GETNS_FAILED, + ANA_ERR_NO_MEMORY, + ANA_ERR_NO_INFORMATION, +}; + +static const char *ana_errmsg[] = { + [ANA_ERR_GETCTRL_FAILED] = "couldn't get ctrl info", + [ANA_ERR_NOT_NVME] = "not an NVMe device", + [ANA_ERR_NOT_SUPPORTED] = "ANA not supported", + [ANA_ERR_GETANAS_OVERFLOW] = "buffer overflow in ANA log", + [ANA_ERR_GETANAS_NOTFOUND] = "NSID or ANAGRPID not found", + [ANA_ERR_GETANALOG_FAILED] = "couldn't get ana log", + [ANA_ERR_GETNSID_FAILED] = "couldn't get NSID", + [ANA_ERR_GETNS_FAILED] = "couldn't get namespace info", + [ANA_ERR_NO_MEMORY] = "out of memory", + [ANA_ERR_NO_INFORMATION] = "invalid fd", +}; + +static const char *anas_string[] = { + [NVME_ANA_OPTIMIZED] = "ANA Optimized State", + [NVME_ANA_NONOPTIMIZED] = "ANA Non-Optimized State", + [NVME_ANA_INACCESSIBLE] = "ANA Inaccessible State", + [NVME_ANA_PERSISTENT_LOSS] = "ANA Persistent Loss State", + [NVME_ANA_CHANGE] = "ANA Change state", +}; + +static const char *aas_print_string(int rc) +{ + rc &= 0xff; + if (rc >= 0 && rc < ARRAY_SIZE(anas_string) && + anas_string[rc] != NULL) + return anas_string[rc]; + + return "invalid ANA state"; +} + +static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log, + size_t ana_log_len) +{ + void *base = ana_log; + struct nvme_ana_rsp_hdr *hdr = base; + struct nvme_ana_group_desc *ana_desc; + size_t offset = sizeof(struct nvme_ana_rsp_hdr); + __u32 nr_nsids; + size_t nsid_buf_size; + int i, j; + + for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) { + ana_desc = base + 
offset; + + offset += sizeof(*ana_desc); + if (offset > ana_log_len) + return -ANA_ERR_GETANAS_OVERFLOW; + + nr_nsids = le32_to_cpu(ana_desc->nnsids); + nsid_buf_size = nr_nsids * sizeof(__le32); + + offset += nsid_buf_size; + if (offset > ana_log_len) + return -ANA_ERR_GETANAS_OVERFLOW; + + for (j = 0; j < nr_nsids; j++) { + if (nsid == le32_to_cpu(ana_desc->nsids[j])) + return ana_desc->state; + } + + if (anagrpid != 0 && anagrpid == le32_to_cpu(ana_desc->grpid)) + return ana_desc->state; + + } + return -ANA_ERR_GETANAS_NOTFOUND; +} + +int get_ana_info(struct path * pp, unsigned int timeout) +{ + int rc; + __u32 nsid; + struct nvme_id_ctrl ctrl; + struct nvme_id_ns ns; + void *ana_log; + size_t ana_log_len; + bool is_anagrpid_const; + + rc = nvme_id_ctrl_ana(pp->fd, &ctrl); + if (rc < 0) { + log_nvme_errcode(rc, pp->dev, "nvme_identify_ctrl"); + return -ANA_ERR_GETCTRL_FAILED; + } else if (rc == 0) + return -ANA_ERR_NOT_SUPPORTED; + + nsid = nvme_get_nsid(pp->fd); + if (nsid <= 0) { + log_nvme_errcode(rc, pp->dev, "nvme_get_nsid"); + return -ANA_ERR_GETNSID_FAILED; + } + is_anagrpid_const = ctrl.anacap & (1 << 6); + + /* + * Code copied from nvme-cli/nvme.c. We don't need to allocate an + * [nanagrpid*mnan] array of NSIDs because each NSID can occur at most + * in one ANA group. + */ + ana_log_len = sizeof(struct nvme_ana_rsp_hdr) + + le32_to_cpu(ctrl.nanagrpid) + * sizeof(struct nvme_ana_group_desc); + + if (is_anagrpid_const) { + rc = nvme_identify_ns(pp->fd, nsid, 0, &ns); + if (rc) { + log_nvme_errcode(rc, pp->dev, "nvme_identify_ns"); + return -ANA_ERR_GETNS_FAILED; + } + } else + ana_log_len += le32_to_cpu(ctrl.mnan) * sizeof(__le32); + + ana_log = malloc(ana_log_len); + if (!ana_log) + return -ANA_ERR_NO_MEMORY; + pthread_cleanup_push(free, ana_log); + rc = nvme_ana_log(pp->fd, ana_log, ana_log_len, + is_anagrpid_const ? 
NVME_ANA_LOG_RGO : 0); + if (rc) { + log_nvme_errcode(rc, pp->dev, "nvme_ana_log"); + rc = -ANA_ERR_GETANALOG_FAILED; + } else + rc = get_ana_state(nsid, + is_anagrpid_const ? + le32_to_cpu(ns.anagrpid) : 0, + ana_log, ana_log_len); + pthread_cleanup_pop(1); + if (rc >= 0) + condlog(3, "%s: ana state = %02x [%s]", pp->dev, rc, + aas_print_string(rc)); + return rc; +} + +/* + * Priorities modeled roughly after the ALUA model (alua.c/sysfs.c) + * Reference: ANA Base Protocol (NVMe TP 4004a, 11/13/2018). + * + * Differences: + * + * - The ANA base spec defines no implicit or explicit (STPG) state management. + * If a state is encountered that doesn't allow normal I/O (all except + * OPTIMIZED and NON_OPTIMIZED), we can't do anything but either wait for a + * Access State Change Notice (can't do that in multipathd as we don't receive + * those), or retry commands in regular time intervals until ANATT is expired + * (not implemented). Mapping UNAVAILABLE state to ALUA STANDBY is the best we + * can currently do. + * + * FIXME: Waiting for ANATT could be implemented with a "delayed failback" + * mechanism. The current "failback" method can't be used, as it would + * affect failback to every state, and here only failback to UNAVAILABLE + * should be delayed. + * + * - PERSISTENT_LOSS state is even below ALUA's UNAVAILABLE state. + * FIXME: According to the ANA TP, accessing paths in PERSISTENT_LOSS state + * in any way makes no sense (e.g. §8.19.6 - paths in this state shouldn't + * even be checked under "all paths down" conditions). Device mapper can, + * and will, select a PG for IO if it has non-failed paths, even if the + * PG has priority 0. We could avoid that only with an "ANA path checker". + * + * - ALUA has no CHANGE state. The ANA TP §8.18.3 / §8.19.4 suggests + * that CHANGE state should be treated in roughly the same way as + * INACCESSIBLE. Therefore we assign the same prio to it. + * + * - ALUA's LBA-dependent state has no ANA equivalent. 
+ */ + +int getprio(struct path *pp, char *args, unsigned int timeout) +{ + int rc; + + if (pp->fd < 0) + rc = -ANA_ERR_NO_INFORMATION; + else + rc = get_ana_info(pp, timeout); + + switch (rc) { + case NVME_ANA_OPTIMIZED: + return 50; + case NVME_ANA_NONOPTIMIZED: + return 10; + case NVME_ANA_INACCESSIBLE: + case NVME_ANA_CHANGE: + return 1; + case NVME_ANA_PERSISTENT_LOSS: + return 0; + default: + break; + } + if (rc < 0 && -rc < ARRAY_SIZE(ana_errmsg)) + condlog(2, "%s: ANA error: %s", pp->dev, ana_errmsg[-rc]); + else + condlog(1, "%s: invalid ANA rc code %d", pp->dev, rc); + return -1; +} diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c index 970a3b5..98068f3 100644 --- a/libmultipath/propsel.c +++ b/libmultipath/propsel.c @@ -5,6 +5,7 @@ */ #include +#include "nvme-lib.h" #include "checkers.h" #include "memory.h" #include "vector.h" @@ -74,6 +75,8 @@ static const char cmdline_origin[] = "(setting: multipath command line [-p] flag)"; static const char autodetect_origin[] = "(setting: storage device autodetected)"; +static const char marginal_path_origin[] = + "(setting: implied by marginal_path check)"; #define do_default(dest, value) \ do { \ @@ -548,13 +551,25 @@ detect_prio(struct config *conf, struct path * pp) { struct prio *p = &pp->prio; char buff[512]; - char *default_prio = PRIO_ALUA; - - if (pp->tpgs <= 0) - return; - if (pp->tpgs == 2 || !check_rdac(pp)) { - if (sysfs_get_asymmetric_access_state(pp, buff, 512) >= 0) + char *default_prio; + + switch(pp->bus) { + case SYSFS_BUS_NVME: + if (nvme_id_ctrl_ana(pp->fd, NULL) == 0) + return; + default_prio = PRIO_ANA; + break; + case SYSFS_BUS_SCSI: + if (pp->tpgs <= 0) + return; + if ((pp->tpgs == 2 || !check_rdac(pp)) && + sysfs_get_asymmetric_access_state(pp, buff, 512) >= 0) default_prio = PRIO_SYSFS; + else + default_prio = PRIO_ALUA; + break; + default: + return; } prio_get(conf->multipath_dir, p, default_prio, DEFAULT_PRIO_ARGS); } @@ -855,8 +870,9 @@ int select_delay_watch_checks(struct 
config *conf, struct multipath *mp) mp_set_conf(delay_watch_checks); mp_set_default(delay_watch_checks, DEFAULT_DELAY_CHECKS); out: - print_off_int_undef(buff, 12, mp->delay_watch_checks); - condlog(3, "%s: delay_watch_checks = %s %s", mp->alias, buff, origin); + if (print_off_int_undef(buff, 12, mp->delay_watch_checks) != 0) + condlog(3, "%s: delay_watch_checks = %s %s", + mp->alias, buff, origin); return 0; } @@ -871,8 +887,91 @@ int select_delay_wait_checks(struct config *conf, struct multipath *mp) mp_set_conf(delay_wait_checks); mp_set_default(delay_wait_checks, DEFAULT_DELAY_CHECKS); out: - print_off_int_undef(buff, 12, mp->delay_wait_checks); - condlog(3, "%s: delay_wait_checks = %s %s", mp->alias, buff, origin); + if (print_off_int_undef(buff, 12, mp->delay_wait_checks) != 0) + condlog(3, "%s: delay_wait_checks = %s %s", + mp->alias, buff, origin); + return 0; + +} + +static int san_path_deprecated_warned; +#define warn_san_path_deprecated(v, x) \ + do { \ + if (v->x > 0 && !san_path_deprecated_warned) { \ + san_path_deprecated_warned = 1; \ + condlog(1, "WARNING: option %s is deprecated, " \ + "please use marginal_path options instead", \ + #x); \ + } \ + } while(0) + +int select_san_path_err_threshold(struct config *conf, struct multipath *mp) +{ + const char *origin; + char buff[12]; + + if (marginal_path_check_enabled(mp)) { + mp->san_path_err_threshold = NU_NO; + origin = marginal_path_origin; + goto out; + } + mp_set_mpe(san_path_err_threshold); + mp_set_ovr(san_path_err_threshold); + mp_set_hwe(san_path_err_threshold); + mp_set_conf(san_path_err_threshold); + mp_set_default(san_path_err_threshold, DEFAULT_ERR_CHECKS); +out: + if (print_off_int_undef(buff, 12, mp->san_path_err_threshold) != 0) + condlog(3, "%s: san_path_err_threshold = %s %s", + mp->alias, buff, origin); + warn_san_path_deprecated(mp, san_path_err_threshold); + return 0; +} + +int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp) +{ + const char *origin; + 
char buff[12]; + + if (marginal_path_check_enabled(mp)) { + mp->san_path_err_forget_rate = NU_NO; + origin = marginal_path_origin; + goto out; + } + mp_set_mpe(san_path_err_forget_rate); + mp_set_ovr(san_path_err_forget_rate); + mp_set_hwe(san_path_err_forget_rate); + mp_set_conf(san_path_err_forget_rate); + mp_set_default(san_path_err_forget_rate, DEFAULT_ERR_CHECKS); +out: + if (print_off_int_undef(buff, 12, mp->san_path_err_forget_rate) != 0) + condlog(3, "%s: san_path_err_forget_rate = %s %s", mp->alias, + buff, origin); + warn_san_path_deprecated(mp, san_path_err_forget_rate); + return 0; + +} + +int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp) +{ + const char *origin; + char buff[12]; + + if (marginal_path_check_enabled(mp)) { + mp->san_path_err_recovery_time = NU_NO; + origin = marginal_path_origin; + goto out; + } + mp_set_mpe(san_path_err_recovery_time); + mp_set_ovr(san_path_err_recovery_time); + mp_set_hwe(san_path_err_recovery_time); + mp_set_conf(san_path_err_recovery_time); + mp_set_default(san_path_err_recovery_time, DEFAULT_ERR_CHECKS); +out: + if (print_off_int_undef(buff, 12, mp->san_path_err_recovery_time) != 0) + condlog(3, "%s: san_path_err_recovery_time = %s %s", mp->alias, + buff, origin); + warn_san_path_deprecated(mp, san_path_err_recovery_time); return 0; } @@ -888,9 +987,10 @@ int select_marginal_path_err_sample_time(struct config *conf, struct multipath * mp_set_conf(marginal_path_err_sample_time); mp_set_default(marginal_path_err_sample_time, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, mp->marginal_path_err_sample_time); - condlog(3, "%s: marginal_path_err_sample_time = %s %s", mp->alias, buff, - origin); + if (print_off_int_undef(buff, 12, mp->marginal_path_err_sample_time) + != 0) + condlog(3, "%s: marginal_path_err_sample_time = %s %s", + mp->alias, buff, origin); return 0; } @@ -905,9 +1005,10 @@ int select_marginal_path_err_rate_threshold(struct config *conf, struct multipat 
mp_set_conf(marginal_path_err_rate_threshold); mp_set_default(marginal_path_err_rate_threshold, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, mp->marginal_path_err_rate_threshold); - condlog(3, "%s: marginal_path_err_rate_threshold = %s %s", mp->alias, buff, - origin); + if (print_off_int_undef(buff, 12, mp->marginal_path_err_rate_threshold) + != 0) + condlog(3, "%s: marginal_path_err_rate_threshold = %s %s", + mp->alias, buff, origin); return 0; } @@ -922,9 +1023,10 @@ int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multip mp_set_conf(marginal_path_err_recheck_gap_time); mp_set_default(marginal_path_err_recheck_gap_time, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, mp->marginal_path_err_recheck_gap_time); - condlog(3, "%s: marginal_path_err_recheck_gap_time = %s %s", mp->alias, buff, - origin); + if (print_off_int_undef(buff, 12, + mp->marginal_path_err_recheck_gap_time) != 0) + condlog(3, "%s: marginal_path_err_recheck_gap_time = %s %s", + mp->alias, buff, origin); return 0; } @@ -939,9 +1041,10 @@ int select_marginal_path_double_failed_time(struct config *conf, struct multipat mp_set_conf(marginal_path_double_failed_time); mp_set_default(marginal_path_double_failed_time, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, mp->marginal_path_double_failed_time); - condlog(3, "%s: marginal_path_double_failed_time = %s %s", mp->alias, buff, - origin); + if (print_off_int_undef(buff, 12, mp->marginal_path_double_failed_time) + != 0) + condlog(3, "%s: marginal_path_double_failed_time = %s %s", + mp->alias, buff, origin); return 0; } @@ -993,8 +1096,8 @@ int select_ghost_delay (struct config *conf, struct multipath * mp) mp_set_conf(ghost_delay); mp_set_default(ghost_delay, DEFAULT_GHOST_DELAY); out: - print_off_int_undef(buff, 12, mp->ghost_delay); - condlog(3, "%s: ghost_delay = %s %s", mp->alias, buff, origin); + if (print_off_int_undef(buff, 12, mp->ghost_delay) != 0) + condlog(3, "%s: ghost_delay = %s %s", 
mp->alias, buff, origin); return 0; } diff --git a/libmultipath/propsel.h b/libmultipath/propsel.h index ae99b92..b352c16 100644 --- a/libmultipath/propsel.h +++ b/libmultipath/propsel.h @@ -26,6 +26,9 @@ int select_delay_watch_checks (struct config *conf, struct multipath * mp); int select_delay_wait_checks (struct config *conf, struct multipath * mp); int select_skip_kpartx (struct config *conf, struct multipath * mp); int select_max_sectors_kb (struct config *conf, struct multipath * mp); +int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp); +int select_san_path_err_threshold(struct config *conf, struct multipath *mp); +int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp); int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp); int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp); int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp); diff --git a/libmultipath/structs.h b/libmultipath/structs.h index 0a2623a..b794b0d 100644 --- a/libmultipath/structs.h +++ b/libmultipath/structs.h @@ -9,7 +9,7 @@ #include "generic.h" #define WWID_SIZE 128 -#define SERIAL_SIZE 65 +#define SERIAL_SIZE 128 #define NODE_NAME_SIZE 224 #define PATH_STR_SIZE 16 #define PARAMS_SIZE 4096 @@ -202,6 +202,7 @@ enum ghost_delay_states { }; enum initialized_states { + INIT_NEW, INIT_FAILED, INIT_MISSING_UDEV, INIT_REQUESTED_UDEV, @@ -280,6 +281,10 @@ struct path { int initialized; int retriggers; int wwid_changed; + unsigned int path_failures; + time_t dis_reinstate_time; + int disable_reinstate; + int san_path_err_forget_rate; time_t io_err_dis_reinstate_time; int io_err_disable_reinstate; int io_err_pathfail_cnt; @@ -318,6 +323,9 @@ struct multipath { int deferred_remove; int delay_watch_checks; int delay_wait_checks; + int san_path_err_threshold; + int san_path_err_forget_rate; + int san_path_err_recovery_time; int 
marginal_path_err_sample_time; int marginal_path_err_rate_threshold; int marginal_path_err_recheck_gap_time; @@ -370,6 +378,27 @@ struct multipath { struct gen_multipath generic_mp; }; +static inline int marginal_path_check_enabled(const struct multipath *mpp) +{ + return mpp->marginal_path_double_failed_time > 0 && + mpp->marginal_path_err_sample_time > 0 && + mpp->marginal_path_err_recheck_gap_time > 0 && + mpp->marginal_path_err_rate_threshold >= 0; +} + +static inline int san_path_check_enabled(const struct multipath *mpp) +{ + return mpp->san_path_err_threshold > 0 && + mpp->san_path_err_forget_rate > 0 && + mpp->san_path_err_recovery_time > 0; +} + +static inline int delay_check_enabled(const struct multipath *mpp) +{ + return mpp->delay_watch_checks != NU_NO || + mpp->delay_wait_checks != NU_NO; +} + struct pathgroup { long id; int status; diff --git a/libmultipath/structs_vec.c b/libmultipath/structs_vec.c index c85823a..db5d19d 100644 --- a/libmultipath/structs_vec.c +++ b/libmultipath/structs_vec.c @@ -18,6 +18,7 @@ #include "configure.h" #include "libdevmapper.h" #include "io_err_stat.h" +#include "switchgroup.h" /* * creates or updates mpp->paths reading mpp->pg @@ -60,6 +61,12 @@ int adopt_paths(vector pathvec, struct multipath *mpp) vector_foreach_slot (pathvec, pp, i) { if (!strncmp(mpp->wwid, pp->wwid, WWID_SIZE)) { + if (pp->size != 0 && mpp->size != 0 && + pp->size != mpp->size) { + condlog(3, "%s: size mismatch for %s, not adding path", + pp->dev, mpp->alias); + continue; + } condlog(3, "%s: ownership set to %s", pp->dev, mpp->alias); pp->mpp = mpp; @@ -96,14 +103,14 @@ void orphan_path(struct path *pp, const char *reason) pp->fd = -1; } -void orphan_paths(vector pathvec, struct multipath *mpp) +void orphan_paths(vector pathvec, struct multipath *mpp, const char *reason) { int i; struct path * pp; vector_foreach_slot (pathvec, pp, i) { if (pp->mpp == mpp) { - orphan_path(pp, "map flushed"); + orphan_path(pp, reason); } } } @@ -113,12 +120,10 @@ 
remove_map(struct multipath * mpp, struct vectors * vecs, int purge_vec) { int i; - condlog(4, "%s: remove multipath map", mpp->alias); - /* * clear references to this map */ - orphan_paths(vecs->pathvec, mpp); + orphan_paths(vecs->pathvec, mpp, "map removed internally"); if (purge_vec && (i = find_slot(vecs->mpvec, (void *)mpp)) != -1) @@ -134,8 +139,10 @@ void remove_map_by_alias(const char *alias, struct vectors * vecs, int purge_vec) { struct multipath * mpp = find_mp_by_alias(vecs->mpvec, alias); - if (mpp) + if (mpp) { + condlog(2, "%s: removing map by alias", alias); remove_map(mpp, vecs, purge_vec); + } } void @@ -255,6 +262,9 @@ void sync_paths(struct multipath *mpp, vector pathvec) int update_multipath_strings(struct multipath *mpp, vector pathvec, int is_daemon) { + struct pathgroup *pgp; + int i; + if (!mpp) return 1; @@ -272,6 +282,10 @@ update_multipath_strings(struct multipath *mpp, vector pathvec, int is_daemon) if (update_multipath_status(mpp)) return 1; + vector_foreach_slot(mpp->pg, pgp, i) + if (pgp->paths) + path_group_prio_update(pgp); + return 0; } @@ -407,6 +421,12 @@ int verify_paths(struct multipath *mpp, struct vectors *vecs) vector_del_slot(mpp->paths, i); i--; + /* Make sure mpp->hwe doesn't point to freed memory. 
+ * We call extract_hwe_from_path() below to restore + * mpp->hwe + */ + if (mpp->hwe == pp->hwe) + mpp->hwe = NULL; if ((j = find_slot(vecs->pathvec, (void *)pp)) != -1) vector_del_slot(vecs->pathvec, j); @@ -416,6 +436,7 @@ int verify_paths(struct multipath *mpp, struct vectors *vecs) mpp->alias, pp->dev, pp->dev_t); } } + extract_hwe_from_path(mpp); return count; } diff --git a/libmultipath/structs_vec.h b/libmultipath/structs_vec.h index f7777aa..f8b9f63 100644 --- a/libmultipath/structs_vec.h +++ b/libmultipath/structs_vec.h @@ -14,7 +14,8 @@ struct vectors { void enter_recovery_mode(struct multipath *mpp); int adopt_paths (vector pathvec, struct multipath * mpp); -void orphan_paths (vector pathvec, struct multipath * mpp); +void orphan_paths(vector pathvec, struct multipath *mpp, + const char *reason); void orphan_path (struct path * pp, const char *reason); int verify_paths(struct multipath * mpp, struct vectors * vecs); diff --git a/libmultipath/sysfs.c b/libmultipath/sysfs.c index 558c8d6..65904d7 100644 --- a/libmultipath/sysfs.c +++ b/libmultipath/sysfs.c @@ -295,11 +295,6 @@ static int select_dm_devs(const struct dirent *di) return fnmatch("dm-*", di->d_name, FNM_FILE_NAME) == 0; } -static void close_fd(void *arg) -{ - close((long)arg); -} - bool sysfs_is_multipathed(const struct path *pp) { char pathbuf[PATH_MAX]; diff --git a/libmultipath/uevent.c b/libmultipath/uevent.c index 5f910e6..f73de8c 100644 --- a/libmultipath/uevent.c +++ b/libmultipath/uevent.c @@ -806,7 +806,7 @@ int uevent_listen(struct udev *udev) monitor = udev_monitor_new_from_netlink(udev, "udev"); if (!monitor) { condlog(2, "failed to create udev monitor"); - goto out; + goto failback; } pthread_cleanup_push(monitor_cleanup, monitor); #ifdef LIBUDEV_API_RECVBUF @@ -893,8 +893,8 @@ int uevent_listen(struct udev *udev) } need_failback = 0; out: - if (monitor) - pthread_cleanup_pop(1); + pthread_cleanup_pop(1); +failback: if (need_failback) err = failback_listen(); 
pthread_cleanup_pop(1); diff --git a/libmultipath/util.c b/libmultipath/util.c index 66c4761..5b838d5 100644 --- a/libmultipath/util.c +++ b/libmultipath/util.c @@ -104,7 +104,7 @@ get_word (char * sentence, char ** word) } strncpy(*word, sentence, len); strchop(*word); - condlog(4, "*word = %s, len = %i", *word, len); + condlog(5, "*word = %s, len = %i", *word, len); if (*p == '\0') return 0; @@ -176,6 +176,7 @@ int devt2devname(char *devname, int devname_len, char *devt) if (stat("/sys/dev/block", &statbuf) == 0) { /* Newer kernels have /sys/dev/block */ sprintf(block_path,"/sys/dev/block/%u:%u", major, minor); + dev[FILE_NAME_SIZE - 1] = '\0'; if (lstat(block_path, &statbuf) == 0) { if (S_ISLNK(statbuf.st_mode) && readlink(block_path, dev, FILE_NAME_SIZE-1) > 0) { @@ -191,7 +192,8 @@ int devt2devname(char *devname, int devname_len, char *devt) return 0; } } - goto skip_proc; + condlog(4, "%s is invalid", block_path); + return 1; } memset(block_path, 0, sizeof(block_path)); @@ -220,7 +222,7 @@ int devt2devname(char *devname, int devname_len, char *devt) } } fclose(fd); -skip_proc: + if (strncmp(block_path,"/sys/block", 10)) { condlog(3, "No device found for %u:%u", major, minor); return 1; @@ -505,3 +507,8 @@ void free_scandir_result(struct scandir_result *res) FREE(res->di[i]); FREE(res->di); } + +void close_fd(void *arg) +{ + close((long)arg); +} diff --git a/libmultipath/util.h b/libmultipath/util.h index a818e29..1e0d832 100644 --- a/libmultipath/util.h +++ b/libmultipath/util.h @@ -3,6 +3,7 @@ #include #include +#include size_t strchop(char *); int basenamecpy (const char *src, char *dst, size_t size); @@ -24,6 +25,7 @@ int safe_write(int fd, const void *buf, size_t count); void set_max_fds(int max_fds); #define KERNEL_VERSION(maj, min, ptc) ((((maj) * 256) + (min)) * 256 + (ptc)) +#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) #define safe_sprintf(var, format, args...) 
\ snprintf(var, sizeof(var), format, ##args) >= sizeof(var) @@ -33,10 +35,27 @@ void set_max_fds(int max_fds); #define pthread_cleanup_push_cast(f, arg) \ pthread_cleanup_push(((void (*)(void *))&f), (arg)) +void close_fd(void *arg); + struct scandir_result { struct dirent **di; int n; }; void free_scandir_result(struct scandir_result *); +static inline bool is_bit_set_in_array(unsigned int bit, const uint64_t *arr) +{ + return arr[bit / 64] & (1ULL << (bit % 64)) ? 1 : 0; +} + +static inline void set_bit_in_array(unsigned int bit, uint64_t *arr) +{ + arr[bit / 64] |= (1ULL << (bit % 64)); +} + +static inline void clear_bit_in_array(unsigned int bit, uint64_t *arr) +{ + arr[bit / 64] &= ~(1ULL << (bit % 64)); +} + #endif /* _UTIL_H */ diff --git a/libmultipath/version.h b/libmultipath/version.h index 65d0522..f3c7a51 100644 --- a/libmultipath/version.h +++ b/libmultipath/version.h @@ -20,8 +20,8 @@ #ifndef _VERSION_H #define _VERSION_H -#define VERSION_CODE 0x000709 -#define DATE_CODE 0x0b0e12 +#define VERSION_CODE 0x000800 +#define DATE_CODE 0x020e13 #define PROG "multipath-tools" diff --git a/multipath/main.c b/multipath/main.c index 05b7bf0..5abb118 100644 --- a/multipath/main.c +++ b/multipath/main.c @@ -68,6 +68,19 @@ int logsink; struct udev *udev; struct config *multipath_conf; +/* + * Return values of configure(), print_cmd_valid(), and main(). + * RTVL_{YES,NO} are synonyms for RTVL_{OK,FAIL} for the CMD_VALID_PATH case. 
+ */ +enum { + RTVL_OK = 0, + RTVL_YES = RTVL_OK, + RTVL_FAIL = 1, + RTVL_NO = RTVL_FAIL, + RTVL_MAYBE, /* only used internally, never returned */ + RTVL_RETRY, /* returned by configure(), not by main() */ +}; + struct config *get_multipath_config(void) { return multipath_conf; @@ -319,7 +332,7 @@ static int check_usable_paths(struct config *conf, goto out; } - if (!dm_is_mpath(mapname)) { + if (dm_is_mpath(mapname) != 1) { condlog(1, "%s is not a multipath map", devpath); goto free; } @@ -375,10 +388,6 @@ enum { }; static const char shm_find_mp_dir[] = MULTIPATH_SHM_BASE "find_multipaths"; -static void close_fd(void *arg) -{ - close((long)arg); -} /** * find_multipaths_check_timeout(wwid, tmo) @@ -401,7 +410,7 @@ static int find_multipaths_check_timeout(const struct path *pp, long tmo, struct timespec now, ftimes[2], tdiff; struct stat st; long fd; - int r, err, retries = 0; + int r, retries = 0; clock_gettime(CLOCK_REALTIME, &now); @@ -421,8 +430,6 @@ retry: if (fd != -1) { pthread_cleanup_push(close_fd, (void *)fd); r = fstat(fd, &st); - if (r != 0) - err = errno; pthread_cleanup_pop(1); } else if (tmo > 0) { @@ -451,15 +458,12 @@ retry: path, strerror(errno)); } r = fstat(fd, &st); - if (r != 0) - err = errno; pthread_cleanup_pop(1); } else return FIND_MULTIPATHS_NEVER; if (r != 0) { - condlog(1, "%s: error in fstat for %s: %s", __func__, - path, strerror(err)); + condlog(1, "%s: error in fstat for %s: %m", __func__, path); return FIND_MULTIPATHS_ERROR; } @@ -475,15 +479,14 @@ retry: static int print_cmd_valid(int k, const vector pathvec, struct config *conf) { - static const int vals[] = { 1, 0, 2 }; int wait = FIND_MULTIPATHS_NEVER; struct timespec until; struct path *pp; - if (k < 0 || k >= (sizeof(vals) / sizeof(int))) - return 1; + if (k != RTVL_YES && k != RTVL_NO && k != RTVL_MAYBE) + return RTVL_NO; - if (k == 2) { + if (k == RTVL_MAYBE) { /* * Caller ensures that pathvec[0] is the path to * examine. 
@@ -493,7 +496,7 @@ static int print_cmd_valid(int k, const vector pathvec, wait = find_multipaths_check_timeout( pp, pp->find_multipaths_timeout, &until); if (wait != FIND_MULTIPATHS_WAITING) - k = 1; + k = RTVL_NO; } else if (pathvec != NULL && (pp = VECTOR_SLOT(pathvec, 0))) wait = find_multipaths_check_timeout(pp, 0, &until); if (wait == FIND_MULTIPATHS_WAITING) @@ -501,8 +504,10 @@ static int print_cmd_valid(int k, const vector pathvec, until.tv_sec, until.tv_nsec/1000); else if (wait == FIND_MULTIPATHS_WAIT_DONE) printf("FIND_MULTIPATHS_WAIT_UNTIL=\"0\"\n"); - printf("DM_MULTIPATH_DEVICE_PATH=\"%d\"\n", vals[k]); - return k == 1; + printf("DM_MULTIPATH_DEVICE_PATH=\"%d\"\n", + k == RTVL_MAYBE ? 2 : k == RTVL_YES ? 1 : 0); + /* Never return RTVL_MAYBE */ + return k == RTVL_NO ? RTVL_NO : RTVL_YES; } /* @@ -524,12 +529,6 @@ static bool released_to_systemd(void) return ret; } -/* - * Return value: - * -1: Retry - * 0: Success - * 1: Failure - */ static int configure (struct config *conf, enum mpath_cmds cmd, enum devtypes dev_type, char *devpath) @@ -537,7 +536,7 @@ configure (struct config *conf, enum mpath_cmds cmd, vector curmp = NULL; vector pathvec = NULL; struct vectors vecs; - int r = 1; + int r = RTVL_FAIL, rc; int di_flag = 0; char * refwwid = NULL; char * dev = NULL; @@ -585,21 +584,23 @@ configure (struct config *conf, enum mpath_cmds cmd, goto out; } if (cmd == CMD_REMOVE_WWID) { - r = remove_wwid(refwwid); - if (r == 0) + rc = remove_wwid(refwwid); + if (rc == 0) { printf("wwid '%s' removed\n", refwwid); - else if (r == 1) { + r = RTVL_OK; + } else if (rc == 1) { printf("wwid '%s' not in wwids file\n", refwwid); - r = 0; + r = RTVL_OK; } goto out; } if (cmd == CMD_ADD_WWID) { - r = remember_wwid(refwwid); - if (r >= 0) + rc = remember_wwid(refwwid); + if (rc >= 0) { printf("wwid '%s' added\n", refwwid); - else + r = RTVL_OK; + } else printf("failed adding '%s' to wwids file\n", refwwid); goto out; @@ -614,13 +615,13 @@ configure (struct config 
*conf, enum mpath_cmds cmd, */ if (cmd == CMD_VALID_PATH) { if (is_failed_wwid(refwwid) == WWID_IS_FAILED) { - r = 1; + r = RTVL_NO; goto print_valid; } if ((!find_multipaths_on(conf) && ignore_wwids_on(conf)) || check_wwids_file(refwwid, 0) == 0) - r = 0; + r = RTVL_YES; if (!ignore_wwids_on(conf)) goto print_valid; /* At this point, either r==0 or find_multipaths_on. */ @@ -630,7 +631,7 @@ configure (struct config *conf, enum mpath_cmds cmd, * Quick check if path is already multipathed. */ if (sysfs_is_multipathed(VECTOR_SLOT(pathvec, 0))) { - r = 0; + r = RTVL_YES; goto print_valid; } @@ -644,10 +645,10 @@ configure (struct config *conf, enum mpath_cmds cmd, * Leave DM_MULTIPATH_DEVICE_PATH="0". */ if (released) { - r = 1; + r = RTVL_NO; goto print_valid; } - if (r == 0) + if (r == RTVL_YES) goto print_valid; /* find_multipaths_on: Fall through to path detection */ } @@ -703,13 +704,12 @@ configure (struct config *conf, enum mpath_cmds cmd, * the refwwid, or there is more than one path matching * the refwwid, then the path is valid */ if (VECTOR_SIZE(curmp) != 0) { - r = 0; + r = RTVL_YES; goto print_valid; } else if (VECTOR_SIZE(pathvec) > 1) - r = 0; + r = RTVL_YES; else - /* Use r=2 as an indication for "maybe" */ - r = 2; + r = RTVL_MAYBE; /* * If opening the path with O_EXCL fails, the path @@ -739,21 +739,23 @@ configure (struct config *conf, enum mpath_cmds cmd, /* * Check if we raced with multipathd */ - r = !sysfs_is_multipathed(VECTOR_SLOT(pathvec, 0)); + r = sysfs_is_multipathed(VECTOR_SLOT(pathvec, 0)) ? + RTVL_YES : RTVL_NO; } goto print_valid; } if (cmd != CMD_CREATE && cmd != CMD_DRY_RUN) { - r = 0; + r = RTVL_OK; goto out; } /* * core logic entry point */ - r = coalesce_paths(&vecs, NULL, refwwid, + rc = coalesce_paths(&vecs, NULL, refwwid, conf->force_reload, cmd); + r = rc == CP_RETRY ? RTVL_RETRY : rc == CP_OK ? 
RTVL_OK : RTVL_FAIL; print_valid: if (cmd == CMD_VALID_PATH) @@ -854,7 +856,7 @@ main (int argc, char *argv[]) int arg; extern char *optarg; extern int optind; - int r = 1; + int r = RTVL_FAIL; enum mpath_cmds cmd = CMD_CREATE; enum devtypes dev_type = DEV_NONE; char *dev = NULL; @@ -865,7 +867,7 @@ main (int argc, char *argv[]) logsink = 0; conf = load_config(DEFAULT_CONFIGFILE); if (!conf) - exit(1); + exit(RTVL_FAIL); multipath_conf = conf; conf->retrigger_tries = 0; while ((arg = getopt(argc, argv, ":adcChl::FfM:v:p:b:BrR:itTquUwW")) != EOF ) { @@ -876,7 +878,7 @@ main (int argc, char *argv[]) if (sizeof(optarg) > sizeof(char *) || !isdigit(optarg[0])) { usage (argv[0]); - exit(1); + exit(RTVL_FAIL); } conf->verbosity = atoi(optarg); @@ -923,7 +925,7 @@ main (int argc, char *argv[]) if (conf->pgpolicy_flag == IOPOLICY_UNDEF) { printf("'%s' is not a valid policy\n", optarg); usage(argv[0]); - exit(1); + exit(RTVL_FAIL); } break; case 'r': @@ -933,14 +935,14 @@ main (int argc, char *argv[]) conf->find_multipaths |= _FIND_MULTIPATHS_I; break; case 't': - r = dump_config(conf, NULL, NULL); + r = dump_config(conf, NULL, NULL) ? 
RTVL_FAIL : RTVL_OK; goto out_free_config; case 'T': cmd = CMD_DUMP_CONFIG; break; case 'h': usage(argv[0]); - exit(0); + exit(RTVL_OK); case 'u': cmd = CMD_VALID_PATH; dev_type = DEV_UEVENT; @@ -964,20 +966,20 @@ main (int argc, char *argv[]) case ':': fprintf(stderr, "Missing option argument\n"); usage(argv[0]); - exit(1); + exit(RTVL_FAIL); case '?': fprintf(stderr, "Unknown switch: %s\n", optarg); usage(argv[0]); - exit(1); + exit(RTVL_FAIL); default: usage(argv[0]); - exit(1); + exit(RTVL_FAIL); } } if (getuid() != 0) { fprintf(stderr, "need to be root\n"); - exit(1); + exit(RTVL_FAIL); } if (optind < argc) { @@ -1015,7 +1017,8 @@ main (int argc, char *argv[]) /* Failing here is non-fatal */ init_foreign(conf->multipath_dir); if (cmd == CMD_USABLE_PATHS) { - r = check_usable_paths(conf, dev, dev_type); + r = check_usable_paths(conf, dev, dev_type) ? + RTVL_FAIL : RTVL_OK; goto out; } if (cmd == CMD_VALID_PATH && @@ -1031,7 +1034,7 @@ main (int argc, char *argv[]) if (fd == -1) { condlog(3, "%s: daemon is not running", dev); if (!systemd_service_enabled(dev)) { - r = print_cmd_valid(1, NULL, conf); + r = print_cmd_valid(RTVL_NO, NULL, conf); goto out; } } else @@ -1045,9 +1048,9 @@ main (int argc, char *argv[]) switch(delegate_to_multipathd(cmd, dev, dev_type, conf)) { case DELEGATE_OK: - exit(0); + exit(RTVL_OK); case DELEGATE_ERROR: - exit(1); + exit(RTVL_FAIL); case NOT_DELEGATED: break; } @@ -1063,8 +1066,8 @@ main (int argc, char *argv[]) goto out; } if (dm_get_maps(curmp) == 0) - r = replace_wwids(curmp); - if (r == 0) + r = replace_wwids(curmp) ? RTVL_FAIL : RTVL_OK; + if (r == RTVL_OK) printf("successfully reset wwids\n"); vector_foreach_slot_backwards(curmp, mpp, i) { vector_del_slot(curmp, i); @@ -1077,17 +1080,18 @@ main (int argc, char *argv[]) retries = conf->remove_retries; if (conf->remove == FLUSH_ONE) { if (dev_type == DEV_DEVMAP) { - r = dm_suspend_and_flush_map(dev, retries); + r = dm_suspend_and_flush_map(dev, retries) ? 
+ RTVL_FAIL : RTVL_OK; } else condlog(0, "must provide a map name to remove"); goto out; } else if (conf->remove == FLUSH_ALL) { - r = dm_flush_maps(retries); + r = dm_flush_maps(retries) ? RTVL_FAIL : RTVL_OK; goto out; } - while ((r = configure(conf, cmd, dev_type, dev)) < 0) + while ((r = configure(conf, cmd, dev_type, dev)) == RTVL_RETRY) condlog(3, "restart multipath configuration process"); out: @@ -1102,8 +1106,8 @@ out: * multipath -u must exit with status 0, otherwise udev won't * import its output. */ - if (cmd == CMD_VALID_PATH && dev_type == DEV_UEVENT && r == 1) - r = 0; + if (cmd == CMD_VALID_PATH && dev_type == DEV_UEVENT && r == RTVL_NO) + r = RTVL_OK; if (dev_type == DEV_UEVENT) closelog(); diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5 index 6333366..0fe8461 100644 --- a/multipath/multipath.conf.5 +++ b/multipath/multipath.conf.5 @@ -334,6 +334,10 @@ priority provided as argument. Requires prio_args keyword. Generate the path priority based on a latency algorithm. Requires prio_args keyword. .TP +.I ana +(Hardware-dependent) +Generate the path priority based on the NVMe ANA settings. +.TP .I datacore (Hardware-dependent) Generate the path priority for some DataCore storage arrays. Requires prio_args @@ -891,6 +895,46 @@ The default is: \fB/etc/multipath/conf.d/\fR . . .TP +.B san_path_err_threshold +If set to a value greater than 0, multipathd will watch paths and check how many +times a path has been failed due to errors. If the number of failures on a particular +path is greater than the san_path_err_threshold, then the path will not reinstate +till san_path_err_recovery_time. These path failures should occur within a +san_path_err_forget_rate checks, if not we will consider the path is good enough +to reinstate. See "Shaky paths detection" below. +.RS +.TP +The default is: \fBno\fR +.RE +. +.
+.TP +.B san_path_err_forget_rate +If set to a value greater than 0, multipathd will check whether the path failures +have exceeded the san_path_err_threshold within this many checks i.e. +san_path_err_forget_rate. If so we will not reinstate the path till +san_path_err_recovery_time. See "Shaky paths detection" below. +.RS +.TP +The default is: \fBno\fR +.RE +. +. +.TP +.B san_path_err_recovery_time +If set to a value greater than 0, multipathd will make sure that when path failures +have exceeded the san_path_err_threshold within san_path_err_forget_rate then the path +will be placed in failed state for san_path_err_recovery_time duration. Once san_path_err_recovery_time +has timed out we will reinstate the failed path. +san_path_err_recovery_time value should be in secs. +See "Shaky paths detection" below. +.RS +.TP +The default is: \fBno\fR +.RE +. +. +.TP .B marginal_path_double_failed_time One of the four parameters of supporting path check based on accounting IO error such as intermittent error. When a path failed event occurs twice in @@ -898,7 +942,7 @@ error such as intermittent error. When a path failed event occurs twice in other three parameters are set, multipathd will fail the path and enqueue this path into a queue of which members are sent a couple of continuous direct reading asynchronous IOs at a fixed sample rate of 10HZ to start IO -error accounting process. +error accounting process. See "Shaky paths detection" below. .RS .TP The default is: \fBno\fR @@ -920,7 +964,7 @@ If the rate of IO error on a particular path is greater than the \fImarginal_path_err_recheck_gap_time\fR seconds unless there is only one active path. After \fImarginal_path_err_recheck_gap_time\fR expires, the path will be requeueed for rechecking. If checking result is good enough, the -path will be reinstated. +path will be reinstated. See "Shaky paths detection" below.
.RS .TP The default is: \fBno\fR @@ -934,7 +978,7 @@ of supporting path check based on accounting IO error such as intermittent error. Refer to \fImarginal_path_err_sample_time\fR. If the rate of IO errors on a particular path is greater than this parameter, then the path will not reinstate for \fImarginal_path_err_recheck_gap_time\fR seconds unless there is -only one active path. +only one active path. See "Shaky paths detection" below. .RS .TP The default is: \fBno\fR @@ -951,7 +995,7 @@ value, the failed path of which the IO error rate is larger than \fImarginal_path_err_recheck_gap_time\fR seconds. When \fImarginal_path_err_recheck_gap_time\fR seconds expires, the path will be requeueed for checking. If checking result is good enough, the path will be -reinstated, or else it will keep failed. +reinstated, or else it will keep failed. See "Shaky paths detection" below. .RS .TP The default is: \fBno\fR @@ -963,7 +1007,7 @@ The default is: \fBno\fR If set to a value greater than 0, multipathd will watch paths that have recently become valid for this many checks. If they fail again while they are being watched, when they next become valid, they will not be used until they -have stayed up for \fIdelay_wait_checks\fR checks. +have stayed up for \fIdelay_wait_checks\fR checks. See "Shaky paths detection" below. .RS .TP The default is: \fBno\fR @@ -975,7 +1019,7 @@ The default is: \fBno\fR If set to a value greater than 0, when a device that has recently come back online fails again within \fIdelay_watch_checks\fR checks, the next time it comes back online, it will marked and delayed, and not used until it has passed -\fIdelay_wait_checks\fR checks. +\fIdelay_wait_checks\fR checks. See "Shaky paths detection" below. .RS .TP The default is: \fBno\fR @@ -1174,7 +1218,7 @@ Regular expression matching the device nodes to be excluded/included. 
.RS .PP The default \fIblacklist\fR consists of the regular expressions -"^(ram|raw|loop|fd|md|dm-|sr|scd|st|dcssblk)[0-9]" and +"^(ram|zram|raw|loop|fd|md|dm-|sr|scd|st|dcssblk)[0-9]" and "^(td|hd|vd)[a-z]". This causes virtual devices, non-disk devices, and some other device types to be excluded from multipath handling by default. .RE @@ -1297,6 +1341,12 @@ section: .TP .B deferred_remove .TP +.B san_path_err_threshold +.TP +.B san_path_err_forget_rate +.TP +.B san_path_err_recovery_time +.TP .B marginal_path_err_sample_time .TP .B marginal_path_err_rate_threshold @@ -1391,6 +1441,10 @@ Active/Standby mode exclusively. .I 1 alua (Hardware-dependent) Hardware handler for SCSI-3 ALUA compatible arrays. +.TP +.I 1 ana +(Hardware-dependent) +Hardware handler for NVMe ANA compatible arrays. .PP The default is: \fB\fR .PP @@ -1448,6 +1502,12 @@ section: .TP .B deferred_remove .TP +.B san_path_err_threshold +.TP +.B san_path_err_forget_rate +.TP +.B san_path_err_recovery_time +.TP .B marginal_path_err_sample_time .TP .B marginal_path_err_rate_threshold @@ -1524,6 +1584,12 @@ the values are taken from the \fIdevices\fR or \fIdefaults\fR sections: .TP .B deferred_remove .TP +.B san_path_err_threshold +.TP +.B san_path_err_forget_rate +.TP +.B san_path_err_recovery_time +.TP .B marginal_path_err_sample_time .TP .B marginal_path_err_rate_threshold @@ -1578,6 +1644,69 @@ are present multipath will try to use the sysfs attribute . . .\" ---------------------------------------------------------------------------- +.SH "Shaky paths detection" +.\" ---------------------------------------------------------------------------- +. +A common problem in SAN setups is the occurrence of intermittent errors: a +path is unreachable, then reachable again for a short time, disappears again, +and so forth. This happens typically on unstable interconnects. It is +undesirable to switch pathgroups unnecessarily on such frequent, unreliable +events.
\fImultipathd\fR supports three different methods for detecting this +situation and dealing with it. All methods share the same basic mode of +operation: If a path is found to be \(dqshaky\(dq or \(dqflipping\(dq, +and appears to be in healthy status, it is not reinstated (put back to use) +immediately. Instead, it is watched for some time, and only reinstated +if the healthy state appears to be stable. The logic of determining +\(dqshaky\(dq condition, as well as the logic when to reinstate, +differs between the three methods. +.TP 8 +.B \(dqdelay_checks\(dq failure tracking +If a path fails again within a +\fIdelay_watch_checks\fR interval after a failure, don't +reinstate it until it passes a \fIdelay_wait_checks\fR interval +in always good status. +The intervals are measured in \(dqticks\(dq, i.e. the +time between path checks by multipathd, which is variable and controlled by the +\fIpolling_interval\fR and \fImax_polling_interval\fR parameters. +.TP +.B \(dqmarginal_path\(dq failure tracking +If a second failure event (good->bad transition) occurs within +\fImarginal_path_double_failed_time\fR seconds after a failure, high-frequency +monitoring is started for the affected path: I/O is sent at a rate of 10 per +second. This is done for \fImarginal_path_err_sample_time\fR seconds. During +this period, the path is not reinstated. If the +rate of errors remains below \fImarginal_path_err_rate_threshold\fR during the +monitoring period, the path is reinstated. Otherwise, it +is kept in failed state for \fImarginal_path_err_recheck_gap_time\fR, and +after that, it is monitored again. For this method, time intervals are measured +in seconds. +.TP +.B \(dqsan_path_err\(dq failure tracking +multipathd counts path failures for each path. Once the number of failures +exceeds the value given by \fIsan_path_err_threshold\fR, the path is not +reinstated for \fIsan_path_err_recovery_time\fR seconds.
While counting +failures, multipathd \(dqforgets\(dq one past failure every +\(dqsan_path_err_forget_rate\(dq ticks; thus if errors don't occur more +often than once in the forget rate interval, the failure count doesn't +increase and the threshold is never reached. As for the \fIdelay_xy\fR method, +the forget rate interval is measured in \(dqticks\(dq. +. +.RS 8 +.LP +This method is \fBdeprecated\fR in favor of the \(dqmarginal_path\(dq failure +tracking method, and only offered for backward compatibility. +. +.RE +.LP +See the documentation +of the individual options above for details. +It is \fBstrongly discouraged\fR to use more than one of these methods for any +given multipath map, because the two concurrent methods may interact in +unpredictable ways. If the \(dqmarginal_path\(dq method is active, the +\(dqsan_path_err\(dq parameters are implicitly set to 0. +. +. +.\" ---------------------------------------------------------------------------- .SH "KNOWN ISSUES" .\" ---------------------------------------------------------------------------- .
diff --git a/multipathd/cli.c b/multipathd/cli.c index a75afe3..ca176a9 100644 --- a/multipathd/cli.c +++ b/multipathd/cli.c @@ -13,7 +13,9 @@ #include "version.h" #include +#include "mpath_cmd.h" #include "cli.h" +#include "debug.h" static vector keys; static vector handlers; diff --git a/multipathd/cli.h b/multipathd/cli.h index 7cc7e4b..f3fa077 100644 --- a/multipathd/cli.h +++ b/multipathd/cli.h @@ -96,6 +96,12 @@ enum { do { \ if ((a)) { \ char *tmp = (r); \ + \ + if (m >= MAX_REPLY_LEN) { \ + condlog(1, "Warning: max reply length exceeded"); \ + free(tmp); \ + r = NULL; \ + } \ (r) = REALLOC((r), (m) * 2); \ if ((r)) { \ memset((r) + (m), 0, (m)); \ diff --git a/multipathd/cli_handlers.c b/multipathd/cli_handlers.c index a0d57a5..f95813e 100644 --- a/multipathd/cli_handlers.c +++ b/multipathd/cli_handlers.c @@ -26,6 +26,7 @@ #include "prkey.h" #include "propsel.h" #include "main.h" +#include "mpath_cmd.h" #include "cli.h" #include "uevent.h" #include "foreign.h" @@ -346,6 +347,8 @@ cli_list_path (void * v, char ** reply, int * len, void * data) condlog(3, "%s: list path (operator)", param); pp = find_path_by_dev(vecs->pathvec, param); + if (!pp) + return 1; return show_path(reply, len, vecs, pp, "%o"); } @@ -803,7 +806,8 @@ cli_add_map (void * v, char ** reply, int * len, void * data) vecs->pathvec, &refwwid); if (refwwid) { if (coalesce_paths(vecs, NULL, refwwid, - FORCE_RELOAD_NONE, CMD_NONE)) + FORCE_RELOAD_NONE, CMD_NONE) + != CP_OK) condlog(2, "%s: coalesce_paths failed", param); dm_lib_release(); @@ -892,7 +896,7 @@ int resize_map(struct multipath *mpp, unsigned long long size, } mpp->action = ACT_RESIZE; mpp->force_udev_reload = 1; - if (domap(mpp, params, 1) <= 0) { + if (domap(mpp, params, 1) == DOMAP_FAIL) { condlog(0, "%s: failed to resize map : %s", mpp->alias, strerror(errno)); mpp->size = orig_size; diff --git a/multipathd/dmevents.c b/multipathd/dmevents.c index 31e64a7..0034892 100644 --- a/multipathd/dmevents.c +++ b/multipathd/dmevents.c @@ 
-168,7 +168,9 @@ static int dm_get_events(void) while (names->dev) { uint32_t event_nr; - if (!dm_is_mpath(names->name)) + /* Don't delete device if dm_is_mpath() fails without + * checking the device type */ + if (dm_is_mpath(names->name) == 0) goto next; event_nr = dm_event_nr(names); @@ -204,7 +206,9 @@ int watch_dmevents(char *name) struct dev_event *dev_evt, *old_dev_evt; int i; - if (!dm_is_mpath(name)) { + /* We know that this is a multipath device, so only fail if + * device-mapper tells us that we're wrong */ + if (dm_is_mpath(name) == 0) { condlog(0, "%s: not a multipath device. can't watch events", name); return -1; diff --git a/multipathd/main.c b/multipathd/main.c index cc555bb..fb520b6 100644 --- a/multipathd/main.c +++ b/multipathd/main.c @@ -92,7 +92,8 @@ static int use_watchdog; #define LOG_MSG(lvl, verb, pp) \ do { \ - if (lvl <= verb) { \ + if (pp->mpp && checker_selected(&pp->checker) && \ + lvl <= verb) { \ if (pp->offline) \ condlog(lvl, "%s: %s - path offline", \ pp->mpp->alias, pp->dev); \ @@ -242,10 +243,11 @@ int set_config_state(enum daemon_status state) else if (running_state != DAEMON_IDLE) { struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - ts.tv_sec += 1; - rc = pthread_cond_timedwait(&config_cond, - &config_lock, &ts); + if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) { + ts.tv_sec += 1; + rc = pthread_cond_timedwait(&config_cond, + &config_lock, &ts); + } } if (!rc) { running_state = state; @@ -334,6 +336,7 @@ remove_map_and_stop_waiter(struct multipath *mpp, struct vectors *vecs) { /* devices are automatically removed by the dmevent polling code, * so they don't need to be manually removed here */ + condlog(3, "%s: removing map from internal tables", mpp->alias); if (!poll_dmevents) stop_waiter_thread(mpp, vecs); remove_map(mpp, vecs, PURGE_VEC); @@ -491,13 +494,12 @@ retry: verify_paths(mpp, vecs); mpp->action = ACT_RELOAD; - extract_hwe_from_path(mpp); if (setup_map(mpp, params, PARAMS_SIZE, vecs)) { condlog(0, "%s: 
failed to setup new map in update", mpp->alias); retries = -1; goto fail; } - if (domap(mpp, params, 1) <= 0 && retries-- > 0) { + if (domap(mpp, params, 1) == DOMAP_FAIL && retries-- > 0) { condlog(0, "%s: map_udate sleep", mpp->alias); sleep(1); goto retry; @@ -654,7 +656,7 @@ flush_map(struct multipath * mpp, struct vectors * vecs, int nopaths) condlog(2, "%s: map flushed", mpp->alias); } - orphan_paths(vecs->pathvec, mpp); + orphan_paths(vecs->pathvec, mpp, "map flushed"); remove_map_and_stop_waiter(mpp, vecs); return 0; @@ -700,7 +702,7 @@ ev_add_map (char * dev, const char * alias, struct vectors * vecs) int delayed_reconfig, reassign_maps; struct config *conf; - if (!dm_is_mpath(alias)) { + if (dm_is_mpath(alias) != 1) { condlog(4, "%s: not a multipath map", alias); return 0; } @@ -786,7 +788,6 @@ uev_remove_map (struct uevent * uev, struct vectors * vecs) goto out; } - orphan_paths(vecs->pathvec, mpp); remove_map_and_stop_waiter(mpp, vecs); out: lock_cleanup_pop(vecs->lock); @@ -925,6 +926,14 @@ ev_add_path (struct path * pp, struct vectors * vecs, int need_do_map) goto fail; /* leave path added to pathvec */ } mpp = find_mp_by_wwid(vecs->mpvec, pp->wwid); + if (mpp && pp->size && mpp->size != pp->size) { + condlog(0, "%s: failed to add new path %s, device size mismatch", mpp->alias, pp->dev); + int i = find_slot(vecs->pathvec, (void *)pp); + if (i != -1) + vector_del_slot(vecs->pathvec, i); + free_path(pp); + return 1; + } if (mpp && mpp->wait_for_udev && (pathcount(mpp, PATH_UP) > 0 || (pathcount(mpp, PATH_GHOST) > 0 && pp->tpgs != TPGS_IMPLICIT && @@ -940,17 +949,6 @@ ev_add_path (struct path * pp, struct vectors * vecs, int need_do_map) pp->mpp = mpp; rescan: if (mpp) { - if (pp->size && mpp->size != pp->size) { - condlog(0, "%s: failed to add new path %s, " - "device size mismatch", - mpp->alias, pp->dev); - int i = find_slot(vecs->pathvec, (void *)pp); - if (i != -1) - vector_del_slot(vecs->pathvec, i); - free_path(pp); - return 1; - } - 
condlog(4,"%s: adopting all paths for path %s", mpp->alias, pp->dev); if (adopt_paths(vecs->pathvec, mpp)) @@ -958,7 +956,6 @@ rescan: verify_paths(mpp, vecs); mpp->action = ACT_RELOAD; - extract_hwe_from_path(mpp); } else { if (!should_multipath(pp, vecs->pathvec, vecs->mpvec)) { orphan_path(pp, "only one path"); @@ -998,15 +995,14 @@ rescan: /* * reload the map for the multipath mapped device */ -retry: ret = domap(mpp, params, 1); - if (ret <= 0) { - if (ret < 0 && retries-- > 0) { - condlog(0, "%s: retry domap for addition of new " - "path %s", mpp->alias, pp->dev); - sleep(1); - goto retry; - } + while (ret == DOMAP_RETRY && retries-- > 0) { + condlog(0, "%s: retry domap for addition of new " + "path %s", mpp->alias, pp->dev); + sleep(1); + ret = domap(mpp, params, 1); + } + if (ret == DOMAP_FAIL || ret == DOMAP_RETRY) { condlog(0, "%s: failed in domap for addition of new " "path %s", mpp->alias, pp->dev); /* @@ -1157,7 +1153,7 @@ ev_remove_path (struct path *pp, struct vectors * vecs, int need_do_map) * reload the map */ mpp->action = ACT_RELOAD; - if (domap(mpp, params, 1) <= 0) { + if (domap(mpp, params, 1) == DOMAP_FAIL) { condlog(0, "%s: failed in domap for " "removal of path %s", mpp->alias, pp->dev); @@ -1839,6 +1835,94 @@ int update_path_groups(struct multipath *mpp, struct vectors *vecs, int refresh) return 0; } +static int check_path_reinstate_state(struct path * pp) { + struct timespec curr_time; + + /* + * This function is only called when the path state changes + * from "bad" to "good". pp->state reflects the *previous* state. + * If this was "bad", we know that a failure must have occured + * beforehand, and count that. + * Note that we count path state _changes_ this way. If a path + * remains in "bad" state, failure count is not increased. 
+ */ + + if (!((pp->mpp->san_path_err_threshold > 0) && + (pp->mpp->san_path_err_forget_rate > 0) && + (pp->mpp->san_path_err_recovery_time >0))) { + return 0; + } + + if (pp->disable_reinstate) { + /* If we don't know how much time has passed, automatically + * reinstate the path, just to be safe. Also, if there are + * no other usable paths, reinstate the path + */ + if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 || + pp->mpp->nr_active == 0) { + condlog(2, "%s : reinstating path early", pp->dev); + goto reinstate_path; + } + if ((curr_time.tv_sec - pp->dis_reinstate_time ) > pp->mpp->san_path_err_recovery_time) { + condlog(2,"%s : reinstate the path after err recovery time", pp->dev); + goto reinstate_path; + } + return 1; + } + /* forget errors on a working path */ + if ((pp->state == PATH_UP || pp->state == PATH_GHOST) && + pp->path_failures > 0) { + if (pp->san_path_err_forget_rate > 0){ + pp->san_path_err_forget_rate--; + } else { + /* for every san_path_err_forget_rate number of + * successful path checks decrement path_failures by 1 + */ + pp->path_failures--; + pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate; + } + return 0; + } + + /* If the path isn't recovering from a failed state, do nothing */ + if (pp->state != PATH_DOWN && pp->state != PATH_SHAKY && + pp->state != PATH_TIMEOUT) + return 0; + + if (pp->path_failures == 0) + pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate; + + pp->path_failures++; + + /* if we don't know the currently time, we don't know how long to + * delay the path, so there's no point in checking if we should + */ + + if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0) + return 0; + /* when path failures has exceeded the san_path_err_threshold + * place the path in delayed state till san_path_err_recovery_time + * so that the cutomer can rectify the issue within this time. 
After + * the completion of san_path_err_recovery_time it should + * automatically reinstate the path + */ + if (pp->path_failures > pp->mpp->san_path_err_threshold) { + condlog(2, "%s : hit error threshold. Delaying path reinstatement", pp->dev); + pp->dis_reinstate_time = curr_time.tv_sec; + pp->disable_reinstate = 1; + + return 1; + } else { + return 0; + } + +reinstate_path: + pp->path_failures = 0; + pp->disable_reinstate = 0; + pp->san_path_err_forget_rate = 0; + return 0; +} + /* * Returns '1' if the path has been checked, '-1' if it was blacklisted * and '0' otherwise @@ -1909,6 +1993,16 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) pp->tick = checkint; newstate = path_offline(pp); + if (newstate == PATH_UP) { + conf = get_multipath_config(); + pthread_cleanup_push(put_multipath_config, conf); + newstate = get_state(pp, conf, 1, newstate); + pthread_cleanup_pop(1); + } else { + checker_clear_message(&pp->checker); + condlog(3, "%s: state %s, checker not called", + pp->dev, checker_state_name(newstate)); + } /* * Wait for uevent for removed paths; * some LLDDs like zfcp keep paths unavailable @@ -1917,14 +2011,6 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) if (newstate == PATH_REMOVED) newstate = PATH_DOWN; - if (newstate == PATH_UP) { - conf = get_multipath_config(); - pthread_cleanup_push(put_multipath_config, conf); - newstate = get_state(pp, conf, 1, newstate); - pthread_cleanup_pop(1); - } else - checker_clear_message(&pp->checker); - if (pp->wwid_changed) { condlog(2, "%s: path wwid has changed. 
Refusing to use", pp->dev); @@ -1932,7 +2018,8 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) } if (newstate == PATH_WILD || newstate == PATH_UNCHECKED) { - condlog(2, "%s: unusable path - checker failed", pp->dev); + condlog(2, "%s: unusable path (%s) - checker failed", + pp->dev, checker_state_name(newstate)); LOG_MSG(2, verbosity, pp); conf = get_multipath_config(); pthread_cleanup_push(put_multipath_config, conf); @@ -1941,7 +2028,9 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) return 1; } if (!pp->mpp) { - if (!strlen(pp->wwid) && pp->initialized == INIT_FAILED && + if (!strlen(pp->wwid) && + (pp->initialized == INIT_FAILED || + pp->initialized == INIT_NEW) && (newstate == PATH_UP || newstate == PATH_GHOST)) { condlog(2, "%s: add missing path", pp->dev); conf = get_multipath_config(); @@ -1984,6 +2073,12 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) if (!pp->mpp) return 0; + if ((newstate == PATH_UP || newstate == PATH_GHOST) && + check_path_reinstate_state(pp)) { + pp->state = PATH_DELAYED; + return 1; + } + if (pp->io_err_disable_reinstate && hit_io_err_recheck_time(pp)) { pp->state = PATH_SHAKY; /* @@ -2181,6 +2276,7 @@ checkerloop (void *ap) unsigned int i; struct timespec last_time; struct config *conf; + int foreign_tick = 0; pthread_cleanup_push(rcu_unregister, NULL); rcu_register_thread(); @@ -2264,7 +2360,7 @@ checkerloop (void *ap) if (num_paths) { unsigned int max_checkint; - condlog(3, "checked %d path%s in %lu.%06lu secs", + condlog(4, "checked %d path%s in %lu.%06lu secs", num_paths, num_paths > 1 ? 
"s" : "", diff_time.tv_sec, diff_time.tv_nsec / 1000); @@ -2278,7 +2374,15 @@ checkerloop (void *ap) diff_time.tv_sec); } } - check_foreign(); + + if (foreign_tick == 0) { + conf = get_multipath_config(); + foreign_tick = conf->max_checkint; + put_multipath_config(conf); + } + if (--foreign_tick == 0) + check_foreign(); + post_config_state(DAEMON_IDLE); conf = get_multipath_config(); strict_timing = conf->strict_timing; @@ -2369,7 +2473,7 @@ configure (struct vectors * vecs) ret = coalesce_paths(vecs, mpvec, NULL, force_reload, CMD_NONE); if (force_reload == FORCE_RELOAD_WEAK) force_reload = FORCE_RELOAD_YES; - if (ret) { + if (ret != CP_OK) { condlog(0, "configure failed while coalescing paths"); goto fail; } diff --git a/tests/Makefile b/tests/Makefile index b37b502..ef90086 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -26,7 +26,11 @@ hwtable-test_LIBDEPS := -ludev -lpthread -ldl blacklist-test_OBJDEPS := ../libmultipath/blacklist.o blacklist-test_LIBDEPS := -ludev -%.out: %-test +lib/libchecktur.so: + mkdir lib + ln -t lib ../libmultipath/{checkers,prioritizers,foreign}/*.so + +%.out: %-test lib/libchecktur.so @echo == running $< == @LD_LIBRARY_PATH=$(multipathdir):$(mpathcmddir) ./$< >$@ @@ -34,6 +38,7 @@ OBJS = $(TESTS:%=%.o) test-lib.o clean: dep_clean $(RM) $(TESTS:%=%-test) $(TESTS:%=%.out) $(OBJS) + $(RM) -rf lib .SECONDARY: $(OBJS) diff --git a/tests/blacklist.c b/tests/blacklist.c index a55c1c0..54d568f 100644 --- a/tests/blacklist.c +++ b/tests/blacklist.c @@ -267,7 +267,8 @@ static void test_property_blacklist(void **state) static struct udev_device udev = { "sdb", { "ID_FOO", "ID_WWN", "ID_BAR", NULL } }; conf.blist_property = blist_property_wwn; expect_condlog(3, "sdb: udev property ID_WWN blacklisted\n"); - assert_int_equal(filter_property(&conf, &udev), MATCH_PROPERTY_BLIST); + assert_int_equal(filter_property(&conf, &udev, 3), + MATCH_PROPERTY_BLIST); } /* the property check works different in that you check all the property @@ -280,7 
+281,7 @@ static void test_property_whitelist(void **state) static struct udev_device udev = { "sdb", { "ID_FOO", "ID_WWN", "ID_BAR", NULL } }; conf.elist_property = blist_property_wwn; expect_condlog(3, "sdb: udev property ID_WWN whitelisted\n"); - assert_int_equal(filter_property(&conf, &udev), + assert_int_equal(filter_property(&conf, &udev, 3), MATCH_PROPERTY_BLIST_EXCEPT); } @@ -289,7 +290,7 @@ static void test_property_missing(void **state) static struct udev_device udev = { "sdb", { "ID_FOO", "ID_BAZ", "ID_BAR", NULL } }; conf.blist_property = blist_property_wwn; expect_condlog(3, "sdb: blacklisted, udev property missing\n"); - assert_int_equal(filter_property(&conf, &udev), + assert_int_equal(filter_property(&conf, &udev, 3), MATCH_PROPERTY_BLIST_MISSING); } diff --git a/tests/hwtable.c b/tests/hwtable.c index 9146ecc..ad863b0 100644 --- a/tests/hwtable.c +++ b/tests/hwtable.c @@ -24,8 +24,8 @@ #include "pgpolicies.h" #include "test-lib.h" #include "print.h" +#include "util.h" -#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) #define N_CONF_FILES 2 static const char tmplate[] = "/tmp/hwtable-XXXXXX"; @@ -250,14 +250,19 @@ static void write_defaults(const struct hwt_state *hwt) static struct key_value defaults[] = { { "config_dir", NULL }, { "bindings_file", NULL }, + { "multipath_dir", NULL }, { "detect_prio", "no" }, { "detect_checker", "no" }, }; char buf[sizeof(tmplate) + sizeof(bindings_name)]; + char dirbuf[PATH_MAX]; snprintf(buf, sizeof(buf), "%s/%s", hwt->tmpname, bindings_name); defaults[0].value = hwt->dirname; defaults[1].value = buf; + assert_ptr_not_equal(getcwd(dirbuf, sizeof(dirbuf)), NULL); + strncat(dirbuf, "/lib", sizeof(dirbuf)); + defaults[2].value = dirbuf; write_section(hwt->config_file, "defaults", ARRAY_SIZE(defaults), defaults); } @@ -565,7 +570,7 @@ static void test_internal_nvme(const struct hwt_state *hwt) pp = mock_path("NVME", "NoName"); mp = mock_multipath(pp); assert_ptr_not_equal(mp, NULL); - TEST_PROP(pp->checker.name, 
NONE); + TEST_PROP(checker_name(&pp->checker), NONE); TEST_PROP(pp->uid_attribute, "ID_WWN"); assert_int_equal(mp->pgpolicy, DEFAULT_PGPOLICY); assert_int_equal(mp->no_path_retry, DEFAULT_NO_PATH_RETRY); @@ -578,7 +583,7 @@ static void test_internal_nvme(const struct hwt_state *hwt) default_wwid_1); mp = mock_multipath(pp); assert_ptr_not_equal(mp, NULL); - TEST_PROP(pp->checker.name, NONE); + TEST_PROP(checker_name(&pp->checker), NONE); TEST_PROP(pp->uid_attribute, "ID_WWN"); assert_int_equal(mp->pgpolicy, MULTIBUS); assert_int_equal(mp->no_path_retry, NO_PATH_RETRY_QUEUE); @@ -750,31 +755,31 @@ static void test_regex_string_hwe(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* boo:baz matches kv1 */ pp = mock_path(vnd_boo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* .oo:ba. matches kv1 */ pp = mock_path(vnd__oo.value, prd_ba_.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* .foo:(bar|baz|ba\.) 
doesn't match */ pp = mock_path(vnd__oo.value, prd_ba_s.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches kv2 and kv1 */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_regex_string_hwe(void **state) @@ -807,32 +812,32 @@ static void test_regex_string_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* boo:baz matches kv1 */ pp = mock_path(vnd_boo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* .oo:ba. 
matches kv1 */ pp = mock_path(vnd__oo.value, prd_ba_.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* .oo:(bar|baz|ba\.)$ doesn't match */ pp = mock_path(vnd__oo.value, prd_ba_s.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches kv2 */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); /* Later match takes prio */ TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_regex_string_hwe_dir(void **state) @@ -863,28 +868,28 @@ static void test_regex_2_strings_hwe_dir(const struct hwt_state *hwt) TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); TEST_PROP(pp->uid_attribute, DEFAULT_UID_ATTRIBUTE); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* boo:baz doesn't match */ pp = mock_path(vnd_boo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); TEST_PROP(pp->uid_attribute, DEFAULT_UID_ATTRIBUTE); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches kv2 and kv1 */ pp = mock_path(vnd_foo.value, prd_bar.value); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, NULL); TEST_PROP(pp->uid_attribute, uid_baz.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* foo:barz matches kv3 and kv2 and kv1 */ pp = mock_path_flags(vnd_foo.value, prd_barz.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_rdac.value); TEST_PROP(pp->getuid, gui_foo.value); 
TEST_PROP(pp->uid_attribute, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_regex_2_strings_hwe_dir(void **state) @@ -921,31 +926,31 @@ static void test_string_regex_hwe_dir(const struct hwt_state *hwt) pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* foo:baz matches kv1 */ pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* boo:baz matches kv1 */ pp = mock_path(vnd_boo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* .oo:ba. 
matches kv1 */ pp = mock_path(vnd__oo.value, prd_ba_.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* .oo:(bar|baz|ba\.)$ doesn't match */ pp = mock_path(vnd__oo.value, prd_ba_s.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); } static int setup_string_regex_hwe_dir(void **state) @@ -975,13 +980,13 @@ static void test_2_ident_strings_hwe(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_strings_hwe(void **state) @@ -1010,13 +1015,13 @@ static void test_2_ident_strings_both_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_strings_both_dir(void **state) @@ -1050,13 +1055,13 @@ static void test_2_ident_strings_both_dir_w_prev(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, 
prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_strings_both_dir_w_prev(void **state) @@ -1095,13 +1100,13 @@ static void test_2_ident_strings_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_strings_hwe_dir(void **state) @@ -1129,13 +1134,13 @@ static void test_3_ident_strings_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_3_ident_strings_hwe_dir(void **state) @@ -1173,13 +1178,13 @@ static void test_2_ident_self_matching_re_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, 
prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_self_matching_re_hwe_dir(void **state) @@ -1208,13 +1213,13 @@ static void test_2_ident_self_matching_re_hwe(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_self_matching_re_hwe(void **state) @@ -1245,13 +1250,13 @@ test_2_ident_not_self_matching_re_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_baz.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); /* foo:bar matches both */ pp = mock_path_flags(vnd_foo.value, prd_bar.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_ident_not_self_matching_re_hwe_dir(void **state) @@ -1282,19 +1287,19 @@ static void test_2_matching_res_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, 
prd_bar.value); TEST_PROP(prio_name(&pp->prio), prio_emc.value); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* foo:bay matches k1 and k2 */ pp = mock_path_flags(vnd_foo.value, "bay", USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); /* foo:baz matches k2 only. */ pp = mock_path_flags(vnd_foo.value, prd_baz.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); } static int setup_2_matching_res_hwe_dir(void **state) @@ -1323,12 +1328,12 @@ static void test_2_nonmatching_res_hwe_dir(const struct hwt_state *hwt) pp = mock_path(vnd_foo.value, prd_bar.value); TEST_PROP(prio_name(&pp->prio), DEFAULT_PRIO); TEST_PROP(pp->getuid, NULL); - TEST_PROP(pp->checker.name, DEFAULT_CHECKER); + TEST_PROP(checker_name(&pp->checker), DEFAULT_CHECKER); pp = mock_path_flags(vnd_foo.value, prd_baz.value, USE_GETUID); TEST_PROP(prio_name(&pp->prio), prio_hds.value); TEST_PROP(pp->getuid, gui_foo.value); - TEST_PROP(pp->checker.name, chk_hp.value); + TEST_PROP(checker_name(&pp->checker), chk_hp.value); } static int setup_2_nonmatching_res_hwe_dir(void **state) diff --git a/tests/util.c b/tests/util.c index 839effd..e6d4b9a 100644 --- a/tests/util.c +++ b/tests/util.c @@ -26,6 +26,8 @@ #include "globals.c" +#define BITARR_SZ 4 + static void test_basenamecpy_good0(void **state) { char dst[10]; @@ -139,6 +141,100 @@ static void test_basenamecpy_bad5(void **state) assert_int_equal(basenamecpy("baz/qux", NULL, sizeof(dst)), 0); } +static void test_bitmask_1(void **state) +{ + uint64_t arr[BITARR_SZ]; + int i, j, k, m, b; + + memset(arr, 0, sizeof(arr)); + + for (j = 0; j < BITARR_SZ; j++) { + for (i = 0; i 
< 64; i++) { + b = 64 * j + i; + assert(!is_bit_set_in_array(b, arr)); + set_bit_in_array(b, arr); + for (k = 0; k < BITARR_SZ; k++) { + printf("b = %d j = %d k = %d a = %"PRIx64"\n", + b, j, k, arr[k]); + if (k == j) + assert_int_equal(arr[j], 1ULL << i); + else + assert_int_equal(arr[k], 0ULL); + } + for (m = 0; m < 64; m++) + if (i == m) + assert(is_bit_set_in_array(64 * j + m, + arr)); + else + assert(!is_bit_set_in_array(64 * j + m, + arr)); + clear_bit_in_array(b, arr); + assert(!is_bit_set_in_array(b, arr)); + for (k = 0; k < BITARR_SZ; k++) + assert_int_equal(arr[k], 0ULL); + } + } +} + +static void test_bitmask_2(void **state) +{ + uint64_t arr[BITARR_SZ]; + int i, j, k, m, b; + + memset(arr, 0, sizeof(arr)); + + for (j = 0; j < BITARR_SZ; j++) { + for (i = 0; i < 64; i++) { + b = 64 * j + i; + assert(!is_bit_set_in_array(b, arr)); + set_bit_in_array(b, arr); + for (m = 0; m < 64; m++) + if (m <= i) + assert(is_bit_set_in_array(64 * j + m, + arr)); + else + assert(!is_bit_set_in_array(64 * j + m, + arr)); + assert(is_bit_set_in_array(b, arr)); + for (k = 0; k < BITARR_SZ; k++) { + if (k < j || (k == j && i == 63)) + assert_int_equal(arr[k], ~0ULL); + else if (k > j) + assert_int_equal(arr[k], 0ULL); + else + assert_int_equal( + arr[k], + (1ULL << (i + 1)) - 1); + } + } + } + for (j = 0; j < BITARR_SZ; j++) { + for (i = 0; i < 64; i++) { + b = 64 * j + i; + assert(is_bit_set_in_array(b, arr)); + clear_bit_in_array(b, arr); + for (m = 0; m < 64; m++) + if (m <= i) + assert(!is_bit_set_in_array(64 * j + m, + arr)); + else + assert(is_bit_set_in_array(64 * j + m, + arr)); + assert(!is_bit_set_in_array(b, arr)); + for (k = 0; k < BITARR_SZ; k++) { + if (k < j || (k == j && i == 63)) + assert_int_equal(arr[k], 0ULL); + else if (k > j) + assert_int_equal(arr[k], ~0ULL); + else + assert_int_equal( + arr[k], + ~((1ULL << (i + 1)) - 1)); + } + } + } +} + int test_basenamecpy(void) { const struct CMUnitTest tests[] = { @@ -156,6 +252,8 @@ int test_basenamecpy(void) 
cmocka_unit_test(test_basenamecpy_bad3), cmocka_unit_test(test_basenamecpy_bad4), cmocka_unit_test(test_basenamecpy_bad5), + cmocka_unit_test(test_bitmask_1), + cmocka_unit_test(test_bitmask_2), }; return cmocka_run_group_tests(tests, NULL, NULL); } -- 2.7.4