Imported Upstream version 0.7.4 upstream/0.7.4
authorDongHun Kwak <dh0128.kwak@samsung.com>
Fri, 14 Jan 2022 04:50:17 +0000 (13:50 +0900)
committerDongHun Kwak <dh0128.kwak@samsung.com>
Fri, 14 Jan 2022 04:50:17 +0000 (13:50 +0900)
28 files changed:
libdmmp/Makefile
libmpathpersist/mpath_persist.c
libmultipath/Makefile
libmultipath/checkers.c
libmultipath/checkers.h
libmultipath/checkers/cciss_tur.c
libmultipath/config.c
libmultipath/config.h
libmultipath/configure.c
libmultipath/dict.c
libmultipath/discovery.c
libmultipath/hwtable.c
libmultipath/io_err_stat.c [new file with mode: 0644]
libmultipath/io_err_stat.h [new file with mode: 0644]
libmultipath/prioritizers/path_latency.c
libmultipath/prioritizers/weightedpath.c
libmultipath/propsel.c
libmultipath/propsel.h
libmultipath/structs.c
libmultipath/structs.h
libmultipath/uevent.c
libmultipath/uevent.h
libmultipath/util.c
libmultipath/util.h
libmultipath/version.h
multipath/multipath.conf.5
multipathd/main.c
multipathd/multipathd.8

index cdd26ed753fa4c6f666eee9d7d8e7a1d2ef8061c..6645a1a48909ccaaa883a7d55e3ffbed321168a1 100644 (file)
@@ -54,6 +54,7 @@ uninstall:
                $(RM) $$file; \
        done
        $(RM) $(DESTDIR)$(man3dir)/libdmmp.h*
+       $(RM) $(DESTDIR)$(pkgconfdir)/$(PKGFILE)
 
 clean:
        $(RM) core *.a *.o *.gz *.so *.so.*
@@ -75,10 +76,10 @@ docs/man/$(EXTRA_MAN_FILES).gz: $(HEADERS)
                $(INSTALL_PROGRAM) -v -m 644 -D docs/$$file docs/man/$$file; \
        done
        cat $(HEADERS) | \
-           perl docs/doc-preclean.pl > $(TEMPFILE)
-       perl docs/kernel-doc -man $(TEMPFILE) | \
+           perl docs/doc-preclean.pl > "$(TEMPFILE)"
+       perl docs/kernel-doc -man "$(TEMPFILE)" | \
            perl docs/split-man.pl docs/man
-       -rm -f $(TEMPFILE)
+       -rm -f "$(TEMPFILE)"
        @for file in docs/man/*.3; do \
                gzip -f $$file; \
        done
index b5ed5567f2b3f9d03261abfaca99bb6f6792438a..84ab29329764a881f7c1df8f58e7900568d0584e 100644 (file)
@@ -339,8 +339,9 @@ int mpath_persistent_reserve_out ( int fd, int rq_servact, int rq_scope,
 
        memcpy(&prkey, paramp->sa_key, 8);
        if (mpp->prkey_source == PRKEY_SOURCE_FILE && prkey &&
-           ((!get_be64(mpp->reservation_key) && MPATH_PROUT_REG_SA) ||
-            MPATH_PROUT_REG_IGN_SA)) {
+           ((!get_be64(mpp->reservation_key) &&
+             rq_servact == MPATH_PROUT_REG_SA) ||
+            rq_servact == MPATH_PROUT_REG_IGN_SA)) {
                memcpy(&mpp->reservation_key, paramp->sa_key, 8);
                if (update_prkey(alias, get_be64(mpp->reservation_key))) {
                        condlog(0, "%s: failed to set prkey for multipathd.",
index 928bc257b798d9e0d2df33340376c4e2337dda59..6447d8dafb4ff71b357a959a972566d9e46cfacd 100644 (file)
@@ -9,7 +9,7 @@ LIBS = $(DEVLIB).$(SONAME)
 
 CFLAGS += $(LIB_CFLAGS) -I$(mpathcmddir)
 
-LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu
+LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu -laio
 
 ifdef SYSTEMD
        CFLAGS += -DUSE_SYSTEMD=$(SYSTEMD)
@@ -42,7 +42,8 @@ OBJS = memory.o parser.o vector.o devmapper.o callout.o \
        pgpolicies.o debug.o defaults.o uevent.o time-util.o \
        switchgroup.o uxsock.o print.o alias.o log_pthread.o \
        log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
-       lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o
+       lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o \
+       io_err_stat.o
 
 all: $(LIBS)
 
index cd6d6a369faffff34ee4a802f707e08cfac4d8d8..08cdfc33a6cc8c2afbbd317beb120422b1601329 100644 (file)
@@ -19,7 +19,6 @@ char *checker_state_names[] = {
        "timeout",
        "removed",
        "delayed",
-       "none",
 };
 
 static LIST_HEAD(checkers);
@@ -44,6 +43,7 @@ struct checker * alloc_checker (void)
        if (c) {
                INIT_LIST_HEAD(&c->node);
                c->refcount = 1;
+               c->fd = -1;
        }
        return c;
 }
@@ -203,6 +203,12 @@ int checker_init (struct checker * c, void ** mpctxt_addr)
        return 0;
 }
 
+/*
+ * Reset a checker to its blank state: every byte of the structure is
+ * zeroed and the file descriptor is then marked as "not open" (-1),
+ * since a plain zero fill would leave fd == 0.
+ */
+void checker_clear (struct checker *c)
+{
+       memset(c, 0, sizeof(*c));
+       c->fd = -1;
+}
+
 void checker_put (struct checker * dst)
 {
        struct checker * src;
@@ -212,7 +218,7 @@ void checker_put (struct checker * dst)
        src = checker_lookup(dst->name);
        if (dst->free)
                dst->free(dst);
-       memset(dst, 0x0, sizeof(struct checker));
+       checker_clear(dst);
        free_checker(src);
 }
 
index 713399faa9eef89445819dea4e4291c7b736b26a..52154ca061297fa4fce72e6d8d258abe29c3b9d6 100644 (file)
@@ -11,7 +11,7 @@
  *
  * PATH_WILD:
  * - Use: None of the checkers (returned if we don't have an fd)
- * - Description: Corner case where "fd <= 0" for path fd (see checker_check())
+ * - Description: Corner case where "fd < 0" for path fd (see checker_check())
  *
  * PATH_UNCHECKED:
  * - Use: Only in directio checker
@@ -128,6 +128,7 @@ void cleanup_checkers (void);
 struct checker * add_checker (char *, char *);
 struct checker * checker_lookup (char *);
 int checker_init (struct checker *, void **);
+void checker_clear (struct checker *);
 void checker_put (struct checker *);
 void checker_reset (struct checker *);
 void checker_set_sync (struct checker *);
index 9d79f965fbb22e311cfa34e86edd5d9761fa3a66..436470c7ddbdfebf86d40ab4dc12ed5689654824 100644 (file)
@@ -73,7 +73,7 @@ int libcheck_check(struct checker * c)
        LogvolInfo_struct    lvi;       // logical "volume" info
        IOCTL_Command_struct cic;       // cciss ioctl command
 
-       if ((c->fd) <= 0) {
+       if ((c->fd) < 0) {
                MSG(c,"no usable fd");
                ret = -1;
                goto out;
index ea2359acdfb7cf5dfc75de446cb8d64503db834c..eb03f0a9f4b822a7b66d8cf144579797f9181048 100644 (file)
@@ -351,9 +351,6 @@ merge_hwe (struct hwentry * dst, struct hwentry * src)
        merge_num(delay_wait_checks);
        merge_num(skip_kpartx);
        merge_num(max_sectors_kb);
-       merge_num(san_path_err_threshold);
-       merge_num(san_path_err_forget_rate);
-       merge_num(san_path_err_recovery_time);
 
        snprintf(id, sizeof(id), "%s/%s", dst->vendor, dst->product);
        reconcile_features_with_options(id, &dst->features,
index 240730b062de2aec711bd67ecfdc19de73009fe2..51fe27bd4cae7cb32e32802fbde95cf1ed394091 100644 (file)
@@ -75,9 +75,10 @@ struct hwentry {
        int deferred_remove;
        int delay_watch_checks;
        int delay_wait_checks;
-       int san_path_err_threshold;
-       int san_path_err_forget_rate;
-       int san_path_err_recovery_time;
+       int marginal_path_err_sample_time;
+       int marginal_path_err_rate_threshold;
+       int marginal_path_err_recheck_gap_time;
+       int marginal_path_double_failed_time;
        int skip_kpartx;
        int max_sectors_kb;
        char * bl_product;
@@ -107,9 +108,10 @@ struct mpentry {
        int deferred_remove;
        int delay_watch_checks;
        int delay_wait_checks;
-       int san_path_err_threshold;
-       int san_path_err_forget_rate;
-       int san_path_err_recovery_time;
+       int marginal_path_err_sample_time;
+       int marginal_path_err_rate_threshold;
+       int marginal_path_err_recheck_gap_time;
+       int marginal_path_double_failed_time;
        int skip_kpartx;
        int max_sectors_kb;
        uid_t uid;
@@ -156,9 +158,10 @@ struct config {
        int processed_main_config;
        int delay_watch_checks;
        int delay_wait_checks;
-       int san_path_err_threshold;
-       int san_path_err_forget_rate;
-       int san_path_err_recovery_time;
+       int marginal_path_err_sample_time;
+       int marginal_path_err_rate_threshold;
+       int marginal_path_err_recheck_gap_time;
+       int marginal_path_double_failed_time;
        int uxsock_timeout;
        int strict_timing;
        int retrigger_tries;
index 7a3db31897d032af375ecc5764732b06626ed333..09821e84e62aa6f647dbab500d24edf305d1aa59 100644 (file)
@@ -295,9 +295,10 @@ int setup_map(struct multipath *mpp, char *params, int params_size)
        select_deferred_remove(conf, mpp);
        select_delay_watch_checks(conf, mpp);
        select_delay_wait_checks(conf, mpp);
-       select_san_path_err_threshold(conf, mpp);
-       select_san_path_err_forget_rate(conf, mpp);
-       select_san_path_err_recovery_time(conf, mpp);
+       select_marginal_path_err_sample_time(conf, mpp);
+       select_marginal_path_err_rate_threshold(conf, mpp);
+       select_marginal_path_err_recheck_gap_time(conf, mpp);
+       select_marginal_path_double_failed_time(conf, mpp);
        select_skip_kpartx(conf, mpp);
        select_max_sectors_kb(conf, mpp);
 
index 36cccc94efb41581e8c4705e161003a602ab1577..3b36e1dad5229cb99eb6ff91c1261037c353b00e 100644 (file)
@@ -1083,33 +1083,45 @@ declare_hw_handler(delay_wait_checks, set_off_int_undef)
 declare_hw_snprint(delay_wait_checks, print_off_int_undef)
 declare_mp_handler(delay_wait_checks, set_off_int_undef)
 declare_mp_snprint(delay_wait_checks, print_off_int_undef)
-declare_def_handler(san_path_err_threshold, set_off_int_undef)
-declare_def_snprint_defint(san_path_err_threshold, print_off_int_undef,
+declare_def_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_err_sample_time, print_off_int_undef,
                           DEFAULT_ERR_CHECKS)
-declare_ovr_handler(san_path_err_threshold, set_off_int_undef)
-declare_ovr_snprint(san_path_err_threshold, print_off_int_undef)
-declare_hw_handler(san_path_err_threshold, set_off_int_undef)
-declare_hw_snprint(san_path_err_threshold, print_off_int_undef)
-declare_mp_handler(san_path_err_threshold, set_off_int_undef)
-declare_mp_snprint(san_path_err_threshold, print_off_int_undef)
-declare_def_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_def_snprint_defint(san_path_err_forget_rate, print_off_int_undef,
+declare_ovr_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_ovr_snprint(marginal_path_err_sample_time, print_off_int_undef)
+declare_hw_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_hw_snprint(marginal_path_err_sample_time, print_off_int_undef)
+declare_mp_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_mp_snprint(marginal_path_err_sample_time, print_off_int_undef)
+declare_def_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_err_rate_threshold, print_off_int_undef,
                           DEFAULT_ERR_CHECKS)
-declare_ovr_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_ovr_snprint(san_path_err_forget_rate, print_off_int_undef)
-declare_hw_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_hw_snprint(san_path_err_forget_rate, print_off_int_undef)
-declare_mp_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_mp_snprint(san_path_err_forget_rate, print_off_int_undef)
-declare_def_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_def_snprint_defint(san_path_err_recovery_time, print_off_int_undef,
+declare_ovr_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_ovr_snprint(marginal_path_err_rate_threshold, print_off_int_undef)
+declare_hw_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_hw_snprint(marginal_path_err_rate_threshold, print_off_int_undef)
+declare_mp_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_mp_snprint(marginal_path_err_rate_threshold, print_off_int_undef)
+declare_def_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_err_recheck_gap_time, print_off_int_undef,
                           DEFAULT_ERR_CHECKS)
-declare_ovr_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_ovr_snprint(san_path_err_recovery_time, print_off_int_undef)
-declare_hw_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_hw_snprint(san_path_err_recovery_time, print_off_int_undef)
-declare_mp_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_mp_snprint(san_path_err_recovery_time, print_off_int_undef)
+declare_ovr_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_ovr_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef)
+declare_hw_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_hw_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef)
+declare_mp_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_mp_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef)
+declare_def_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_double_failed_time, print_off_int_undef,
+                          DEFAULT_ERR_CHECKS)
+declare_ovr_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_ovr_snprint(marginal_path_double_failed_time, print_off_int_undef)
+declare_hw_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_hw_snprint(marginal_path_double_failed_time, print_off_int_undef)
+declare_mp_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_mp_snprint(marginal_path_double_failed_time, print_off_int_undef)
+
+
+
 static int
 def_uxsock_timeout_handler(struct config *conf, vector strvec)
 {
@@ -1443,9 +1455,10 @@ init_keywords(vector keywords)
        install_keyword("config_dir", &def_config_dir_handler, &snprint_def_config_dir);
        install_keyword("delay_watch_checks", &def_delay_watch_checks_handler, &snprint_def_delay_watch_checks);
        install_keyword("delay_wait_checks", &def_delay_wait_checks_handler, &snprint_def_delay_wait_checks);
-       install_keyword("san_path_err_threshold", &def_san_path_err_threshold_handler, &snprint_def_san_path_err_threshold);
-       install_keyword("san_path_err_forget_rate", &def_san_path_err_forget_rate_handler, &snprint_def_san_path_err_forget_rate);
-       install_keyword("san_path_err_recovery_time", &def_san_path_err_recovery_time_handler, &snprint_def_san_path_err_recovery_time);
+       install_keyword("marginal_path_err_sample_time", &def_marginal_path_err_sample_time_handler, &snprint_def_marginal_path_err_sample_time);
+       install_keyword("marginal_path_err_rate_threshold", &def_marginal_path_err_rate_threshold_handler, &snprint_def_marginal_path_err_rate_threshold);
+       install_keyword("marginal_path_err_recheck_gap_time", &def_marginal_path_err_recheck_gap_time_handler, &snprint_def_marginal_path_err_recheck_gap_time);
+       install_keyword("marginal_path_double_failed_time", &def_marginal_path_double_failed_time_handler, &snprint_def_marginal_path_double_failed_time);
 
        install_keyword("find_multipaths", &def_find_multipaths_handler, &snprint_def_find_multipaths);
        install_keyword("uxsock_timeout", &def_uxsock_timeout_handler, &snprint_def_uxsock_timeout);
@@ -1530,9 +1543,10 @@ init_keywords(vector keywords)
        install_keyword("deferred_remove", &hw_deferred_remove_handler, &snprint_hw_deferred_remove);
        install_keyword("delay_watch_checks", &hw_delay_watch_checks_handler, &snprint_hw_delay_watch_checks);
        install_keyword("delay_wait_checks", &hw_delay_wait_checks_handler, &snprint_hw_delay_wait_checks);
-       install_keyword("san_path_err_threshold", &hw_san_path_err_threshold_handler, &snprint_hw_san_path_err_threshold);
-       install_keyword("san_path_err_forget_rate", &hw_san_path_err_forget_rate_handler, &snprint_hw_san_path_err_forget_rate);
-       install_keyword("san_path_err_recovery_time", &hw_san_path_err_recovery_time_handler, &snprint_hw_san_path_err_recovery_time);
+       install_keyword("marginal_path_err_sample_time", &hw_marginal_path_err_sample_time_handler, &snprint_hw_marginal_path_err_sample_time);
+       install_keyword("marginal_path_err_rate_threshold", &hw_marginal_path_err_rate_threshold_handler, &snprint_hw_marginal_path_err_rate_threshold);
+       install_keyword("marginal_path_err_recheck_gap_time", &hw_marginal_path_err_recheck_gap_time_handler, &snprint_hw_marginal_path_err_recheck_gap_time);
+       install_keyword("marginal_path_double_failed_time", &hw_marginal_path_double_failed_time_handler, &snprint_hw_marginal_path_double_failed_time);
        install_keyword("skip_kpartx", &hw_skip_kpartx_handler, &snprint_hw_skip_kpartx);
        install_keyword("max_sectors_kb", &hw_max_sectors_kb_handler, &snprint_hw_max_sectors_kb);
        install_sublevel_end();
@@ -1563,9 +1577,10 @@ init_keywords(vector keywords)
        install_keyword("deferred_remove", &ovr_deferred_remove_handler, &snprint_ovr_deferred_remove);
        install_keyword("delay_watch_checks", &ovr_delay_watch_checks_handler, &snprint_ovr_delay_watch_checks);
        install_keyword("delay_wait_checks", &ovr_delay_wait_checks_handler, &snprint_ovr_delay_wait_checks);
-       install_keyword("san_path_err_threshold", &ovr_san_path_err_threshold_handler, &snprint_ovr_san_path_err_threshold);
-       install_keyword("san_path_err_forget_rate", &ovr_san_path_err_forget_rate_handler, &snprint_ovr_san_path_err_forget_rate);
-       install_keyword("san_path_err_recovery_time", &ovr_san_path_err_recovery_time_handler, &snprint_ovr_san_path_err_recovery_time);
+       install_keyword("marginal_path_err_sample_time", &ovr_marginal_path_err_sample_time_handler, &snprint_ovr_marginal_path_err_sample_time);
+       install_keyword("marginal_path_err_rate_threshold", &ovr_marginal_path_err_rate_threshold_handler, &snprint_ovr_marginal_path_err_rate_threshold);
+       install_keyword("marginal_path_err_recheck_gap_time", &ovr_marginal_path_err_recheck_gap_time_handler, &snprint_ovr_marginal_path_err_recheck_gap_time);
+       install_keyword("marginal_path_double_failed_time", &ovr_marginal_path_double_failed_time_handler, &snprint_ovr_marginal_path_double_failed_time);
 
        install_keyword("skip_kpartx", &ovr_skip_kpartx_handler, &snprint_ovr_skip_kpartx);
        install_keyword("max_sectors_kb", &ovr_max_sectors_kb_handler, &snprint_ovr_max_sectors_kb);
@@ -1595,9 +1610,10 @@ init_keywords(vector keywords)
        install_keyword("deferred_remove", &mp_deferred_remove_handler, &snprint_mp_deferred_remove);
        install_keyword("delay_watch_checks", &mp_delay_watch_checks_handler, &snprint_mp_delay_watch_checks);
        install_keyword("delay_wait_checks", &mp_delay_wait_checks_handler, &snprint_mp_delay_wait_checks);
-       install_keyword("san_path_err_threshold", &mp_san_path_err_threshold_handler, &snprint_mp_san_path_err_threshold);
-       install_keyword("san_path_err_forget_rate", &mp_san_path_err_forget_rate_handler, &snprint_mp_san_path_err_forget_rate);
-       install_keyword("san_path_err_recovery_time", &mp_san_path_err_recovery_time_handler, &snprint_mp_san_path_err_recovery_time);
+       install_keyword("marginal_path_err_sample_time", &mp_marginal_path_err_sample_time_handler, &snprint_mp_marginal_path_err_sample_time);
+       install_keyword("marginal_path_err_rate_threshold", &mp_marginal_path_err_rate_threshold_handler, &snprint_mp_marginal_path_err_rate_threshold);
+       install_keyword("marginal_path_err_recheck_gap_time", &mp_marginal_path_err_recheck_gap_time_handler, &snprint_mp_marginal_path_err_recheck_gap_time);
+       install_keyword("marginal_path_double_failed_time", &mp_marginal_path_double_failed_time_handler, &snprint_mp_marginal_path_double_failed_time);
        install_keyword("skip_kpartx", &mp_skip_kpartx_handler, &snprint_mp_skip_kpartx);
        install_keyword("max_sectors_kb", &mp_max_sectors_kb_handler, &snprint_mp_max_sectors_kb);
        install_sublevel_end();
index efac82458ffcfb0b50cdb147428a978aab06f0fe..cadf4617894f55ab42287881e92fe9b71f4aa396 100644 (file)
@@ -1573,7 +1573,7 @@ get_state (struct path * pp, struct config *conf, int daemon, int oldstate)
                }
                checker_set_fd(c, pp->fd);
                if (checker_init(c, pp->mpp?&pp->mpp->mpcontext:NULL)) {
-                       memset(c, 0x0, sizeof(struct checker));
+                       checker_clear(c);
                        condlog(3, "%s: checker init failed", pp->dev);
                        return PATH_UNCHECKED;
                }
index b018ddf6dd63a5426fadfe27e50300da53c3c1b4..17e8ac6a0602a5dd5f20776e0464d5d4672af957 100644 (file)
@@ -568,15 +568,15 @@ static struct hwentry default_hw[] = {
        },
        {
                /* XIV Storage System / FlashSystem A9000/A9000R */
-               .vendor        = "IBM",
-               .product       = "2810XIV",
+               .vendor        = "(XIV|IBM)",
+               .product       = "(NEXTRA|2810XIV)",
                .no_path_retry = NO_PATH_RETRY_QUEUE,
                .pgpolicy      = MULTIBUS,
        },
        {
-               /* FlashSystem 710/720/810/820/840/900 */
-               .vendor        = "IBM",
-               .product       = "FlashSystem",
+               /* TMS RamSan / FlashSystem 710/720/810/820/840/900 */
+               .vendor        = "(TMS|IBM)",
+               .product       = "(RamSan|FlashSystem)",
                .pgpolicy      = MULTIBUS,
        },
        {
@@ -940,7 +940,8 @@ static struct hwentry default_hw[] = {
                /* OceanStor V3 */
                .vendor        = "HUAWEI",
                .product       = "XSG1",
-               .pgpolicy      = MULTIBUS,
+               .pgpolicy      = GROUP_BY_PRIO,
+               .prio_name     = PRIO_ALUA,
        },
        /*
         * Red Hat
@@ -1063,6 +1064,13 @@ static struct hwentry default_hw[] = {
                .pgpolicy      = MULTIBUS,
                .no_path_retry = 30,
        },
+       {
+               /* Magnitude family */
+               .vendor        = "(XIOTECH|XIOtech)",
+               .product       = "Magnitude",
+               .pgpolicy      = MULTIBUS,
+               .no_path_retry = 30,
+       },
        /*
         * Violin Memory
         */
@@ -1162,6 +1170,16 @@ static struct hwentry default_hw[] = {
                .prio_name     = PRIO_ALUA,
                .no_path_retry = 30,
        },
+       /*
+        * AccelStor
+        */
+       {
+               /* NeoSapphire */
+               .vendor        = "AStor",
+               .product       = "NeoSapphire",
+               .pgpolicy      = MULTIBUS,
+               .no_path_retry = 30,
+       },
        /*
         * EOL
         */
diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c
new file mode 100644 (file)
index 0000000..75a6df6
--- /dev/null
@@ -0,0 +1,743 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
+ *
+ * io_err_stat.c
+ * version 1.0
+ *
+ * IO error stream statistic process for path failure event from kernel
+ *
+ * Author(s): Guan Junxiong 2017 <guanjunxiong@huawei.com>
+ *
+ * This file is released under the GPL version 2, or any later version.
+ */
+
+#include <unistd.h>
+#include <pthread.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <libaio.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "vector.h"
+#include "memory.h"
+#include "checkers.h"
+#include "config.h"
+#include "structs.h"
+#include "structs_vec.h"
+#include "devmapper.h"
+#include "debug.h"
+#include "lock.h"
+#include "time-util.h"
+#include "io_err_stat.h"
+
+/*
+ * Timeout in seconds; io_err_stat_handle_pathfail() rejects a configured
+ * sample time shorter than twice this value.
+ */
+#define IOTIMEOUT_SEC                  60
+/* NOTE(review): not used in the visible part of the file - confirm usage. */
+#define TIMEOUT_NO_IO_NSEC             10000000 /*10ms = 10000000ns*/
+/* Number of counted path failures at which a path is enqueued for testing. */
+#define FLAKY_PATHFAIL_THRESHOLD       2
+/* Number of struct dio_ctx slots allocated per tracked path. */
+#define CONCUR_NR_EVENT                        32
+
+/*
+ * Negative sentinel values stored in path->io_err_pathfail_cnt instead of
+ * a failure count.
+ */
+#define PATH_IO_ERR_IN_CHECKING                -1
+#define PATH_IO_ERR_IN_POLLING_RECHECK -2
+
+/* condlog() wrapper that prefixes every message with "io error statistic: ". */
+#define io_err_stat_log(prio, fmt, args...) \
+       condlog(prio, "io error statistic: " fmt, ##args)
+
+
+/*
+ * Collection of the paths tracked by this module.
+ * The mutex is held around lookups and insertions on pathvec.
+ */
+struct io_err_stat_pathvec {
+       pthread_mutex_t mutex;
+       vector          pathvec;        /* elements are struct io_err_stat_path * */
+};
+
+/*
+ * One direct-I/O slot; each tracked path owns CONCUR_NR_EVENT of these.
+ */
+struct dio_ctx {
+       struct timespec io_starttime;   /* zeroed by init_each_dio_ctx() */
+       int             blksize;        /* size of buf in bytes */
+       void            *buf;           /* page-aligned buffer from posix_memalign() */
+       struct iocb     io;             /* libaio control block; presumably the in-flight request - confirm */
+};
+
+/*
+ * Per-path bookkeeping for the I/O error statistics.
+ */
+struct io_err_stat_path {
+       char            devname[FILE_NAME_SIZE];        /* copied from path->dev */
+       int             fd;             /* O_DIRECT descriptor on /dev/<devname>, -1 when unset */
+       struct dio_ctx  *dio_ctx_array; /* CONCUR_NR_EVENT slots, see setup_directio_ctx() */
+       int             io_err_nr;      /* presumably the count of failed I/Os - confirm */
+       int             io_nr;          /* presumably the count of issued I/Os - confirm */
+       struct timespec start_time;
+
+       int             total_time;     /* from mpp->marginal_path_err_sample_time */
+       int             err_rate_threshold;     /* from mpp->marginal_path_err_rate_threshold */
+};
+
+/*
+ * NOTE(review): presumably the handle and attributes of the statistics
+ * thread; their use lies outside the visible part of the file.
+ */
+pthread_t              io_err_stat_thr;
+pthread_attr_t         io_err_stat_attr;
+
+/* Tracked paths; accessed under paths->mutex. */
+static struct io_err_stat_pathvec *paths;
+/* NOTE(review): vecs and ioctx are non-static globals - confirm that external linkage is intended. */
+struct vectors *vecs;
+io_context_t   ioctx;
+
+static void cancel_inflight_io(struct io_err_stat_path *pp);
+
+/*
+ * Adapter with a void (*)(void *) signature around rcu_unregister_thread().
+ * param is ignored.
+ * NOTE(review): presumably installed as a pthread cleanup handler - confirm
+ * at the call site.
+ */
+static void rcu_unregister(void *param)
+{
+       rcu_unregister_thread();
+}
+
+/*
+ * Search pathvec for the entry whose devname equals dev.
+ *
+ * Returns the matching entry, or NULL when pathvec is NULL or contains
+ * no such entry. A miss on a non-NULL vector is logged at level 4.
+ */
+struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char *dev)
+{
+       struct io_err_stat_path *entry;
+       int idx;
+
+       if (pathvec) {
+               vector_foreach_slot(pathvec, entry, idx) {
+                       if (strcmp(entry->devname, dev) == 0)
+                               return entry;
+               }
+               io_err_stat_log(4, "%s: not found in check queue", dev);
+       }
+
+       return NULL;
+}
+
+/*
+ * Prepare one direct-I/O slot: record the block size, allocate a zeroed
+ * buffer of blksize bytes aligned to pgsize and clear the start time.
+ *
+ * Returns 0 on success, 1 if the aligned allocation fails.
+ */
+static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
+               unsigned long pgsize)
+{
+       int rc;
+
+       ct->blksize = blksize;
+       rc = posix_memalign(&ct->buf, pgsize, blksize);
+       if (rc != 0)
+               return 1;
+
+       ct->io_starttime.tv_nsec = 0;
+       ct->io_starttime.tv_sec = 0;
+       memset(ct->buf, 0, blksize);
+
+       return 0;
+}
+
+/*
+ * Release the buffer owned by one direct-I/O slot.
+ */
+static void deinit_each_dio_ctx(struct dio_ctx *ct)
+{
+       /* free(NULL) is a no-op, so an unallocated slot needs no special case. */
+       free(ct->buf);
+}
+
+/*
+ * Open /dev/<devname> for direct I/O (unless p->fd is already set) and
+ * allocate CONCUR_NR_EVENT I/O slots sized to the device block size.
+ *
+ * Returns 0 on success and 1 on failure. On failure after the open step
+ * the descriptor is closed and both p->fd and p->dio_ctx_array are reset,
+ * so the entry never carries a stale descriptor or a dangling pointer.
+ */
+static int setup_directio_ctx(struct io_err_stat_path *p)
+{
+       unsigned long pgsize = getpagesize();
+       char fpath[PATH_MAX];
+       int blksize = 0;
+       int i;
+
+       if (snprintf(fpath, PATH_MAX, "/dev/%s", p->devname) >= PATH_MAX)
+               return 1;
+       if (p->fd < 0)
+               p->fd = open(fpath, O_RDONLY | O_DIRECT);
+       if (p->fd < 0)
+               return 1;
+
+       p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT);
+       if (!p->dio_ctx_array)
+               goto fail_close;
+
+       if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) {
+               io_err_stat_log(4, "%s:cannot get blocksize, set default 512",
+                               p->devname);
+               blksize = 512;
+       }
+       if (!blksize)
+               goto free_pdctx;
+
+       for (i = 0; i < CONCUR_NR_EVENT; i++) {
+               if (init_each_dio_ctx(p->dio_ctx_array + i, blksize, pgsize))
+                       goto deinit;
+       }
+       return 0;
+
+deinit:
+       /*
+        * Slot i is the one that failed; only slots [0, i) own a buffer.
+        * Unwinding just those avoids relying on MALLOC() zero-filling the
+        * array or on posix_memalign() leaving buf untouched on failure.
+        */
+       while (--i >= 0)
+               deinit_each_dio_ctx(p->dio_ctx_array + i);
+free_pdctx:
+       FREE(p->dio_ctx_array);
+       p->dio_ctx_array = NULL;
+fail_close:
+       close(p->fd);
+       p->fd = -1;
+
+       return 1;
+}
+
+/*
+ * Tear down everything setup_directio_ctx() created for p: cancel the
+ * in-flight I/O, free the slot buffers and the slot array, and close the
+ * descriptor. Does nothing if p is NULL or has no slot array.
+ */
+static void destroy_directio_ctx(struct io_err_stat_path *p)
+{
+       int i;
+
+       if (!p || !p->dio_ctx_array)
+               return;
+       cancel_inflight_io(p);
+
+       for (i = 0; i < CONCUR_NR_EVENT; i++)
+               deinit_each_dio_ctx(p->dio_ctx_array + i);
+       FREE(p->dio_ctx_array);
+       p->dio_ctx_array = NULL;
+
+       /*
+        * The "unset" sentinel is -1 (see alloc_io_err_stat_path()), and 0
+        * is a valid descriptor number, so it must be closed as well.
+        */
+       if (p->fd >= 0) {
+               close(p->fd);
+               p->fd = -1;
+       }
+}
+
+/*
+ * Allocate a tracking entry with every field in its "empty" state:
+ * counters and times zeroed, no slot array, and fd set to -1.
+ *
+ * Returns the new entry, or NULL if the allocation fails. The caller
+ * releases it with free_io_err_stat_path().
+ */
+static struct io_err_stat_path *alloc_io_err_stat_path(void)
+{
+       struct io_err_stat_path *p;
+
+       p = (struct io_err_stat_path *)MALLOC(sizeof(*p));
+       if (!p)
+               return NULL;
+
+       memset(p->devname, 0, sizeof(p->devname));
+       p->io_err_nr = 0;
+       p->io_nr = 0;
+       p->total_time = 0;
+       p->start_time.tv_sec = 0;
+       p->start_time.tv_nsec = 0;
+       p->err_rate_threshold = 0;
+       p->fd = -1;
+       /*
+        * destroy_directio_ctx() tests this pointer against NULL, so it
+        * must not be left uninitialised like the original code did.
+        */
+       p->dio_ctx_array = NULL;
+
+       return p;
+}
+
+/*
+ * Release a tracking entry obtained from alloc_io_err_stat_path().
+ * Only the structure itself is freed; the direct-I/O resources are
+ * released separately by destroy_directio_ctx().
+ */
+static void free_io_err_stat_path(struct io_err_stat_path *p)
+{
+       FREE(p);
+}
+
+/*
+ * Create an empty path collection: the container, its vector and its
+ * mutex (default attributes).
+ *
+ * Returns the new collection, or NULL if any step fails; whatever was
+ * allocated before the failing step is released again.
+ */
+static struct io_err_stat_pathvec *alloc_pathvec(void)
+{
+       struct io_err_stat_pathvec *pv;
+
+       pv = (struct io_err_stat_pathvec *)MALLOC(sizeof(*pv));
+       if (!pv)
+               return NULL;
+
+       pv->pathvec = vector_alloc();
+       if (!pv->pathvec) {
+               FREE(pv);
+               return NULL;
+       }
+
+       if (pthread_mutex_init(&pv->mutex, NULL) != 0) {
+               vector_free(pv->pathvec);
+               FREE(pv);
+               return NULL;
+       }
+
+       return pv;
+}
+
+/*
+ * Destroy a path collection created by alloc_pathvec(): destroy the
+ * mutex, release every tracked entry together with its direct-I/O
+ * resources, free the vector and finally the container itself.
+ * A NULL argument is ignored.
+ */
+static void free_io_err_pathvec(struct io_err_stat_pathvec *p)
+{
+       struct io_err_stat_path *path;
+       int i;
+
+       if (!p)
+               return;
+       pthread_mutex_destroy(&p->mutex);
+       /*
+        * The original test was inverted (!p->pathvec), so an existing
+        * vector and all of its entries were never released.
+        */
+       if (p->pathvec) {
+               vector_foreach_slot(p->pathvec, path, i) {
+                       destroy_directio_ctx(path);
+                       free_io_err_stat_path(path);
+               }
+               vector_free(p->pathvec);
+       }
+       FREE(p);
+}
+
+/*
+ * Add a path to the collection of paths under I/O error testing.
+ *
+ * A tracking entry is allocated for path->dev, configured from the
+ * multipath's marginal_path_err_sample_time and
+ * marginal_path_err_rate_threshold, given its direct-I/O context and
+ * appended to paths->pathvec. Unless reinstating is already disabled for
+ * the path, the path is also failed in device-mapper and scheduled for
+ * an immediate check.
+ *
+ * return value
+ * 0: enqueue OK
+ * 1: fails because of internal error
+ * 2: fails because of existing already
+ */
+static int enqueue_io_err_stat_by_path(struct path *path)
+{
+       struct io_err_stat_path *p;
+
+       /*
+        * NOTE(review): the lock is dropped between this lookup and the
+        * insertion below, so two concurrent callers could both miss and
+        * both insert the same device - confirm callers are serialized.
+        */
+       pthread_mutex_lock(&paths->mutex);
+       p = find_err_path_by_dev(paths->pathvec, path->dev);
+       if (p) {
+               pthread_mutex_unlock(&paths->mutex);
+               return 2;
+       }
+       pthread_mutex_unlock(&paths->mutex);
+
+       p = alloc_io_err_stat_path();
+       if (!p)
+               return 1;
+
+       /* assumes path->dev is at least sizeof(p->devname) bytes - TODO confirm */
+       memcpy(p->devname, path->dev, sizeof(p->devname));
+       p->total_time = path->mpp->marginal_path_err_sample_time;
+       p->err_rate_threshold = path->mpp->marginal_path_err_rate_threshold;
+
+       /* open() and ioctl() happen here, outside the mutex */
+       if (setup_directio_ctx(p))
+               goto free_ioerr_path;
+       pthread_mutex_lock(&paths->mutex);
+       if (!vector_alloc_slot(paths->pathvec))
+               goto unlock_destroy;
+       vector_set_slot(paths->pathvec, p);
+       pthread_mutex_unlock(&paths->mutex);
+
+       if (!path->io_err_disable_reinstate) {
+               /*
+                * Fail the path in the kernel for the duration of the
+                * test to make the test more reliable.
+                */
+               io_err_stat_log(3, "%s: fail dm path %s before checking",
+                               path->mpp->alias, path->dev);
+               path->io_err_disable_reinstate = 1;
+               dm_fail_path(path->mpp->alias, path->dev_t);
+               update_queue_mode_del_path(path->mpp);
+
+               /*
+                * Schedule a path check as soon as possible so that the
+                * path state is updated to the delayed state.
+                */
+               path->tick = 1;
+
+       }
+       io_err_stat_log(2, "%s: enqueue path %s to check",
+                       path->mpp->alias, path->dev);
+       return 0;
+
+unlock_destroy:
+       /* slot allocation failed: undo setup_directio_ctx() */
+       pthread_mutex_unlock(&paths->mutex);
+       destroy_directio_ctx(p);
+free_ioerr_path:
+       free_io_err_stat_path(p);
+
+       return 1;
+}
+
+/*
+ * Account for one failure of a path and, once the path looks flaky,
+ * enqueue it for the I/O error-rate test.
+ *
+ * Returns 1 without counting anything when reinstating is already
+ * disabled for the path, the failure counter holds a negative sentinel,
+ * the path has no multipath, the multipath has at most one active path,
+ * any of the marginal_path_* settings is unset or out of range, or the
+ * monotonic clock cannot be read. Returns 0 once the failure has been
+ * counted, whether or not the path was enqueued.
+ */
+int io_err_stat_handle_pathfail(struct path *path)
+{
+       struct timespec curr_time;
+       int res;
+
+       if (path->io_err_disable_reinstate) {
+               io_err_stat_log(3, "%s: reinstate is already disabled",
+                               path->dev);
+               return 1;
+       }
+       /* negative values are the PATH_IO_ERR_IN_* sentinels */
+       if (path->io_err_pathfail_cnt < 0)
+               return 1;
+
+       if (!path->mpp)
+               return 1;
+       if (path->mpp->nr_active <= 1)
+               return 1;
+       if (path->mpp->marginal_path_double_failed_time <= 0 ||
+               path->mpp->marginal_path_err_sample_time <= 0 ||
+               path->mpp->marginal_path_err_recheck_gap_time <= 0 ||
+               path->mpp->marginal_path_err_rate_threshold < 0) {
+               io_err_stat_log(4, "%s: parameter not set", path->mpp->alias);
+               return 1;
+       }
+       if (path->mpp->marginal_path_err_sample_time < (2 * IOTIMEOUT_SEC)) {
+               io_err_stat_log(2, "%s: marginal_path_err_sample_time should not less than %d",
+                               path->mpp->alias, 2 * IOTIMEOUT_SEC);
+               return 1;
+       }
+       /*
+        * The test should only be started for paths that have failed
+        * repeatedly within a certain time frame, so that there is reason
+        * to assume they are flaky. A path is treated as flaky once it
+        * has failed FLAKY_PATHFAIL_THRESHOLD times within
+        * marginal_path_double_failed_time seconds.
+        */
+       if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
+               return 1;
+       /* first failure: open the observation window */
+       if (path->io_err_pathfail_cnt == 0) {
+               path->io_err_pathfail_cnt++;
+               path->io_err_pathfail_starttime = curr_time.tv_sec;
+               io_err_stat_log(5, "%s: start path flakiness pre-checking",
+                               path->dev);
+               return 0;
+       }
+       /* window expired: this failure becomes the first of a new window */
+       if ((curr_time.tv_sec - path->io_err_pathfail_starttime) >
+                       path->mpp->marginal_path_double_failed_time) {
+               path->io_err_pathfail_cnt = 0;
+               path->io_err_pathfail_starttime = curr_time.tv_sec;
+               io_err_stat_log(5, "%s: restart path flakiness pre-checking",
+                               path->dev);
+       }
+       path->io_err_pathfail_cnt++;
+       if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
+               res = enqueue_io_err_stat_by_path(path);
+               /* on success mark the path as under test, otherwise start over */
+               if (!res)
+                       path->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
+               else
+                       path->io_err_pathfail_cnt = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * For a path whose reinstating was disabled after an IO error rate
+ * check: keep it disabled, schedule a new check once more than
+ * marginal_path_err_recheck_gap_time seconds have passed, or recover it.
+ *
+ * Returns 0 when the path is recovered (failure counter and disable flag
+ * cleared, tick set to 1) and 1 in every other case.
+ */
+int hit_io_err_recheck_time(struct path *pp)
+{
+       struct timespec curr_time;
+       int r;
+
+       if (pp->io_err_disable_reinstate == 0)
+               return 1;
+       if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
+               return 1;
+       if (pp->io_err_pathfail_cnt != PATH_IO_ERR_IN_POLLING_RECHECK)
+               return 1;
+       /* NOTE(review): pp->mpp is dereferenced unchecked -- confirm callers */
+       if (pp->mpp->nr_active <= 0) {
+               io_err_stat_log(2, "%s: recover path early", pp->dev);
+               goto recover;
+       }
+       if ((curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
+                       pp->mpp->marginal_path_err_recheck_gap_time) {
+               io_err_stat_log(4, "%s: reschedule checking after %d seconds",
+                               pp->dev,
+                               pp->mpp->marginal_path_err_recheck_gap_time);
+               /*
+                * to reschedule io error checking again
+                * if the path is good enough, we claim it is good
+                * and can be reinstated as soon as possible in the
+                * check_path routine.
+                */
+               pp->io_err_dis_reinstate_time = curr_time.tv_sec;
+               r = enqueue_io_err_stat_by_path(pp);
+               /*
+                * A result of 1 means the enqueue failed because of an
+                * internal error; in this case we recover this path.
+                * A result of 0 means the check was queued. For any other
+                * result the state is left untouched and 1 is returned
+                * (per the original note: to set path state to PATH_SHAKY).
+                */
+               if (r == 1) {
+                       io_err_stat_log(3, "%s: enqueue fails, to recover",
+                                       pp->dev);
+                       goto recover;
+               } else if (!r) {
+                       pp->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
+               }
+       }
+
+       return 1;
+
+recover:
+       pp->io_err_pathfail_cnt = 0;
+       pp->io_err_disable_reinstate = 0;
+       /* tick = 1, as for the other "check as soon as possible" cases */
+       pp->tick = 1;
+       return 0;
+}
+
+/*
+ * Take p out of the global path vector when it is found there, then
+ * release its direct-IO contexts and the entry itself. Always returns 0.
+ */
+static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
+{
+       int slot = find_slot(paths->pathvec, p);
+
+       if (slot != -1)
+               vector_del_slot(paths->pathvec, slot);
+
+       destroy_directio_ctx(p);
+       free_io_err_stat_path(p);
+       return 0;
+}
+
+/*
+ * Book the outcome of one asynchronous read: PATH_DOWN and PATH_TIMEOUT
+ * count as IO errors, every other state leaves the counters alone.
+ */
+static void account_async_io_state(struct io_err_stat_path *pp, int rc)
+{
+       if (rc == PATH_DOWN || rc == PATH_TIMEOUT)
+               pp->io_err_nr++;
+}
+
+/*
+ * Finish the IO error rate check of pp once its total_time has elapsed.
+ *
+ * The error rate is expressed in errors per 1000 IOs and compared with
+ * pp->err_rate_threshold; the matching struct path (looked up by device
+ * name under vecs->lock) is then either released for reinstating or kept
+ * failed. In both cases pp is removed from the path vector and freed, so
+ * the caller must not use pp afterwards.
+ *
+ * Returns 1 if the clock cannot be read, 0 otherwise (also when the
+ * check has not run long enough yet and nothing was done).
+ */
+static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
+{
+       struct timespec currtime, difftime;
+       struct path *path;
+       double err_rate;
+
+       if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
+               return 1;
+       timespecsub(&currtime, &pp->start_time, &difftime);
+       if (difftime.tv_sec < pp->total_time)
+               return 0;
+
+       io_err_stat_log(4, "%s: check end", pp->devname);
+
+       /* no IO sent at all counts as an error rate of zero */
+       err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) / pp->io_nr;
+       io_err_stat_log(3, "%s: IO error rate (%.1f/1000)",
+                       pp->devname, err_rate);
+       pthread_cleanup_push(cleanup_lock, &vecs->lock);
+       lock(&vecs->lock);
+       pthread_testcancel();
+       path = find_path_by_dev(vecs->pathvec, pp->devname);
+       if (!path) {
+               io_err_stat_log(4, "path %s not found'", pp->devname);
+       } else if (err_rate <= pp->err_rate_threshold) {
+               path->io_err_pathfail_cnt = 0;
+               path->io_err_disable_reinstate = 0;
+               io_err_stat_log(3, "%s: (%d/%d) good to enable reinstating",
+                               pp->devname, pp->io_err_nr, pp->io_nr);
+               /*
+                * schedule path check as soon as possible to
+                * update path state. Do NOT reinstate dm path here
+                */
+               path->tick = 1;
+
+       } else if (path->mpp && path->mpp->nr_active > 1) {
+               /* too many errors and other active paths exist: keep it out */
+               io_err_stat_log(3, "%s: keep failing the dm path %s",
+                               path->mpp->alias, path->dev);
+               path->io_err_pathfail_cnt = PATH_IO_ERR_IN_POLLING_RECHECK;
+               path->io_err_disable_reinstate = 1;
+               path->io_err_dis_reinstate_time = currtime.tv_sec;
+               io_err_stat_log(3, "%s: disable reinstating of %s",
+                               path->mpp->alias, path->dev);
+       } else {
+               /* no map, or no more than one active path: do not block it */
+               path->io_err_pathfail_cnt = 0;
+               path->io_err_disable_reinstate = 0;
+               io_err_stat_log(3, "%s: there is orphan path, enable reinstating",
+                               pp->devname);
+       }
+       lock_cleanup_pop(vecs->lock);
+
+       delete_io_err_stat_by_addr(pp);
+
+       return 0;
+}
+
+/*
+ * Submit one asynchronous read of ct->blksize bytes at offset 0 of fd
+ * if the context ct is idle, i.e. its io_starttime is all zero.
+ *
+ * Returns 0 when a read was submitted. Returns -1 when the context is
+ * still busy, the clock could not be read, or io_submit() failed; in the
+ * last case io_starttime stays set, exactly as before this change.
+ */
+static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
+{
+       int rc = -1;
+
+       if (ct->io_starttime.tv_nsec == 0 &&
+                       ct->io_starttime.tv_sec == 0) {
+               struct iocb *ios[1] = { &ct->io };
+               int nr;
+
+               if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) {
+                       ct->io_starttime.tv_sec = 0;
+                       ct->io_starttime.tv_nsec = 0;
+                       return rc;
+               }
+               io_prep_pread(&ct->io, fd, ct->buf, ct->blksize, 0);
+               /*
+                * libaio's io_submit() reports a failure as a negative error
+                * number in its return value and does not set errno, so log
+                * the returned code instead of errno
+                */
+               nr = io_submit(ioctx, 1, ios);
+               if (nr != 1) {
+                       io_err_stat_log(5, "%s: io_submit error %i",
+                                       dev, -nr);
+                       return rc;
+               }
+               rc = 0;
+       }
+
+       return rc;
+}
+
+/*
+ * Hand a read to every idle direct-IO context of pp and count the ones
+ * that were submitted. Nothing is sent during the last IOTIMEOUT_SEC
+ * seconds of the sampling period, so that outstanding IO can complete
+ * or time out. The sampling start time is recorded after the first batch.
+ */
+static void send_batch_async_ios(struct io_err_stat_path *pp)
+{
+       struct timespec now, elapsed;
+       int idx;
+
+       if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
+               return;
+       /*
+        * Give a free time for all IO to complete or timeout
+        */
+       if (pp->start_time.tv_sec != 0) {
+               timespecsub(&now, &pp->start_time, &elapsed);
+               if (elapsed.tv_sec + IOTIMEOUT_SEC >= pp->total_time)
+                       return;
+       }
+
+       for (idx = 0; idx < CONCUR_NR_EVENT; idx++) {
+               if (send_each_async_io(&pp->dio_ctx_array[idx], pp->fd,
+                                       pp->devname) == 0)
+                       pp->io_nr++;
+       }
+
+       /* start time still unset: take it now, zero it again on failure */
+       if (pp->start_time.tv_sec != 0 || pp->start_time.tv_nsec != 0)
+               return;
+       if (clock_gettime(CLOCK_MONOTONIC, &pp->start_time) != 0) {
+               pp->start_time.tv_sec = 0;
+               pp->start_time.tv_nsec = 0;
+       }
+}
+
+/*
+ * Check one direct-IO context against the IO timeout.
+ *
+ * ct:  context to inspect
+ * t:   current monotonic time
+ * dev: device name, used for logging only
+ *
+ * Returns PATH_UNCHECKED when the context is idle, PATH_PENDING when its
+ * IO is in flight for no more than IOTIMEOUT_SEC seconds, and
+ * PATH_TIMEOUT when the IO took longer; in that case the IO is cancelled
+ * and the context is marked idle again.
+ */
+static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t,
+               char *dev)
+{
+       struct timespec difftime;
+       struct io_event event;
+       int             rc = PATH_UNCHECKED;
+       int             r;
+
+       /*
+        * A context is idle only if both fields are zero, which is the
+        * test used when submitting and when cancelling in-flight IO.
+        * Looking at tv_sec alone would ignore an IO started during the
+        * first second of the monotonic clock.
+        */
+       if (ct->io_starttime.tv_sec == 0 && ct->io_starttime.tv_nsec == 0)
+               return rc;
+       timespecsub(t, &ct->io_starttime, &difftime);
+       if (difftime.tv_sec > IOTIMEOUT_SEC) {
+               io_err_stat_log(5, "%s: abort check on timeout", dev);
+               /*
+                * libaio's io_cancel() returns a negative error number and
+                * does not set errno, so log the returned code
+                */
+               r = io_cancel(ioctx, &ct->io, &event);
+               if (r)
+                       io_err_stat_log(5, "%s: io_cancel error %i",
+                                       dev, -r);
+               ct->io_starttime.tv_sec = 0;
+               ct->io_starttime.tv_nsec = 0;
+               rc = PATH_TIMEOUT;
+       } else {
+               rc = PATH_PENDING;
+       }
+
+       return rc;
+}
+
+/*
+ * Walk every direct-IO context of every queued path, cancel the IOs that
+ * exceeded the timeout and account the resulting state on the path.
+ */
+static void poll_async_io_timeout(void)
+{
+       struct io_err_stat_path *pp;
+       struct timespec now;
+       int slot, k;
+
+       if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
+               return;
+       vector_foreach_slot(paths->pathvec, pp, slot) {
+               for (k = 0; k < CONCUR_NR_EVENT; k++) {
+                       int state = try_to_cancel_timeout_io(
+                                       &pp->dio_ctx_array[k],
+                                       &now, pp->devname);
+
+                       account_async_io_state(pp, state);
+               }
+       }
+}
+
+/*
+ * Cancel every read of pp that is still marked as in flight (non-zero
+ * io_starttime) and mark its context idle again.
+ */
+static void cancel_inflight_io(struct io_err_stat_path *pp)
+{
+       struct io_event ev;
+       int idx;
+
+       for (idx = 0; idx < CONCUR_NR_EVENT; idx++) {
+               struct dio_ctx *ctx = &pp->dio_ctx_array[idx];
+               int err;
+
+               /* idle context, nothing to cancel */
+               if (ctx->io_starttime.tv_sec == 0 &&
+                               ctx->io_starttime.tv_nsec == 0)
+                       continue;
+
+               io_err_stat_log(5, "%s: abort infligh io",
+                               pp->devname);
+               err = io_cancel(ioctx, &ctx->io, &ev);
+               if (err)
+                       io_err_stat_log(5, "%s: io_cancel error %d, %i",
+                                       pp->devname, err, errno);
+               ctx->io_starttime.tv_sec = 0;
+               ctx->io_starttime.tv_nsec = 0;
+       }
+}
+
+/*
+ * Mark the context idle again and translate the completion event into a
+ * path state: PATH_UP if the whole block was read, PATH_DOWN otherwise.
+ */
+static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct io_event *ev)
+{
+       int state = PATH_DOWN;
+
+       if (ev->res == ct->blksize)
+               state = PATH_UP;
+
+       ct->io_starttime.tv_sec = 0;
+       ct->io_starttime.tv_nsec = 0;
+       return state;
+}
+
+/*
+ * Find the direct-IO context that owns the completed iocb and account
+ * the result on its path. Events matching no known context are ignored.
+ */
+static void handle_async_io_done_event(struct io_event *io_evt)
+{
+       struct io_err_stat_path *pp;
+       int slot, k;
+
+       vector_foreach_slot(paths->pathvec, pp, slot) {
+               for (k = 0; k < CONCUR_NR_EVENT; k++) {
+                       struct dio_ctx *ctx = &pp->dio_ctx_array[k];
+
+                       if (&ctx->io != io_evt->obj)
+                               continue;
+                       account_async_io_state(pp,
+                                       handle_done_dio_ctx(ctx, io_evt));
+                       return;
+               }
+       }
+}
+
+/*
+ * Reap up to CONCUR_NR_EVENT completed asynchronous reads, waiting at
+ * most timeout_nsecs nanoseconds, and account each of them.
+ *
+ * dev is used for logging only; the events that are reaped may belong to
+ * any queued path.
+ */
+static void process_async_ios_event(int timeout_nsecs, char *dev)
+{
+       struct io_event events[CONCUR_NR_EVENT];
+       int             i, n;
+       struct timespec timeout = { .tv_nsec = timeout_nsecs };
+
+       n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout);
+       if (n < 0) {
+               /*
+                * libaio's io_getevents() returns a negative error number
+                * and does not set errno
+                */
+               io_err_stat_log(3, "%s: async io events returned %d (errno=%s)",
+                               dev, n, strerror(-n));
+       } else {
+               for (i = 0; i < n; i++)
+                       handle_async_io_done_event(&events[i]);
+       }
+}
+
+/*
+ * One service round over all queued paths, done with paths->mutex held:
+ * send new reads, reap completions, time out stale reads and finish the
+ * checks whose sampling period is over.
+ */
+static void service_paths(void)
+{
+       struct io_err_stat_path *pp;
+       int i;
+
+       pthread_mutex_lock(&paths->mutex);
+       vector_foreach_slot(paths->pathvec, pp, i) {
+               send_batch_async_ios(pp);
+               process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname);
+               poll_async_io_timeout();
+               /*
+                * NOTE(review): poll_io_err_stat() may delete pp's slot from
+                * paths->pathvec while this loop iterates over it; presumably
+                * the following entry is then skipped until the next round --
+                * confirm against the vector_foreach_slot() definition
+                */
+               poll_io_err_stat(vecs, pp);
+       }
+       pthread_mutex_unlock(&paths->mutex);
+}
+
+/*
+ * Body of the io error statistic thread. data is stored in the vecs
+ * variable declared outside this function; the loop then services the
+ * queued paths every 100000 microseconds and never leaves on its own.
+ */
+static void *io_err_stat_loop(void *data)
+{
+       vecs = (struct vectors *)data;
+       pthread_cleanup_push(rcu_unregister, NULL);
+       rcu_register_thread();
+
+       mlockall(MCL_CURRENT | MCL_FUTURE);
+       while (1) {
+               service_paths();
+               usleep(100000);
+       }
+
+       /* not reached; closes the pthread_cleanup_push() above */
+       pthread_cleanup_pop(1);
+       return NULL;
+}
+
+/*
+ * Set up the aio context (CONCUR_NR_EVENT events), allocate the path
+ * vector and create the io error statistic thread, which receives data
+ * as its argument.
+ *
+ * Returns 0 on success. Returns 1 on failure, after releasing whatever
+ * had been set up so far.
+ */
+int start_io_err_stat_thread(void *data)
+{
+       if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) {
+               io_err_stat_log(4, "io_setup failed");
+               return 1;
+       }
+       paths = alloc_pathvec();
+       if (!paths)
+               goto destroy_ctx;
+
+       if (pthread_create(&io_err_stat_thr, &io_err_stat_attr,
+                               io_err_stat_loop, data)) {
+               io_err_stat_log(0, "cannot create io_error statistic thread");
+               goto out_free;
+       }
+       io_err_stat_log(3, "thread started");
+       return 0;
+
+       /* unwind in reverse order of setup */
+out_free:
+       free_io_err_pathvec(paths);
+destroy_ctx:
+       io_destroy(ioctx);
+       io_err_stat_log(0, "failed to start io_error statistic thread");
+       return 1;
+}
+
+/*
+ * Cancel the io error statistic thread, send it SIGUSR2, then free the
+ * path vector and destroy the aio context.
+ *
+ * NOTE(review): no join is visible here before the shared state is
+ * released -- confirm the thread cannot still be using paths or ioctx.
+ */
+void stop_io_err_stat_thread(void)
+{
+       pthread_cancel(io_err_stat_thr);
+       pthread_kill(io_err_stat_thr, SIGUSR2);
+       free_io_err_pathvec(paths);
+       io_destroy(ioctx);
+}
diff --git a/libmultipath/io_err_stat.h b/libmultipath/io_err_stat.h
new file mode 100644 (file)
index 0000000..bbf31b4
--- /dev/null
@@ -0,0 +1,15 @@
+#ifndef _IO_ERR_STAT_H
+#define _IO_ERR_STAT_H
+
+#include "vector.h"
+#include "lock.h"
+
+
+/* attributes used when creating the io error statistic thread */
+extern pthread_attr_t io_err_stat_attr;
+
+/* set up aio and start the thread; returns 0 on success, 1 on failure */
+int start_io_err_stat_thread(void *data);
+/* cancel the thread and release the path vector and the aio context */
+void stop_io_err_stat_thread(void);
+/* account a path failure; returns 1 if it was not taken into account */
+int io_err_stat_handle_pathfail(struct path *path);
+/* returns 0 if the path was recovered for reinstating, 1 otherwise */
+int hit_io_err_recheck_time(struct path *pp);
+
+#endif /* _IO_ERR_STAT_H */
index 9fc2dfc0a86dfe38ed8d362cb637cdc5a20a7d6f..9d5397ec1b3ae0b1aa532147ca7b7119c3d9c61c 100644 (file)
  *    scale, the priority "rc" of each path can be provided.
  *
  * Author(s): Yang Feng <philip.yang@huawei.com>
+ * Revised:   Guan Junxiong <guanjunxiong@huawei.com>
  *
  * This file is released under the GPL version 2, or any later version.
  */
 
+#define _GNU_SOURCE
 #include <stdio.h>
 #include <math.h>
 #include <ctype.h>
 #include <time.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <unistd.h>
 
 #include "debug.h"
 #include "prio.h"
 #include "structs.h"
-#include "../checkers/libsg.h"
+#include "util.h"
 
 #define pp_pl_log(prio, fmt, args...) condlog(prio, "path_latency prio: " fmt, ##args)
 
 #define MAX_IO_NUM             200
-#define MIN_IO_NUM             2
+#define MIN_IO_NUM             20
+#define DEF_IO_NUM             100
 
 #define MAX_BASE_NUM           10
-#define MIN_BASE_NUM           2
+#define MIN_BASE_NUM           1.01
+#define DEF_BASE_NUM           1.5
 
 #define MAX_AVG_LATENCY                100000000.      /* Unit: us */
 #define MIN_AVG_LATENCY                1.              /* Unit: us */
 
 #define DEFAULT_PRIORITY       0
 
-#define MAX_CHAR_SIZE          30
-
 #define USEC_PER_SEC           1000000LL
 #define NSEC_PER_USEC          1000LL
 
-static long long path_latency[MAX_IO_NUM];
+#define DEF_BLK_SIZE           4096
+
+static double lg_path_latency[MAX_IO_NUM];
 
 static inline long long timeval_to_us(const struct timespec *tv)
 {
@@ -55,18 +63,75 @@ static inline long long timeval_to_us(const struct timespec *tv)
            (tv->tv_nsec / NSEC_PER_USEC);
 }
 
-static int do_readsector0(int fd, unsigned int timeout)
+/*
+ * Get fd ready for direct reads: query the block size (DEF_BLK_SIZE if
+ * the ioctl fails), allocate a page-aligned buffer of that size in *pbuf
+ * and switch O_DIRECT on if it is not set yet. *restore_flags is set to
+ * 1 only when this function turned O_DIRECT on.
+ *
+ * Returns 0 on success, -1 on failure (the buffer is freed again).
+ */
+static int prepare_directio_read(int fd, int *blksz, char **pbuf,
+               int *restore_flags)
+{
+       unsigned long align = getpagesize();
+       long fl;
+
+       if (ioctl(fd, BLKBSZGET, blksz) < 0) {
+               pp_pl_log(3,"catnnot get blocksize, set default");
+               *blksz = DEF_BLK_SIZE;
+       }
+       if (posix_memalign((void **)pbuf, align, *blksz) != 0)
+               return -1;
+
+       fl = fcntl(fd, F_GETFL);
+       if (fl < 0)
+               goto fail;
+       /* already in direct mode, nothing to restore later */
+       if (fl & O_DIRECT)
+               return 0;
+
+       if (fcntl(fd, F_SETFL, fl | O_DIRECT) < 0)
+               goto fail;
+       *restore_flags = 1;
+       return 0;
+
+fail:
+       free(*pbuf);
+       return -1;
+}
+
+/*
+ * Undo prepare_directio_read(): free the read buffer and, when
+ * restore_flags is set, clear O_DIRECT on fd again (best effort).
+ */
+static void cleanup_directio_read(int fd, char *buf, int restore_flags)
 {
-       unsigned char buf[4096];
-       unsigned char sbuf[SENSE_BUFF_LEN];
+       long flags;
+
+       free(buf);
+
+       if (!restore_flags)
+               return;
+       if ((flags = fcntl(fd, F_GETFL)) >= 0) {
+               int ret __attribute__ ((unused));
+               flags &= ~O_DIRECT;
+               /* No point in checking for errors */
+               ret = fcntl(fd, F_SETFL, flags);
+       }
+}
+
+/*
+ * Read sz bytes from the start of fd into buf, waiting at most timeout
+ * seconds for fd to become readable.
+ *
+ * Returns 0 if exactly sz bytes were read, -1 on a seek error, a
+ * select() error or timeout, or a failed or short read.
+ */
+static int do_directio_read(int fd, unsigned int timeout, char *buf, int sz)
+{
+       fd_set read_fds;
+       struct timeval tm = { .tv_sec = timeout };
        int ret;
+       int num_read;
 
-       ret = sg_read(fd, &buf[0], 4096, &sbuf[0], SENSE_BUFF_LEN, timeout);
+       if (lseek(fd, 0, SEEK_SET) == -1)
+               return -1;
+       FD_ZERO(&read_fds);
+       FD_SET(fd, &read_fds);
+       ret = select(fd+1, &read_fds, NULL, NULL, &tm);
+       if (ret <= 0)
+               return -1;
+       num_read = read(fd, buf, sz);
+       if (num_read != sz)
+               return -1;
 
-       return ret;
+       return 0;
 }
 
-int check_args_valid(int io_num, int base_num)
+int check_args_valid(int io_num, double base_num)
 {
        if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM)) {
                pp_pl_log(0, "args io_num is outside the valid range");
@@ -82,176 +147,200 @@ int check_args_valid(int io_num, int base_num)
 }
 
 /*
- * In multipath.conf, args form: io_num|base_num. For example,
- * args is "20|10", this function can get io_num value 20, and
+ * In multipath.conf, args form: io_num=n base_num=m. For example, args are
+ * "io_num=20 base_num=10", this function can get io_num value 20 and
  * base_num value 10.
  */
-static int get_ionum_and_basenum(char *args, int *ionum, int *basenum)
+static int get_ionum_and_basenum(char *args, int *ionum, double *basenum)
 {
-       char source[MAX_CHAR_SIZE];
-       char vertica = '|';
-       char *endstrbefore = NULL;
-       char *endstrafter = NULL;
-       unsigned int size = strlen(args);
+       char split_char[] = " \t";
+       char *arg, *temp;
+       char *str, *str_inval;
+       int i;
+       int flag_io = 0, flag_base = 0;
 
        if ((args == NULL) || (ionum == NULL) || (basenum == NULL)) {
                pp_pl_log(0, "args string is NULL");
                return 0;
        }
 
-       if ((size < 1) || (size > MAX_CHAR_SIZE - 1)) {
-               pp_pl_log(0, "args string's size is too long");
+       arg = temp = STRDUP(args);
+       if (!arg)
                return 0;
-       }
 
-       memcpy(source, args, size + 1);
-
-       if (!isdigit(source[0])) {
-               pp_pl_log(0, "invalid prio_args format: %s", source);
-               return 0;
-       }
-
-       *ionum = (int)strtoul(source, &endstrbefore, 10);
-       if (endstrbefore[0] != vertica) {
-               pp_pl_log(0, "invalid prio_args format: %s", source);
-               return 0;
+       /*
+        * Both settings are required; they may come in either order and
+        * are looked for in the first two whitespace separated tokens.
+        */
+       for (i = 0; i < 2; i++) {
+               str = get_next_string(&temp, split_char);
+               if (!str)
+                       goto out;
+               if (!strncmp(str, "io_num=", 7) && strlen(str) > 7) {
+                       *ionum = (int)strtoul(str + 7, &str_inval, 10);
+                       /*
+                        * parsing starts after the "io_num=" prefix, so
+                        * "no digits consumed" leaves the end pointer at
+                        * str + 7, not at str
+                        */
+                       if (str + 7 == str_inval)
+                               goto out;
+                       flag_io = 1;
+               }
+               else if (!strncmp(str, "base_num=", 9) && strlen(str) > 9) {
+                       *basenum = strtod(str + 9, &str_inval);
+                       /* same as above, the value starts at str + 9 */
+                       if (str + 9 == str_inval)
+                               goto out;
+                       flag_base = 1;
+               }
        }
 
-       if (!isdigit(endstrbefore[1])) {
-               pp_pl_log(0, "invalid prio_args format: %s", source);
-               return 0;
-       }
-
-       *basenum = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
-       if (check_args_valid(*ionum, *basenum) == 0) {
-               return 0;
-       }
+       if (!flag_io || !flag_base)
+               goto out;
+       if (check_args_valid(*ionum, *basenum) == 0)
+               goto out;
 
+       FREE(arg);
        return 1;
+out:
+       FREE(arg);
+       return 0;
 }
 
-long long calc_standard_deviation(long long *path_latency, int size,
-                                 long long avglatency)
+/*
+ * Standard deviation of the first size entries of lg_path_latency around
+ * lg_avglatency. The squared differences are divided by (size - 1), so
+ * size has to be greater than 1 for a meaningful result.
+ */
+double calc_standard_deviation(double *lg_path_latency, int size,
+                                 double lg_avglatency)
 {
        int index;
-       long long total = 0;
+       double sum = 0;
 
        for (index = 0; index < size; index++) {
-               total +=
-                   (path_latency[index] - avglatency) * (path_latency[index] -
-                                                         avglatency);
+               sum += (lg_path_latency[index] - lg_avglatency) *
+                       (lg_path_latency[index] - lg_avglatency);
        }
 
-       total /= (size - 1);
+       sum /= (size - 1);
 
-       return (long long)sqrt((double)total);
+       return sqrt(sum);
 }
 
-int calcPrio(double avglatency, double max_avglatency, double min_avglatency,
-            double base_num)
+/*
+ * Map the logarithm of the average latency to a priority: the lower the
+ * latency, the higher the priority. Latencies at or below the minimum
+ * get the highest value (lg_maxavglatency - lg_minavglatency), latencies
+ * at or above the maximum get 0. The result is converted to int on
+ * return.
+ *
+ * Do not scale the priority in a certain range such as [0, 1024]
+ * because scaling will eliminate the effect of base_num.
+ */
+int calcPrio(double lg_avglatency, double lg_maxavglatency,
+               double lg_minavglatency)
 {
-       double lavglatency = log(avglatency) / log(base_num);
-       double lmax_avglatency = log(max_avglatency) / log(base_num);
-       double lmin_avglatency = log(min_avglatency) / log(base_num);
-
-       if (lavglatency <= lmin_avglatency)
-               return (int)(lmax_avglatency + 1.);
+       if (lg_avglatency <= lg_minavglatency)
+               return lg_maxavglatency - lg_minavglatency;
 
-       if (lavglatency > lmax_avglatency)
+       if (lg_avglatency >= lg_maxavglatency)
                return 0;
 
-       return (int)(lmax_avglatency - lavglatency + 1.);
-}
-
-/* Calc the latency interval corresponding to the average latency */
-long long calc_latency_interval(double avglatency, double max_avglatency,
-                               double min_avglatency, double base_num)
-{
-       double lavglatency = log(avglatency) / log(base_num);
-       double lmax_avglatency = log(max_avglatency) / log(base_num);
-       double lmin_avglatency = log(min_avglatency) / log(base_num);
-
-       if ((lavglatency <= lmin_avglatency)
-           || (lavglatency > lmax_avglatency))
-               return 0;       /* Invalid value */
-
-       if ((double)((int)lavglatency) == lavglatency)
-               return (long long)(avglatency - (avglatency / base_num));
-       else
-               return (long long)(pow(base_num, (double)((int)lavglatency + 1))
-                                  - pow(base_num, (double)((int)lavglatency)));
+       return lg_maxavglatency - lg_avglatency;
 }
 
+/*
+ * Compute the priority of a path from the latency of io_num direct
+ * reads of its first block.
+ *
+ * args carries "io_num=n base_num=m"; DEF_IO_NUM and DEF_BASE_NUM are
+ * used if it cannot be parsed. timeout is the per-read timeout in
+ * seconds.
+ *
+ * Returns -1 if the path has no open fd, direct IO cannot be set up or a
+ * read fails, DEFAULT_PRIORITY if the average latency exceeds the upper
+ * bound, and the value of calcPrio() otherwise.
+ */
 int getprio(struct path *pp, char *args, unsigned int timeout)
 {
        int rc, temp;
        int index = 0;
-       int io_num;
-       int base_num;
-       long long avglatency;
-       long long latency_interval;
-       long long standard_deviation;
-       long long toldelay = 0;
+       int io_num = 0;
+       double base_num = 0;
+       double lg_avglatency, lg_maxavglatency, lg_minavglatency;
+       double standard_deviation;
+       double lg_toldelay = 0;
        long long before, after;
        struct timespec tv;
+       int blksize;
+       char *buf;
+       int restore_flags = 0;
+       double lg_base;
+       long long sum_latency = 0;
+       long long arith_mean_lat;
 
        if (pp->fd < 0)
                return -1;
 
        if (get_ionum_and_basenum(args, &io_num, &base_num) == 0) {
-               pp_pl_log(0, "%s: get path_latency args fail", pp->dev);
-               return DEFAULT_PRIORITY;
+               io_num = DEF_IO_NUM;
+               base_num = DEF_BASE_NUM;
+               pp_pl_log(0, "%s: fails to get path_latency args, set default:"
+                               "io_num=%d base_num=%.3lf",
+                               pp->dev, io_num, base_num);
        }
 
-       memset(path_latency, 0, sizeof(path_latency));
+       memset(lg_path_latency, 0, sizeof(lg_path_latency));
+       lg_base = log(base_num);
+       lg_maxavglatency = log(MAX_AVG_LATENCY) / lg_base;
+       lg_minavglatency = log(MIN_AVG_LATENCY) / lg_base;
+
+       /*
+        * On failure buf is not usable (never allocated, or already
+        * freed), so it must neither be read into nor freed again.
+        */
+       if (prepare_directio_read(pp->fd, &blksize, &buf, &restore_flags) < 0) {
+               pp_pl_log(0, "%s: cannot prepare direct IO read", pp->dev);
+               return -1;
+       }
 
        temp = io_num;
        while (temp-- > 0) {
                (void)clock_gettime(CLOCK_MONOTONIC, &tv);
                before = timeval_to_us(&tv);
 
-               if (do_readsector0(pp->fd, timeout) == 2) {
+               if (do_directio_read(pp->fd, timeout, buf, blksize)) {
                        pp_pl_log(0, "%s: path down", pp->dev);
+                       cleanup_directio_read(pp->fd, buf, restore_flags);
                        return -1;
                }
 
                (void)clock_gettime(CLOCK_MONOTONIC, &tv);
                after = timeval_to_us(&tv);
-
-               path_latency[index] = after - before;
-               toldelay += path_latency[index++];
+               /*
+                * We assume that the latency complies with Log-normal
+                * distribution. The logarithm of latency is in normal
+                * distribution.
+                */
+               lg_path_latency[index] = log(after - before) / lg_base;
+               lg_toldelay += lg_path_latency[index++];
+               sum_latency += after - before;
        }
 
-       avglatency = toldelay / (long long)io_num;
-       pp_pl_log(4, "%s: average latency is (%lld us)", pp->dev, avglatency);
+       cleanup_directio_read(pp->fd, buf, restore_flags);
+
+       lg_avglatency = lg_toldelay / (long long)io_num;
+       arith_mean_lat = sum_latency / (long long)io_num;
+       pp_pl_log(4, "%s: arithmetic mean latency is (%lld us), geometric mean latency is (%lld us)",
+                       pp->dev, arith_mean_lat,
+                       (long long)pow(base_num, lg_avglatency));
 
-       if (avglatency > MAX_AVG_LATENCY) {
+       if (lg_avglatency > lg_maxavglatency) {
                pp_pl_log(0,
                          "%s: average latency (%lld us) is outside the thresold (%lld us)",
-                         pp->dev, avglatency, (long long)MAX_AVG_LATENCY);
+                         pp->dev, (long long)pow(base_num, lg_avglatency),
+                         (long long)MAX_AVG_LATENCY);
                return DEFAULT_PRIORITY;
        }
 
+       standard_deviation = calc_standard_deviation(lg_path_latency,
+                       index, lg_avglatency);
        /*
-        * Min average latency and max average latency are constant, the args
-        * base_num set can change latency_interval value corresponding to
-        * avglatency and is not constant.
-        * Warn the user if latency_interval is smaller than (2 * standard_deviation),
-        * or equal.
+        * In calcPrio(), we let prio y = f(x) = log(max, base) - log (x, base);
+        * So if we want to let the priority of the latency outside 2 standard
+        * deviations can be distinguished from the latency inside 2 standard
+        * deviation, in others words at most 95% are the same and at least 5%
+        * are different according interval estimation of normal distribution,
+        * we should warn the user to set the base_num to be smaller if the
+        * log(x_threshold, base) is small than 2 standard deviation.
+        * x_threshold is derived from:
+        * y + 1 = f(x) + 1 = f(x) + log(base, base), so x_threadshold =
+        * base_num; Note that we only can compare the logarithm of x_threshold
+        * with the standard deviation because the standard deviation is derived
+        * from logarithm of latency.
+        *
+        * therefore , we recommend the base_num to meet the condition :
+        * 1 <= 2 * standard_deviation
         */
-       standard_deviation =
-           calc_standard_deviation(path_latency, index, avglatency);
-       latency_interval =
-           calc_latency_interval(avglatency, MAX_AVG_LATENCY, MIN_AVG_LATENCY,
-                                 base_num);
-       if ((latency_interval != 0)
-           && (latency_interval <= (2 * standard_deviation)))
-               pp_pl_log(3,
-                         "%s: latency interval (%lld) according to average latency (%lld us) is smaller than "
-                         "2 * standard deviation (%lld us), or equal, args base_num (%d) needs to be set bigger value",
-                         pp->dev, latency_interval, avglatency,
-                         standard_deviation, base_num);
-
-       rc = calcPrio(avglatency, MAX_AVG_LATENCY, MIN_AVG_LATENCY, base_num);
+       pp_pl_log(5, "%s: standard deviation for logarithm of latency = %.6f",
+                       pp->dev, standard_deviation);
+       if (standard_deviation <= 0.5)
+               pp_pl_log(3, "%s: the base_num(%.3lf) is too big to distinguish different priority "
+                         "of two far-away latency. It is recommend to be set smaller",
+                         pp->dev, base_num);
+       /*
+        * If the standard deviation is too large , we should also warn the user
+        */
+
+       if (standard_deviation > 4)
+               pp_pl_log(3, "%s: the base_num(%.3lf) is too small to avoid noise disturbance "
+                         ".It is recommend to be set larger",
+                         pp->dev, base_num);
+
+
+       rc = calcPrio(lg_avglatency, lg_maxavglatency, lg_minavglatency);
+
        return rc;
 }
index 34a43a81a77c1f3d60fd078867cde664e79a4c8b..e0f3efbb5c65cc230c3b241511b110c3e5c2cac5 100644 (file)
 #include <regex.h>
 #include "structs_vec.h"
 #include "print.h"
-
-char *get_next_string(char **temp, char *split_char)
-{
-       char *token = NULL;
-       token = strsep(temp, split_char);
-       while (token != NULL && !strcmp(token, ""))
-               token = strsep(temp, split_char);
-       return token;
-}
+#include "util.h"
 
 #define CHECK_LEN \
 do { \
index 00adc0da4cb1267832369262402f0215c8508b7e..0d29ed2893894704d59876a493e14614c2ade96e 100644 (file)
@@ -367,16 +367,42 @@ out:
        return 0;
 }
 
+/*
+ * Current RDAC (NetApp E-Series) firmware relies
+ * on periodic REPORT TARGET PORT GROUPS for
+ * internal load balancing.
+ * Using the sysfs priority checker defeats this purpose.
+ *
+ * Moreover, NetApp would also prefer the RDAC checker over ALUA.
+ * (https://www.redhat.com/archives/dm-devel/2017-September/msg00326.html)
+ *
+ * Returns 1 if bytes 4..7 of the data read for page 0xC9 are "vac1",
+ * 0 if they differ or if nothing could be read.
+ */
+static int
+check_rdac(struct path * pp)
+{
+       int len;
+       char buff[44];
+
+       /* presumably reads VPD page 0xC9 through pp->fd - confirm in get_vpd_sgio() */
+       len = get_vpd_sgio(pp->fd, 0xC9, buff, 44);
+       /* a non-positive length is treated as "not an RDAC device" */
+       if (len <= 0)
+               return 0;
+       /* memcmp() yields 0 on a match, hence the negation */
+       return !(memcmp(buff + 4, "vac1", 4));
+}
+
 int select_checker(struct config *conf, struct path *pp)
 {
        char *origin, *checker_name;
        struct checker * c = &pp->checker;
 
-       if (pp->detect_checker == DETECT_CHECKER_ON && pp->tpgs > 0) {
-               checker_name = TUR;
+       if (pp->detect_checker == DETECT_CHECKER_ON) {
                origin = "(setting: storage device autodetected)";
-               goto out;
-       }
+               if (check_rdac(pp)) {
+                       checker_name = RDAC;
+                       goto out;
+               } else if (pp->tpgs > 0) {
+                       checker_name = TUR;
+                       goto out;
+               }
+       }
        do_set(checker_name, conf->overrides, checker_name, "(setting: multipath.conf overrides section)");
        do_set(checker_name, pp->hwe, checker_name, "(setting: storage device configuration)");
        do_set(checker_name, conf, checker_name, "(setting: multipath.conf defaults/devices section)");
@@ -427,24 +453,6 @@ out:
        return 0;
 }
 
-/*
- * Current RDAC (NetApp E-Series) firmware relies
- * on periodic REPORT TARGET PORT GROUPS for
- * internal load balancing.
- * Using the sysfs priority checker defeats this purpose.
- */
-static int
-check_rdac(struct path * pp)
-{
-       int len;
-       char buff[44];
-
-       len = get_vpd_sgio(pp->fd, 0xC9, buff, 44);
-       if (len <= 0)
-               return 0;
-       return !(memcmp(buff + 4, "vac1", 4));
-}
-
 void
 detect_prio(struct config *conf, struct path * pp)
 {
@@ -754,51 +762,71 @@ out:
        return 0;
 
 }
-int select_san_path_err_threshold(struct config *conf, struct multipath *mp)
+
+int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp)
 {
        char *origin, buff[12];
 
-       mp_set_mpe(san_path_err_threshold);
-       mp_set_ovr(san_path_err_threshold);
-       mp_set_hwe(san_path_err_threshold);
-       mp_set_conf(san_path_err_threshold);
-       mp_set_default(san_path_err_threshold, DEFAULT_ERR_CHECKS);
+       mp_set_mpe(marginal_path_err_sample_time);
+       mp_set_ovr(marginal_path_err_sample_time);
+       mp_set_hwe(marginal_path_err_sample_time);
+       mp_set_conf(marginal_path_err_sample_time);
+       mp_set_default(marginal_path_err_sample_time, DEFAULT_ERR_CHECKS);
 out:
-       print_off_int_undef(buff, 12, &mp->san_path_err_threshold);
-       condlog(3, "%s: san_path_err_threshold = %s %s", mp->alias, buff, origin);
+       print_off_int_undef(buff, 12, &mp->marginal_path_err_sample_time);
+       condlog(3, "%s: marginal_path_err_sample_time = %s %s", mp->alias, buff,
+                       origin);
        return 0;
 }
 
-int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp)
+int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp)
 {
        char *origin, buff[12];
 
-       mp_set_mpe(san_path_err_forget_rate);
-       mp_set_ovr(san_path_err_forget_rate);
-       mp_set_hwe(san_path_err_forget_rate);
-       mp_set_conf(san_path_err_forget_rate);
-       mp_set_default(san_path_err_forget_rate, DEFAULT_ERR_CHECKS);
+       mp_set_mpe(marginal_path_err_rate_threshold);
+       mp_set_ovr(marginal_path_err_rate_threshold);
+       mp_set_hwe(marginal_path_err_rate_threshold);
+       mp_set_conf(marginal_path_err_rate_threshold);
+       mp_set_default(marginal_path_err_rate_threshold, DEFAULT_ERR_CHECKS);
 out:
-       print_off_int_undef(buff, 12, &mp->san_path_err_forget_rate);
-       condlog(3, "%s: san_path_err_forget_rate = %s %s", mp->alias, buff, origin);
+       print_off_int_undef(buff, 12, &mp->marginal_path_err_rate_threshold);
+       condlog(3, "%s: marginal_path_err_rate_threshold = %s %s", mp->alias, buff,
+                       origin);
        return 0;
-
 }
-int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp)
+
+int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp)
 {
        char *origin, buff[12];
 
-       mp_set_mpe(san_path_err_recovery_time);
-       mp_set_ovr(san_path_err_recovery_time);
-       mp_set_hwe(san_path_err_recovery_time);
-       mp_set_conf(san_path_err_recovery_time);
-       mp_set_default(san_path_err_recovery_time, DEFAULT_ERR_CHECKS);
+       mp_set_mpe(marginal_path_err_recheck_gap_time);
+       mp_set_ovr(marginal_path_err_recheck_gap_time);
+       mp_set_hwe(marginal_path_err_recheck_gap_time);
+       mp_set_conf(marginal_path_err_recheck_gap_time);
+       mp_set_default(marginal_path_err_recheck_gap_time, DEFAULT_ERR_CHECKS);
 out:
-       print_off_int_undef(buff, 12, &mp->san_path_err_recovery_time);
-       condlog(3, "%s: san_path_err_recovery_time = %s %s", mp->alias, buff, origin);
+       print_off_int_undef(buff, 12, &mp->marginal_path_err_recheck_gap_time);
+       condlog(3, "%s: marginal_path_err_recheck_gap_time = %s %s", mp->alias, buff,
+                       origin);
        return 0;
+}
 
+/*
+ * Select the marginal_path_double_failed_time setting for a multipath map
+ * and log the chosen value together with its origin at verbosity 3.
+ *
+ * The mp_set_* macros are tried in this order: mpe, ovr, hwe, conf, and
+ * finally the DEFAULT_ERR_CHECKS default.
+ * NOTE(review): presumably each macro assigns the field and "origin" and
+ * jumps to "out" as soon as one source provides a value - confirm against
+ * the macro definitions.
+ *
+ * Always returns 0.
+ */
+int select_marginal_path_double_failed_time(struct config *conf, struct multipath *mp)
+{
+       char *origin, buff[12];
+
+       mp_set_mpe(marginal_path_double_failed_time);
+       mp_set_ovr(marginal_path_double_failed_time);
+       mp_set_hwe(marginal_path_double_failed_time);
+       mp_set_conf(marginal_path_double_failed_time);
+       mp_set_default(marginal_path_double_failed_time, DEFAULT_ERR_CHECKS);
+out:
+       /* render the selected value into buff for the log line below */
+       print_off_int_undef(buff, 12, &mp->marginal_path_double_failed_time);
+       condlog(3, "%s: marginal_path_double_failed_time = %s %s", mp->alias, buff,
+                       origin);
+       return 0;
 }
+
 int select_skip_kpartx (struct config *conf, struct multipath * mp)
 {
        char *origin;
index f8e96d856b0fc1f2f43099692915e27d04a8d88d..347cb321ac66e65958b09177c8d4e0f5eb2e6c75 100644 (file)
@@ -25,9 +25,10 @@ int select_delay_watch_checks (struct config *conf, struct multipath * mp);
 int select_delay_wait_checks (struct config *conf, struct multipath * mp);
 int select_skip_kpartx (struct config *conf, struct multipath * mp);
 int select_max_sectors_kb (struct config *conf, struct multipath * mp);
-int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp);
-int select_san_path_err_threshold(struct config *conf, struct multipath *mp);
-int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp);
+int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp);
+int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp);
+int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp);
+int select_marginal_path_double_failed_time(struct config *conf, struct multipath *mp);
 void reconcile_features_with_options(const char *id, char **features,
                                     int* no_path_retry,
                                     int *retain_hwhandler);
index 828e7907073a96b81e7a125a7482bca235473b46..3e057f503fa8bf6831652e5ffeed5a5a4f3bb1b0 100644 (file)
@@ -99,6 +99,7 @@ alloc_path (void)
                pp->fd = -1;
                pp->tpgs = TPGS_UNDEF;
                pp->priority = PRIO_UNDEF;
+               checker_clear(&pp->checker);
        }
        return pp;
 }
index f06824a408b5c0fd6ce63d89f0f004e67a9fe3cd..c2cf3fb9a60bc63ecd3ce0f00f4d2bb6a4f33131 100644 (file)
@@ -240,10 +240,10 @@ struct path {
        int initialized;
        int retriggers;
        int wwid_changed;
-       unsigned int path_failures;
-       time_t dis_reinstate_time;
-       int disable_reinstate;
-       int san_path_err_forget_rate;
+       time_t io_err_dis_reinstate_time;
+       int io_err_disable_reinstate;
+       int io_err_pathfail_cnt;
+       int io_err_pathfail_starttime;
        /* configlet pointers */
        struct hwentry * hwe;
 };
@@ -275,9 +275,10 @@ struct multipath {
        int deferred_remove;
        int delay_watch_checks;
        int delay_wait_checks;
-       int san_path_err_threshold;
-       int san_path_err_forget_rate;
-       int san_path_err_recovery_time;
+       int marginal_path_err_sample_time;
+       int marginal_path_err_rate_threshold;
+       int marginal_path_err_recheck_gap_time;
+       int marginal_path_double_failed_time;
        int skip_kpartx;
        int max_sectors_kb;
        int force_readonly;
index 0cbcc5960fd7bb4c29cd8d9acaaeb1c28ac07d35..80bf1dd15372371ff837cc06e818c6e9bf2612e5 100644 (file)
@@ -922,3 +922,35 @@ char *uevent_get_dm_name(struct uevent *uev)
        }
        return p;
 }
+
+/*
+ * Return a newly allocated copy of the value of the DM_PATH variable in
+ * the uevent environment, or NULL if the variable is absent, has an
+ * empty value, or the allocation fails.  The caller owns the returned
+ * string and releases it with FREE().
+ */
+char *uevent_get_dm_path(struct uevent *uev)
+{
+       char *p = NULL;
+       int i;
+
+       for (i = 0; uev->envp[i] != NULL; i++) {
+               /*
+                * Compare the whole "DM_PATH=" key including the '=' so
+                * that a longer key sharing the prefix is not mistaken
+                * for DM_PATH; strlen() > 8 rejects an empty value.
+                */
+               if (!strncmp(uev->envp[i], "DM_PATH=", 8) &&
+                   strlen(uev->envp[i]) > 8) {
+                       p = MALLOC(strlen(uev->envp[i] + 8) + 1);
+                       /* allocation may fail: never strcpy() into NULL */
+                       if (p)
+                               strcpy(p, uev->envp[i] + 8);
+                       break;
+               }
+       }
+       return p;
+}
+
+/*
+ * Return a newly allocated copy of the value of the DM_ACTION variable in
+ * the uevent environment, or NULL if the variable is absent, has an
+ * empty value, or the allocation fails.  The caller owns the returned
+ * string and releases it with FREE().
+ */
+char *uevent_get_dm_action(struct uevent *uev)
+{
+       char *p = NULL;
+       int i;
+
+       for (i = 0; uev->envp[i] != NULL; i++) {
+               /*
+                * Compare the whole "DM_ACTION=" key including the '=' so
+                * that a longer key sharing the prefix is not mistaken
+                * for DM_ACTION; strlen() > 10 rejects an empty value.
+                */
+               if (!strncmp(uev->envp[i], "DM_ACTION=", 10) &&
+                   strlen(uev->envp[i]) > 10) {
+                       p = MALLOC(strlen(uev->envp[i] + 10) + 1);
+                       /* allocation may fail: never strcpy() into NULL */
+                       if (p)
+                               strcpy(p, uev->envp[i] + 10);
+                       break;
+               }
+       }
+       return p;
+}
index 61a420714b489793d777d5aecfdf620e7ed3712e..6f5af0af64786aff564343c8a9626b8879696a5f 100644 (file)
@@ -37,5 +37,7 @@ int uevent_get_major(struct uevent *uev);
 int uevent_get_minor(struct uevent *uev);
 int uevent_get_disk_ro(struct uevent *uev);
 char *uevent_get_dm_name(struct uevent *uev);
+char *uevent_get_dm_path(struct uevent *uev);
+char *uevent_get_dm_action(struct uevent *uev);
 
 #endif /* _UEVENT_H */
index 0800da527f0da7ce9f3a8d017ffb0e04ef8efff4..0b43d29d1236139f503e05b47a488dc5a90e911a 100644 (file)
@@ -65,6 +65,15 @@ filepresent (char * run) {
        return 0;
 }
 
+/*
+ * Return the next non-empty token of *temp, where tokens are separated
+ * by any character contained in split_char.
+ *
+ * Built on strsep(): the string *temp points to is modified in place
+ * (delimiters are overwritten with '\0') and *temp is advanced past the
+ * returned token.  Returns NULL once no non-empty token is left.
+ */
+char *get_next_string(char **temp, char *split_char)
+{
+       char *token = NULL;
+       token = strsep(temp, split_char);
+       /* skip the empty tokens produced by consecutive delimiters */
+       while (token != NULL && !strcmp(token, ""))
+               token = strsep(temp, split_char);
+       return token;
+}
+
 int
 get_word (char * sentence, char ** word)
 {
index 3dc048e2c0b86a38a8b243e9e69ec165c66d3a7e..51a6d542bfa6dbb5ddb5a4cfca34137b0be26b3b 100644 (file)
@@ -7,6 +7,7 @@
 size_t strchop(char *);
 int basenamecpy (const char * src, char * dst, int);
 int filepresent (char * run);
+char *get_next_string(char **temp, char *split_char);
 int get_word (char * sentence, char ** word);
 size_t strlcpy(char *dst, const char *src, size_t size);
 size_t strlcat(char *dst, const char *src, size_t size);
index 0a0da9eb09b33698cf45880f7e165578612ad282..ca628b2e8294a9e1e327c84488fa0a36f727083a 100644 (file)
@@ -20,8 +20,8 @@
 #ifndef _VERSION_H
 #define _VERSION_H
 
-#define VERSION_CODE 0x000703
-#define DATE_CODE    0x090514
+#define VERSION_CODE 0x000704
+#define DATE_CODE    0x0b0f11
 
 #define PROG    "multipath-tools"
 
index 5b6dde71491e2bfabf324b8bd7e9792cb2b82816..36551b47ba2f3be2ce2ba0ca53c149dba7213aee 100644 (file)
@@ -351,7 +351,7 @@ these values can be looked up through sysfs or by running \fImultipathd show pat
 .RE
 .TP 12
 .I path_latency
-Needs a value of the form \fI"<io_num>|<base_num>"\fR
+Needs a value of the form "io_num=\fI<20>\fR base_num=\fI<10>\fR"
 .RS
 .TP 8
 .I io_num
@@ -653,7 +653,7 @@ seconds, or 68 years. It will be automatically adjusted to the overall
 retry interval \fIno_path_retry\fR * \fIpolling_interval\fR
 if a number of retries is given with \fIno_path_retry\fR and the
 overall retry interval is longer than the specified \fIdev_loss_tmo\fR value.
-The Linux kernel will cap this value to \fI300\fR if \fIfast_io_fail_tmo\fR
+The Linux kernel will cap this value to \fI600\fR if \fIfast_io_fail_tmo\fR
 is not set. See KNOWN ISSUES.
 .RS
 .TP
@@ -682,6 +682,17 @@ The default is: \fB/etc/multipath/wwids\fR
 .
 .
 .TP
+.B prkeys_file
+The full pathname of the prkeys file, which is used by multipathd to keep
+track of the persistent reservation key used for a specific WWID, when
+\fIreservation_key\fR is set to \fBfile\fR.
+.RS
+.TP
+The default is \fB/etc/multipath/prkeys\fR
+.RE
+.
+.
+.TP
 .B log_checker_err
 If set to
 .I once
@@ -703,6 +714,12 @@ the same as the RESERVATION KEY field of the PERSISTENT RESERVE OUT parameter
 list which contains an 8-byte value provided by the application client to the
 device server to identify the I_T nexus.
 .RS
+.PP
+Alternatively, this can be set to \fBfile\fR, which will store the RESERVATION
+KEY registered by mpathpersist in the \fIprkeys_file\fR. multipathd will then
+use this key to register additional paths as they appear.  When the
+registration is removed, the RESERVATION KEY is removed from the
+\fIprkeys_file\fR.
 .TP
 The default is: \fB<unset>\fR
 .RE
@@ -824,13 +841,14 @@ The default is: \fB/etc/multipath/conf.d/\fR
 .
 .
 .TP
-.B san_path_err_threshold
-If set to a value greater than 0, multipathd will watch paths and check how many
-times a path has been failed due to errors.If the number of failures on a particular
-path is greater then the san_path_err_threshold then the path will not  reinstante
-till san_path_err_recovery_time.These path failures should occur within a
-san_path_err_forget_rate checks, if not we will consider the path is good enough
-to reinstantate.
+.B marginal_path_double_failed_time
+One of the four parameters of supporting path check based on accounting IO
+error such as intermittent error. When a path failed event occurs twice in
+\fImarginal_path_double_failed_time\fR seconds due to an IO error and all the
+other three parameters are set, multipathd will fail the path and enqueue
+this path into a queue of which members are sent a couple of continuous
+direct reading asynchronous IOs at a fixed sample rate of 10HZ to start IO
+error accounting process.
 .RS
 .TP
 The default is: \fBno\fR
@@ -838,11 +856,21 @@ The default is: \fBno\fR
 .
 .
 .TP
-.B san_path_err_forget_rate
-If set to a value greater than 0, multipathd will check whether the path failures
-has exceeded  the san_path_err_threshold within this many checks i.e
-san_path_err_forget_rate . If so we will not reinstante the path till
-san_path_err_recovery_time.
+.B marginal_path_err_sample_time
+One of the four parameters of supporting path check based on accounting IO
+error such as intermittent error. If it is set to a value no less than 120,
+when a path fail event occurs twice in \fImarginal_path_double_failed_time\fR
+seconds due to an IO error, multipathd will fail the path and enqueue this
+path into a queue of which members are sent a couple of continuous direct
+reading asynchronous IOs at a fixed sample rate of 10HZ to start the IO
+error accounting process. The IO accounting process for the path will last
+for \fImarginal_path_err_sample_time\fR.
+If the rate of IO error on a particular path is greater than the
+\fImarginal_path_err_rate_threshold\fR, then the path will not reinstate for
+\fImarginal_path_err_recheck_gap_time\fR seconds unless there is only one
+active path. After \fImarginal_path_err_recheck_gap_time\fR expires, the path
+will be requeued for rechecking. If the checking result is good enough, the
+path will be reinstated.
 .RS
 .TP
 The default is: \fBno\fR
@@ -850,12 +878,30 @@ The default is: \fBno\fR
 .
 .
 .TP
-.B san_path_err_recovery_time
-If set to a value greater than 0, multipathd will make sure that when path failures
-has exceeded the san_path_err_threshold within san_path_err_forget_rate then the path
-will be placed in failed state for san_path_err_recovery_time duration.Once san_path_err_recovery_time
-has timeout  we will reinstante the failed path .
-san_path_err_recovery_time value should be in secs.
+.B marginal_path_err_rate_threshold
+The error rate threshold as a permillage (1/1000). One of the four parameters
+of supporting path check based on accounting IO error such as intermittent
+error. Refer to \fImarginal_path_err_sample_time\fR. If the rate of IO errors
+on a particular path is greater than this parameter, then the path will not
+reinstate for \fImarginal_path_err_recheck_gap_time\fR seconds unless there is
+only one active path.
+.RS
+.TP
+The default is: \fBno\fR
+.RE
+.
+.
+.TP
+.B marginal_path_err_recheck_gap_time
+One of the four parameters of supporting path check based on accounting IO
+error such as intermittent error. Refer to
+\fImarginal_path_err_sample_time\fR. If this parameter is set to a positive
+value, the failed path of which the IO error rate is larger than
+\fImarginal_path_err_rate_threshold\fR will be kept in failed state for
+\fImarginal_path_err_recheck_gap_time\fR seconds. When
+\fImarginal_path_err_recheck_gap_time\fR seconds expire, the path will be
+requeued for checking. If the checking result is good enough, the path will
+be reinstated, or else it will remain failed.
 .RS
 .TP
 The default is: \fBno\fR
@@ -1127,11 +1173,13 @@ are taken from the \fIdefaults\fR or \fIdevices\fR section:
 .TP
 .B deferred_remove
 .TP
-.B san_path_err_threshold
+.B marginal_path_err_sample_time
+.TP
+.B marginal_path_err_rate_threshold
 .TP
-.B san_path_err_forget_rate
+.B marginal_path_err_recheck_gap_time
 .TP
-.B san_path_err_recovery_time
+.B marginal_path_double_failed_time
 .TP
 .B delay_watch_checks
 .TP
@@ -1254,11 +1302,13 @@ section:
 .TP
 .B deferred_remove
 .TP
-.B san_path_err_threshold
+.B marginal_path_err_sample_time
 .TP
-.B san_path_err_forget_rate
+.B marginal_path_err_rate_threshold
 .TP
-.B san_path_err_recovery_time
+.B marginal_path_err_recheck_gap_time
+.TP
+.B marginal_path_double_failed_time
 .TP
 .B delay_watch_checks
 .TP
@@ -1326,11 +1376,13 @@ the values are taken from the \fIdevices\fR or \fIdefaults\fR sections:
 .TP
 .B deferred_remove
 .TP
-.B san_path_err_threshold
+.B marginal_path_err_sample_time
+.TP
+.B marginal_path_err_rate_threshold
 .TP
-.B san_path_err_forget_rate
+.B marginal_path_err_recheck_gap_time
 .TP
-.B san_path_err_recovery_time
+.B marginal_path_double_failed_time
 .TP
 .B delay_watch_checks
 .TP
index 8049da22fe3c92df800bef1478fe579eaa4ecf06..31ce923385a7b67ce2430fd2bce8b0d3893ae160 100644 (file)
@@ -84,6 +84,7 @@ int uxsock_timeout;
 #include "cli_handlers.h"
 #include "lock.h"
 #include "waiter.h"
+#include "io_err_stat.h"
 #include "wwids.h"
 #include "../third-party/valgrind/drd.h"
 
@@ -169,6 +170,7 @@ sd_notify_status(void)
        return NULL;
 }
 
+#ifdef USE_SYSTEMD
 static void do_sd_notify(enum daemon_status old_state)
 {
        /*
@@ -181,6 +183,7 @@ static void do_sd_notify(enum daemon_status old_state)
                return;
        sd_notify(0, sd_notify_status());
 }
+#endif
 
 static void config_cleanup(void *arg)
 {
@@ -1065,6 +1068,42 @@ out:
        return retval;
 }
 
+/*
+ * Entry point of path IO error accounting for dm uevents.
+ *
+ * If the uevent carries DM_ACTION=PATH_FAILED and a DM_PATH value, the
+ * matching path is looked up in vecs->pathvec under vecs->lock and handed
+ * to io_err_stat_handle_pathfail().
+ *
+ * Returns 1 if the uevent has no DM_ACTION, is not a PATH_FAILED event,
+ * or has no DM_PATH; returns 0 once the lookup has been attempted.
+ */
+static int
+uev_pathfail_check(struct uevent *uev, struct vectors *vecs)
+{
+       char *action = NULL, *devt = NULL;
+       struct path *pp;
+       int r;
+
+       action = uevent_get_dm_action(uev);
+       if (!action)
+               return 1;
+       if (strncmp(action, "PATH_FAILED", 11))
+               goto out;
+       devt = uevent_get_dm_path(uev);
+       if (!devt) {
+               condlog(3, "%s: No DM_PATH in uevent", uev->kernel);
+               goto out;
+       }
+
+       pthread_cleanup_push(cleanup_lock, &vecs->lock);
+       lock(&vecs->lock);
+       pthread_testcancel();
+       pp = find_path_by_devt(vecs->pathvec, devt);
+       /*
+        * The lookup can come back empty for a device that is not in
+        * pathvec, so only use pp when it is non-NULL.  The pp->dev
+        * access is kept inside the locked region instead of touching
+        * the path after the lock has been dropped.
+        */
+       if (pp) {
+               r = io_err_stat_handle_pathfail(pp);
+               if (r)
+                       condlog(3, "io_err_stat: %s: cannot handle pathfail uevent",
+                                       pp->dev);
+       }
+       lock_cleanup_pop(vecs->lock);
+
+       FREE(devt);
+       FREE(action);
+       return 0;
+out:
+       /* devt is still NULL on every path that reaches this label */
+       FREE(action);
+       return 1;
+}
+
 static int
 map_discovery (struct vectors * vecs)
 {
@@ -1150,6 +1189,14 @@ uev_trigger (struct uevent * uev, void * trigger_data)
        if (!strncmp(uev->kernel, "dm-", 3)) {
                if (!strncmp(uev->action, "change", 6)) {
                        r = uev_add_map(uev, vecs);
+
+                       /*
+                        * the kernel-side dm-mpath issues a PATH_FAILED event
+                        * when it encounters a path IO error, which makes it
+                        * a reasonable entry point for the path IO error
+                        * accounting process.
+                        */
+                       uev_pathfail_check(uev, vecs);
                        goto out;
                }
                if (!strncmp(uev->action, "remove", 6)) {
@@ -1507,83 +1554,6 @@ void repair_path(struct path * pp)
        LOG_MSG(1, checker_message(&pp->checker));
 }
 
-static int check_path_reinstate_state(struct path * pp) {
-       struct timespec curr_time;
-       if (!((pp->mpp->san_path_err_threshold > 0) &&
-                               (pp->mpp->san_path_err_forget_rate > 0) &&
-                               (pp->mpp->san_path_err_recovery_time >0))) {
-               return 0;
-       }
-
-       if (pp->disable_reinstate) {
-               /* If we don't know how much time has passed, automatically
-                * reinstate the path, just to be safe. Also, if there are
-                * no other usable paths, reinstate the path
-                */
-               if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 ||
-                               pp->mpp->nr_active == 0) {
-                       condlog(2, "%s : reinstating path early", pp->dev);
-                       goto reinstate_path;
-               }
-               if ((curr_time.tv_sec - pp->dis_reinstate_time ) > pp->mpp->san_path_err_recovery_time) {
-                       condlog(2,"%s : reinstate the path after err recovery time", pp->dev);
-                       goto reinstate_path;
-               }
-               return 1;
-       }
-       /* forget errors on a working path */
-       if ((pp->state == PATH_UP || pp->state == PATH_GHOST) &&
-                       pp->path_failures > 0) {
-               if (pp->san_path_err_forget_rate > 0){
-                       pp->san_path_err_forget_rate--;
-               } else {
-                       /* for every san_path_err_forget_rate number of
-                        * successful path checks decrement path_failures by 1
-                        */
-                       pp->path_failures--;
-                       pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate;
-               }
-               return 0;
-       }
-
-       /* If the path isn't recovering from a failed state, do nothing */
-       if (pp->state != PATH_DOWN && pp->state != PATH_SHAKY &&
-                       pp->state != PATH_TIMEOUT)
-               return 0;
-
-       if (pp->path_failures == 0)
-               pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate;
-
-       pp->path_failures++;
-
-       /* if we don't know the currently time, we don't know how long to
-        * delay the path, so there's no point in checking if we should
-        */
-
-       if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
-               return 0;
-       /* when path failures has exceeded the san_path_err_threshold
-        * place the path in delayed state till san_path_err_recovery_time
-        * so that the cutomer can rectify the issue within this time. After
-        * the completion of san_path_err_recovery_time it should
-        * automatically reinstate the path
-        */
-       if (pp->path_failures > pp->mpp->san_path_err_threshold) {
-               condlog(2, "%s : hit error threshold. Delaying path reinstatement", pp->dev);
-               pp->dis_reinstate_time = curr_time.tv_sec;
-               pp->disable_reinstate = 1;
-               return 1;
-       } else {
-               return 0;
-       }
-
-reinstate_path:
-       pp->path_failures = 0;
-       pp->disable_reinstate = 0;
-       pp->san_path_err_forget_rate = 0;
-       return 0;
-}
-
 /*
  * Returns '1' if the path has been checked, '-1' if it was blacklisted
  * and '0' otherwise
@@ -1697,9 +1667,13 @@ check_path (struct vectors * vecs, struct path * pp, int ticks)
        if (!pp->mpp)
                return 0;
 
-       if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
-                       check_path_reinstate_state(pp)) {
-               pp->state = PATH_DELAYED;
+       if (pp->io_err_disable_reinstate && hit_io_err_recheck_time(pp)) {
+               pp->state = PATH_SHAKY;
+               /*
+                * to reschedule as soon as possible, so that this path can
+                * be recovered in time
+                */
+               pp->tick = 1;
                return 1;
        }
 
@@ -2396,6 +2370,7 @@ child (void * param)
        setup_thread_attr(&misc_attr, 64 * 1024, 0);
        setup_thread_attr(&uevent_attr, DEFAULT_UEVENT_STACKSIZE * 1024, 0);
        setup_thread_attr(&waiter_attr, 32 * 1024, 1);
+       setup_thread_attr(&io_err_stat_attr, 32 * 1024, 1);
 
        if (logsink == 1) {
                setup_thread_attr(&log_attr, 64 * 1024, 0);
@@ -2518,6 +2493,10 @@ child (void * param)
        /*
         * start threads
         */
+       rc = start_io_err_stat_thread(vecs);
+       if (rc)
+               goto failed;
+
        if ((rc = pthread_create(&check_thr, &misc_attr, checkerloop, vecs))) {
                condlog(0,"failed to create checker loop thread: %d", rc);
                goto failed;
@@ -2567,6 +2546,8 @@ child (void * param)
        remove_maps_and_stop_waiters(vecs);
        unlock(&vecs->lock);
 
+       stop_io_err_stat_thread();
+
        pthread_cancel(check_thr);
        pthread_cancel(uevent_thr);
        pthread_cancel(uxlsnr_thr);
@@ -2612,6 +2593,7 @@ child (void * param)
        udev_unref(udev);
        udev = NULL;
        pthread_attr_destroy(&waiter_attr);
+       pthread_attr_destroy(&io_err_stat_attr);
 #ifdef _DEBUG_
        dbg_free_final(NULL);
 #endif
index 2615728522fff1edb3a9dcae7b438f218080f289..5c96680c0514749d1865801a26d637f3f0c22fb7 100644 (file)
@@ -247,6 +247,22 @@ Disable persistent reservation management on $map.
 Get the current persistent reservation management status of $map.
 .
 .TP
+.B map|multipath $map getprkey
+Get the current persistent reservation key associated with $map.
+.
+.TP
+.B map|multipath $map setprkey key $key
+Set the persistent reservation key associated with $map to $key in the
+\fIprkeys_file\fR. This key will only be used by multipathd if
+\fIreservation_key\fR is set to \fBfile\fR in \fI/etc/multipath.conf\fR.
+.
+.TP
+.B map|multipath $map unsetprkey
+Remove the persistent reservation key associated with $map from the
+\fIprkeys_file\fR. This will only unset the key used by multipathd if
+\fIreservation_key\fR is set to \fBfile\fR in \fI/etc/multipath.conf\fR.
+.
+.TP
 .B quit|exit
 End interactive session.
 .