From 8a7322d7177d87d51690d29bd2bbc643855e33c3 Mon Sep 17 00:00:00 2001 From: DongHun Kwak Date: Fri, 14 Jan 2022 13:50:17 +0900 Subject: [PATCH] Imported Upstream version 0.7.4 --- libdmmp/Makefile | 7 +- libmpathpersist/mpath_persist.c | 5 +- libmultipath/Makefile | 5 +- libmultipath/checkers.c | 10 +- libmultipath/checkers.h | 3 +- libmultipath/checkers/cciss_tur.c | 2 +- libmultipath/config.c | 3 - libmultipath/config.h | 21 +- libmultipath/configure.c | 7 +- libmultipath/dict.c | 88 ++-- libmultipath/discovery.c | 2 +- libmultipath/hwtable.c | 30 +- libmultipath/io_err_stat.c | 743 +++++++++++++++++++++++++++++++ libmultipath/io_err_stat.h | 15 + libmultipath/prioritizers/path_latency.c | 321 ++++++++----- libmultipath/prioritizers/weightedpath.c | 10 +- libmultipath/propsel.c | 122 +++-- libmultipath/propsel.h | 7 +- libmultipath/structs.c | 1 + libmultipath/structs.h | 15 +- libmultipath/uevent.c | 32 ++ libmultipath/uevent.h | 2 + libmultipath/util.c | 9 + libmultipath/util.h | 1 + libmultipath/version.h | 4 +- multipath/multipath.conf.5 | 110 +++-- multipathd/main.c | 142 +++--- multipathd/multipathd.8 | 16 + 28 files changed, 1371 insertions(+), 362 deletions(-) create mode 100644 libmultipath/io_err_stat.c create mode 100644 libmultipath/io_err_stat.h diff --git a/libdmmp/Makefile b/libdmmp/Makefile index cdd26ed..6645a1a 100644 --- a/libdmmp/Makefile +++ b/libdmmp/Makefile @@ -54,6 +54,7 @@ uninstall: $(RM) $$file; \ done $(RM) $(DESTDIR)$(man3dir)/libdmmp.h* + $(RM) $(DESTDIR)$(pkgconfdir)/$(PKGFILE) clean: $(RM) core *.a *.o *.gz *.so *.so.* @@ -75,10 +76,10 @@ docs/man/$(EXTRA_MAN_FILES).gz: $(HEADERS) $(INSTALL_PROGRAM) -v -m 644 -D docs/$$file docs/man/$$file; \ done cat $(HEADERS) | \ - perl docs/doc-preclean.pl > $(TEMPFILE) - perl docs/kernel-doc -man $(TEMPFILE) | \ + perl docs/doc-preclean.pl > "$(TEMPFILE)" + perl docs/kernel-doc -man "$(TEMPFILE)" | \ perl docs/split-man.pl docs/man - -rm -f $(TEMPFILE) + -rm -f "$(TEMPFILE)" @for file in 
docs/man/*.3; do \ gzip -f $$file; \ done diff --git a/libmpathpersist/mpath_persist.c b/libmpathpersist/mpath_persist.c index b5ed556..84ab293 100644 --- a/libmpathpersist/mpath_persist.c +++ b/libmpathpersist/mpath_persist.c @@ -339,8 +339,9 @@ int mpath_persistent_reserve_out ( int fd, int rq_servact, int rq_scope, memcpy(&prkey, paramp->sa_key, 8); if (mpp->prkey_source == PRKEY_SOURCE_FILE && prkey && - ((!get_be64(mpp->reservation_key) && MPATH_PROUT_REG_SA) || - MPATH_PROUT_REG_IGN_SA)) { + ((!get_be64(mpp->reservation_key) && + rq_servact == MPATH_PROUT_REG_SA) || + rq_servact == MPATH_PROUT_REG_IGN_SA)) { memcpy(&mpp->reservation_key, paramp->sa_key, 8); if (update_prkey(alias, get_be64(mpp->reservation_key))) { condlog(0, "%s: failed to set prkey for multipathd.", diff --git a/libmultipath/Makefile b/libmultipath/Makefile index 928bc25..6447d8d 100644 --- a/libmultipath/Makefile +++ b/libmultipath/Makefile @@ -9,7 +9,7 @@ LIBS = $(DEVLIB).$(SONAME) CFLAGS += $(LIB_CFLAGS) -I$(mpathcmddir) -LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu +LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu -laio ifdef SYSTEMD CFLAGS += -DUSE_SYSTEMD=$(SYSTEMD) @@ -42,7 +42,8 @@ OBJS = memory.o parser.o vector.o devmapper.o callout.o \ pgpolicies.o debug.o defaults.o uevent.o time-util.o \ switchgroup.o uxsock.o print.o alias.o log_pthread.o \ log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \ - lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o + lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o \ + io_err_stat.o all: $(LIBS) diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c index cd6d6a3..08cdfc3 100644 --- a/libmultipath/checkers.c +++ b/libmultipath/checkers.c @@ -19,7 +19,6 @@ char *checker_state_names[] = { "timeout", "removed", "delayed", - "none", }; static LIST_HEAD(checkers); @@ -44,6 +43,7 @@ struct checker * alloc_checker (void) if (c) { 
INIT_LIST_HEAD(&c->node); c->refcount = 1; + c->fd = -1; } return c; } @@ -203,6 +203,12 @@ int checker_init (struct checker * c, void ** mpctxt_addr) return 0; } +void checker_clear (struct checker *c) +{ + memset(c, 0x0, sizeof(struct checker)); + c->fd = -1; +} + void checker_put (struct checker * dst) { struct checker * src; @@ -212,7 +218,7 @@ void checker_put (struct checker * dst) src = checker_lookup(dst->name); if (dst->free) dst->free(dst); - memset(dst, 0x0, sizeof(struct checker)); + checker_clear(dst); free_checker(src); } diff --git a/libmultipath/checkers.h b/libmultipath/checkers.h index 713399f..52154ca 100644 --- a/libmultipath/checkers.h +++ b/libmultipath/checkers.h @@ -11,7 +11,7 @@ * * PATH_WILD: * - Use: None of the checkers (returned if we don't have an fd) - * - Description: Corner case where "fd <= 0" for path fd (see checker_check()) + * - Description: Corner case where "fd < 0" for path fd (see checker_check()) * * PATH_UNCHECKED: * - Use: Only in directio checker @@ -128,6 +128,7 @@ void cleanup_checkers (void); struct checker * add_checker (char *, char *); struct checker * checker_lookup (char *); int checker_init (struct checker *, void **); +void checker_clear (struct checker *); void checker_put (struct checker *); void checker_reset (struct checker *); void checker_set_sync (struct checker *); diff --git a/libmultipath/checkers/cciss_tur.c b/libmultipath/checkers/cciss_tur.c index 9d79f96..436470c 100644 --- a/libmultipath/checkers/cciss_tur.c +++ b/libmultipath/checkers/cciss_tur.c @@ -73,7 +73,7 @@ int libcheck_check(struct checker * c) LogvolInfo_struct lvi; // logical "volume" info IOCTL_Command_struct cic; // cciss ioctl command - if ((c->fd) <= 0) { + if ((c->fd) < 0) { MSG(c,"no usable fd"); ret = -1; goto out; diff --git a/libmultipath/config.c b/libmultipath/config.c index ea2359a..eb03f0a 100644 --- a/libmultipath/config.c +++ b/libmultipath/config.c @@ -351,9 +351,6 @@ merge_hwe (struct hwentry * dst, struct hwentry * 
src) merge_num(delay_wait_checks); merge_num(skip_kpartx); merge_num(max_sectors_kb); - merge_num(san_path_err_threshold); - merge_num(san_path_err_forget_rate); - merge_num(san_path_err_recovery_time); snprintf(id, sizeof(id), "%s/%s", dst->vendor, dst->product); reconcile_features_with_options(id, &dst->features, diff --git a/libmultipath/config.h b/libmultipath/config.h index 240730b..51fe27b 100644 --- a/libmultipath/config.h +++ b/libmultipath/config.h @@ -75,9 +75,10 @@ struct hwentry { int deferred_remove; int delay_watch_checks; int delay_wait_checks; - int san_path_err_threshold; - int san_path_err_forget_rate; - int san_path_err_recovery_time; + int marginal_path_err_sample_time; + int marginal_path_err_rate_threshold; + int marginal_path_err_recheck_gap_time; + int marginal_path_double_failed_time; int skip_kpartx; int max_sectors_kb; char * bl_product; @@ -107,9 +108,10 @@ struct mpentry { int deferred_remove; int delay_watch_checks; int delay_wait_checks; - int san_path_err_threshold; - int san_path_err_forget_rate; - int san_path_err_recovery_time; + int marginal_path_err_sample_time; + int marginal_path_err_rate_threshold; + int marginal_path_err_recheck_gap_time; + int marginal_path_double_failed_time; int skip_kpartx; int max_sectors_kb; uid_t uid; @@ -156,9 +158,10 @@ struct config { int processed_main_config; int delay_watch_checks; int delay_wait_checks; - int san_path_err_threshold; - int san_path_err_forget_rate; - int san_path_err_recovery_time; + int marginal_path_err_sample_time; + int marginal_path_err_rate_threshold; + int marginal_path_err_recheck_gap_time; + int marginal_path_double_failed_time; int uxsock_timeout; int strict_timing; int retrigger_tries; diff --git a/libmultipath/configure.c b/libmultipath/configure.c index 7a3db31..09821e8 100644 --- a/libmultipath/configure.c +++ b/libmultipath/configure.c @@ -295,9 +295,10 @@ int setup_map(struct multipath *mpp, char *params, int params_size) select_deferred_remove(conf, mpp); 
select_delay_watch_checks(conf, mpp); select_delay_wait_checks(conf, mpp); - select_san_path_err_threshold(conf, mpp); - select_san_path_err_forget_rate(conf, mpp); - select_san_path_err_recovery_time(conf, mpp); + select_marginal_path_err_sample_time(conf, mpp); + select_marginal_path_err_rate_threshold(conf, mpp); + select_marginal_path_err_recheck_gap_time(conf, mpp); + select_marginal_path_double_failed_time(conf, mpp); select_skip_kpartx(conf, mpp); select_max_sectors_kb(conf, mpp); diff --git a/libmultipath/dict.c b/libmultipath/dict.c index 36cccc9..3b36e1d 100644 --- a/libmultipath/dict.c +++ b/libmultipath/dict.c @@ -1083,33 +1083,45 @@ declare_hw_handler(delay_wait_checks, set_off_int_undef) declare_hw_snprint(delay_wait_checks, print_off_int_undef) declare_mp_handler(delay_wait_checks, set_off_int_undef) declare_mp_snprint(delay_wait_checks, print_off_int_undef) -declare_def_handler(san_path_err_threshold, set_off_int_undef) -declare_def_snprint_defint(san_path_err_threshold, print_off_int_undef, +declare_def_handler(marginal_path_err_sample_time, set_off_int_undef) +declare_def_snprint_defint(marginal_path_err_sample_time, print_off_int_undef, DEFAULT_ERR_CHECKS) -declare_ovr_handler(san_path_err_threshold, set_off_int_undef) -declare_ovr_snprint(san_path_err_threshold, print_off_int_undef) -declare_hw_handler(san_path_err_threshold, set_off_int_undef) -declare_hw_snprint(san_path_err_threshold, print_off_int_undef) -declare_mp_handler(san_path_err_threshold, set_off_int_undef) -declare_mp_snprint(san_path_err_threshold, print_off_int_undef) -declare_def_handler(san_path_err_forget_rate, set_off_int_undef) -declare_def_snprint_defint(san_path_err_forget_rate, print_off_int_undef, +declare_ovr_handler(marginal_path_err_sample_time, set_off_int_undef) +declare_ovr_snprint(marginal_path_err_sample_time, print_off_int_undef) +declare_hw_handler(marginal_path_err_sample_time, set_off_int_undef) +declare_hw_snprint(marginal_path_err_sample_time, 
print_off_int_undef) +declare_mp_handler(marginal_path_err_sample_time, set_off_int_undef) +declare_mp_snprint(marginal_path_err_sample_time, print_off_int_undef) +declare_def_handler(marginal_path_err_rate_threshold, set_off_int_undef) +declare_def_snprint_defint(marginal_path_err_rate_threshold, print_off_int_undef, DEFAULT_ERR_CHECKS) -declare_ovr_handler(san_path_err_forget_rate, set_off_int_undef) -declare_ovr_snprint(san_path_err_forget_rate, print_off_int_undef) -declare_hw_handler(san_path_err_forget_rate, set_off_int_undef) -declare_hw_snprint(san_path_err_forget_rate, print_off_int_undef) -declare_mp_handler(san_path_err_forget_rate, set_off_int_undef) -declare_mp_snprint(san_path_err_forget_rate, print_off_int_undef) -declare_def_handler(san_path_err_recovery_time, set_off_int_undef) -declare_def_snprint_defint(san_path_err_recovery_time, print_off_int_undef, +declare_ovr_handler(marginal_path_err_rate_threshold, set_off_int_undef) +declare_ovr_snprint(marginal_path_err_rate_threshold, print_off_int_undef) +declare_hw_handler(marginal_path_err_rate_threshold, set_off_int_undef) +declare_hw_snprint(marginal_path_err_rate_threshold, print_off_int_undef) +declare_mp_handler(marginal_path_err_rate_threshold, set_off_int_undef) +declare_mp_snprint(marginal_path_err_rate_threshold, print_off_int_undef) +declare_def_handler(marginal_path_err_recheck_gap_time, set_off_int_undef) +declare_def_snprint_defint(marginal_path_err_recheck_gap_time, print_off_int_undef, DEFAULT_ERR_CHECKS) -declare_ovr_handler(san_path_err_recovery_time, set_off_int_undef) -declare_ovr_snprint(san_path_err_recovery_time, print_off_int_undef) -declare_hw_handler(san_path_err_recovery_time, set_off_int_undef) -declare_hw_snprint(san_path_err_recovery_time, print_off_int_undef) -declare_mp_handler(san_path_err_recovery_time, set_off_int_undef) -declare_mp_snprint(san_path_err_recovery_time, print_off_int_undef) +declare_ovr_handler(marginal_path_err_recheck_gap_time, set_off_int_undef) 
+declare_ovr_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef) +declare_hw_handler(marginal_path_err_recheck_gap_time, set_off_int_undef) +declare_hw_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef) +declare_mp_handler(marginal_path_err_recheck_gap_time, set_off_int_undef) +declare_mp_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef) +declare_def_handler(marginal_path_double_failed_time, set_off_int_undef) +declare_def_snprint_defint(marginal_path_double_failed_time, print_off_int_undef, + DEFAULT_ERR_CHECKS) +declare_ovr_handler(marginal_path_double_failed_time, set_off_int_undef) +declare_ovr_snprint(marginal_path_double_failed_time, print_off_int_undef) +declare_hw_handler(marginal_path_double_failed_time, set_off_int_undef) +declare_hw_snprint(marginal_path_double_failed_time, print_off_int_undef) +declare_mp_handler(marginal_path_double_failed_time, set_off_int_undef) +declare_mp_snprint(marginal_path_double_failed_time, print_off_int_undef) + + + static int def_uxsock_timeout_handler(struct config *conf, vector strvec) { @@ -1443,9 +1455,10 @@ init_keywords(vector keywords) install_keyword("config_dir", &def_config_dir_handler, &snprint_def_config_dir); install_keyword("delay_watch_checks", &def_delay_watch_checks_handler, &snprint_def_delay_watch_checks); install_keyword("delay_wait_checks", &def_delay_wait_checks_handler, &snprint_def_delay_wait_checks); - install_keyword("san_path_err_threshold", &def_san_path_err_threshold_handler, &snprint_def_san_path_err_threshold); - install_keyword("san_path_err_forget_rate", &def_san_path_err_forget_rate_handler, &snprint_def_san_path_err_forget_rate); - install_keyword("san_path_err_recovery_time", &def_san_path_err_recovery_time_handler, &snprint_def_san_path_err_recovery_time); + install_keyword("marginal_path_err_sample_time", &def_marginal_path_err_sample_time_handler, &snprint_def_marginal_path_err_sample_time); + install_keyword("marginal_path_err_rate_threshold", 
&def_marginal_path_err_rate_threshold_handler, &snprint_def_marginal_path_err_rate_threshold); + install_keyword("marginal_path_err_recheck_gap_time", &def_marginal_path_err_recheck_gap_time_handler, &snprint_def_marginal_path_err_recheck_gap_time); + install_keyword("marginal_path_double_failed_time", &def_marginal_path_double_failed_time_handler, &snprint_def_marginal_path_double_failed_time); install_keyword("find_multipaths", &def_find_multipaths_handler, &snprint_def_find_multipaths); install_keyword("uxsock_timeout", &def_uxsock_timeout_handler, &snprint_def_uxsock_timeout); @@ -1530,9 +1543,10 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &hw_deferred_remove_handler, &snprint_hw_deferred_remove); install_keyword("delay_watch_checks", &hw_delay_watch_checks_handler, &snprint_hw_delay_watch_checks); install_keyword("delay_wait_checks", &hw_delay_wait_checks_handler, &snprint_hw_delay_wait_checks); - install_keyword("san_path_err_threshold", &hw_san_path_err_threshold_handler, &snprint_hw_san_path_err_threshold); - install_keyword("san_path_err_forget_rate", &hw_san_path_err_forget_rate_handler, &snprint_hw_san_path_err_forget_rate); - install_keyword("san_path_err_recovery_time", &hw_san_path_err_recovery_time_handler, &snprint_hw_san_path_err_recovery_time); + install_keyword("marginal_path_err_sample_time", &hw_marginal_path_err_sample_time_handler, &snprint_hw_marginal_path_err_sample_time); + install_keyword("marginal_path_err_rate_threshold", &hw_marginal_path_err_rate_threshold_handler, &snprint_hw_marginal_path_err_rate_threshold); + install_keyword("marginal_path_err_recheck_gap_time", &hw_marginal_path_err_recheck_gap_time_handler, &snprint_hw_marginal_path_err_recheck_gap_time); + install_keyword("marginal_path_double_failed_time", &hw_marginal_path_double_failed_time_handler, &snprint_hw_marginal_path_double_failed_time); install_keyword("skip_kpartx", &hw_skip_kpartx_handler, &snprint_hw_skip_kpartx); 
install_keyword("max_sectors_kb", &hw_max_sectors_kb_handler, &snprint_hw_max_sectors_kb); install_sublevel_end(); @@ -1563,9 +1577,10 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &ovr_deferred_remove_handler, &snprint_ovr_deferred_remove); install_keyword("delay_watch_checks", &ovr_delay_watch_checks_handler, &snprint_ovr_delay_watch_checks); install_keyword("delay_wait_checks", &ovr_delay_wait_checks_handler, &snprint_ovr_delay_wait_checks); - install_keyword("san_path_err_threshold", &ovr_san_path_err_threshold_handler, &snprint_ovr_san_path_err_threshold); - install_keyword("san_path_err_forget_rate", &ovr_san_path_err_forget_rate_handler, &snprint_ovr_san_path_err_forget_rate); - install_keyword("san_path_err_recovery_time", &ovr_san_path_err_recovery_time_handler, &snprint_ovr_san_path_err_recovery_time); + install_keyword("marginal_path_err_sample_time", &ovr_marginal_path_err_sample_time_handler, &snprint_ovr_marginal_path_err_sample_time); + install_keyword("marginal_path_err_rate_threshold", &ovr_marginal_path_err_rate_threshold_handler, &snprint_ovr_marginal_path_err_rate_threshold); + install_keyword("marginal_path_err_recheck_gap_time", &ovr_marginal_path_err_recheck_gap_time_handler, &snprint_ovr_marginal_path_err_recheck_gap_time); + install_keyword("marginal_path_double_failed_time", &ovr_marginal_path_double_failed_time_handler, &snprint_ovr_marginal_path_double_failed_time); install_keyword("skip_kpartx", &ovr_skip_kpartx_handler, &snprint_ovr_skip_kpartx); install_keyword("max_sectors_kb", &ovr_max_sectors_kb_handler, &snprint_ovr_max_sectors_kb); @@ -1595,9 +1610,10 @@ init_keywords(vector keywords) install_keyword("deferred_remove", &mp_deferred_remove_handler, &snprint_mp_deferred_remove); install_keyword("delay_watch_checks", &mp_delay_watch_checks_handler, &snprint_mp_delay_watch_checks); install_keyword("delay_wait_checks", &mp_delay_wait_checks_handler, &snprint_mp_delay_wait_checks); - 
install_keyword("san_path_err_threshold", &mp_san_path_err_threshold_handler, &snprint_mp_san_path_err_threshold); - install_keyword("san_path_err_forget_rate", &mp_san_path_err_forget_rate_handler, &snprint_mp_san_path_err_forget_rate); - install_keyword("san_path_err_recovery_time", &mp_san_path_err_recovery_time_handler, &snprint_mp_san_path_err_recovery_time); + install_keyword("marginal_path_err_sample_time", &mp_marginal_path_err_sample_time_handler, &snprint_mp_marginal_path_err_sample_time); + install_keyword("marginal_path_err_rate_threshold", &mp_marginal_path_err_rate_threshold_handler, &snprint_mp_marginal_path_err_rate_threshold); + install_keyword("marginal_path_err_recheck_gap_time", &mp_marginal_path_err_recheck_gap_time_handler, &snprint_mp_marginal_path_err_recheck_gap_time); + install_keyword("marginal_path_double_failed_time", &mp_marginal_path_double_failed_time_handler, &snprint_mp_marginal_path_double_failed_time); install_keyword("skip_kpartx", &mp_skip_kpartx_handler, &snprint_mp_skip_kpartx); install_keyword("max_sectors_kb", &mp_max_sectors_kb_handler, &snprint_mp_max_sectors_kb); install_sublevel_end(); diff --git a/libmultipath/discovery.c b/libmultipath/discovery.c index efac824..cadf461 100644 --- a/libmultipath/discovery.c +++ b/libmultipath/discovery.c @@ -1573,7 +1573,7 @@ get_state (struct path * pp, struct config *conf, int daemon, int oldstate) } checker_set_fd(c, pp->fd); if (checker_init(c, pp->mpp?&pp->mpp->mpcontext:NULL)) { - memset(c, 0x0, sizeof(struct checker)); + checker_clear(c); condlog(3, "%s: checker init failed", pp->dev); return PATH_UNCHECKED; } diff --git a/libmultipath/hwtable.c b/libmultipath/hwtable.c index b018ddf..17e8ac6 100644 --- a/libmultipath/hwtable.c +++ b/libmultipath/hwtable.c @@ -568,15 +568,15 @@ static struct hwentry default_hw[] = { }, { /* XIV Storage System / FlashSystem A9000/A9000R */ - .vendor = "IBM", - .product = "2810XIV", + .vendor = "(XIV|IBM)", + .product = "(NEXTRA|2810XIV)", 
.no_path_retry = NO_PATH_RETRY_QUEUE, .pgpolicy = MULTIBUS, }, { - /* FlashSystem 710/720/810/820/840/900 */ - .vendor = "IBM", - .product = "FlashSystem", + /* TMS RamSan / FlashSystem 710/720/810/820/840/900 */ + .vendor = "(TMS|IBM)", + .product = "(RamSan|FlashSystem)", .pgpolicy = MULTIBUS, }, { @@ -940,7 +940,8 @@ static struct hwentry default_hw[] = { /* OceanStor V3 */ .vendor = "HUAWEI", .product = "XSG1", - .pgpolicy = MULTIBUS, + .pgpolicy = GROUP_BY_PRIO, + .prio_name = PRIO_ALUA, }, /* * Red Hat @@ -1063,6 +1064,13 @@ static struct hwentry default_hw[] = { .pgpolicy = MULTIBUS, .no_path_retry = 30, }, + { + /* Magnitude family */ + .vendor = "(XIOTECH|XIOtech)", + .product = "Magnitude", + .pgpolicy = MULTIBUS, + .no_path_retry = 30, + }, /* * Violin Memory */ @@ -1163,6 +1171,16 @@ static struct hwentry default_hw[] = { .no_path_retry = 30, }, /* + * AccelStor + */ + { + /* NeoSapphire */ + .vendor = "AStor", + .product = "NeoSapphire", + .pgpolicy = MULTIBUS, + .no_path_retry = 30, + }, + /* * EOL */ { diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c new file mode 100644 index 0000000..75a6df6 --- /dev/null +++ b/libmultipath/io_err_stat.c @@ -0,0 +1,743 @@ +/* + * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved. + * + * io_err_stat.c + * version 1.0 + * + * IO error stream statistic process for path failure event from kernel + * + * Author(s): Guan Junxiong 2017 + * + * This file is released under the GPL version 2, or any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vector.h" +#include "memory.h" +#include "checkers.h" +#include "config.h" +#include "structs.h" +#include "structs_vec.h" +#include "devmapper.h" +#include "debug.h" +#include "lock.h" +#include "time-util.h" +#include "io_err_stat.h" + +#define IOTIMEOUT_SEC 60 +#define TIMEOUT_NO_IO_NSEC 10000000 /*10ms = 10000000ns*/ +#define FLAKY_PATHFAIL_THRESHOLD 2 +#define CONCUR_NR_EVENT 32 + +#define PATH_IO_ERR_IN_CHECKING -1 +#define PATH_IO_ERR_IN_POLLING_RECHECK -2 + +#define io_err_stat_log(prio, fmt, args...) \ + condlog(prio, "io error statistic: " fmt, ##args) + + +struct io_err_stat_pathvec { + pthread_mutex_t mutex; + vector pathvec; +}; + +struct dio_ctx { + struct timespec io_starttime; + int blksize; + void *buf; + struct iocb io; +}; + +struct io_err_stat_path { + char devname[FILE_NAME_SIZE]; + int fd; + struct dio_ctx *dio_ctx_array; + int io_err_nr; + int io_nr; + struct timespec start_time; + + int total_time; + int err_rate_threshold; +}; + +pthread_t io_err_stat_thr; +pthread_attr_t io_err_stat_attr; + +static struct io_err_stat_pathvec *paths; +struct vectors *vecs; +io_context_t ioctx; + +static void cancel_inflight_io(struct io_err_stat_path *pp); + +static void rcu_unregister(void *param) +{ + rcu_unregister_thread(); +} + +struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char *dev) +{ + int i; + struct io_err_stat_path *pp; + + if (!pathvec) + return NULL; + vector_foreach_slot(pathvec, pp, i) + if (!strcmp(pp->devname, dev)) + return pp; + + io_err_stat_log(4, "%s: not found in check queue", dev); + + return NULL; +} + +static int init_each_dio_ctx(struct dio_ctx *ct, int blksize, + unsigned long pgsize) +{ + ct->blksize = blksize; + if (posix_memalign(&ct->buf, pgsize, blksize)) + return 1; + memset(ct->buf, 0, blksize); + ct->io_starttime.tv_sec = 0; + ct->io_starttime.tv_nsec = 0; + + return 0; +} + +static void 
deinit_each_dio_ctx(struct dio_ctx *ct) +{ + if (ct->buf) + free(ct->buf); +} + +static int setup_directio_ctx(struct io_err_stat_path *p) +{ + unsigned long pgsize = getpagesize(); + char fpath[PATH_MAX]; + int blksize = 0; + int i; + + if (snprintf(fpath, PATH_MAX, "/dev/%s", p->devname) >= PATH_MAX) + return 1; + if (p->fd < 0) + p->fd = open(fpath, O_RDONLY | O_DIRECT); + if (p->fd < 0) + return 1; + + p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT); + if (!p->dio_ctx_array) + goto fail_close; + + if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) { + io_err_stat_log(4, "%s:cannot get blocksize, set default 512", + p->devname); + blksize = 512; + } + if (!blksize) + goto free_pdctx; + + for (i = 0; i < CONCUR_NR_EVENT; i++) { + if (init_each_dio_ctx(p->dio_ctx_array + i, blksize, pgsize)) + goto deinit; + } + return 0; + +deinit: + for (i = 0; i < CONCUR_NR_EVENT; i++) + deinit_each_dio_ctx(p->dio_ctx_array + i); +free_pdctx: + FREE(p->dio_ctx_array); +fail_close: + close(p->fd); + + return 1; +} + +static void destroy_directio_ctx(struct io_err_stat_path *p) +{ + int i; + + if (!p || !p->dio_ctx_array) + return; + cancel_inflight_io(p); + + for (i = 0; i < CONCUR_NR_EVENT; i++) + deinit_each_dio_ctx(p->dio_ctx_array + i); + FREE(p->dio_ctx_array); + + if (p->fd >= 0) /* -1 marks "not open"; 0 is a valid descriptor */ + close(p->fd); +} + +static struct io_err_stat_path *alloc_io_err_stat_path(void) +{ + struct io_err_stat_path *p; + + p = (struct io_err_stat_path *)MALLOC(sizeof(*p)); + if (!p) + return NULL; + + memset(p->devname, 0, sizeof(p->devname)); + p->io_err_nr = 0; + p->io_nr = 0; + p->total_time = 0; + p->start_time.tv_sec = 0; + p->start_time.tv_nsec = 0; + p->err_rate_threshold = 0; + p->fd = -1; + + return p; +} + +static void free_io_err_stat_path(struct io_err_stat_path *p) +{ + FREE(p); +} + +static struct io_err_stat_pathvec *alloc_pathvec(void) +{ + struct io_err_stat_pathvec *p; + int r; + + p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p)); + if (!p) + return NULL; + 
p->pathvec = vector_alloc(); + if (!p->pathvec) + goto out_free_struct_pathvec; + r = pthread_mutex_init(&p->mutex, NULL); + if (r) + goto out_free_member_pathvec; + + return p; + +out_free_member_pathvec: + vector_free(p->pathvec); +out_free_struct_pathvec: + FREE(p); + return NULL; +} + +static void free_io_err_pathvec(struct io_err_stat_pathvec *p) +{ + struct io_err_stat_path *path; + int i; + + if (!p) + return; + pthread_mutex_destroy(&p->mutex); + if (p->pathvec) { /* release every queued path, then the vector itself */ + vector_foreach_slot(p->pathvec, path, i) { + destroy_directio_ctx(path); + free_io_err_stat_path(path); + } + vector_free(p->pathvec); + } + FREE(p); +} + +/* + * return value + * 0: enqueue OK + * 1: fails because of internal error + * 2: fails because of existing already + */ +static int enqueue_io_err_stat_by_path(struct path *path) +{ + struct io_err_stat_path *p; + + pthread_mutex_lock(&paths->mutex); + p = find_err_path_by_dev(paths->pathvec, path->dev); + if (p) { + pthread_mutex_unlock(&paths->mutex); + return 2; + } + pthread_mutex_unlock(&paths->mutex); + + p = alloc_io_err_stat_path(); + if (!p) + return 1; + + memcpy(p->devname, path->dev, sizeof(p->devname)); + p->total_time = path->mpp->marginal_path_err_sample_time; + p->err_rate_threshold = path->mpp->marginal_path_err_rate_threshold; + + if (setup_directio_ctx(p)) + goto free_ioerr_path; + pthread_mutex_lock(&paths->mutex); + if (!vector_alloc_slot(paths->pathvec)) + goto unlock_destroy; + vector_set_slot(paths->pathvec, p); + pthread_mutex_unlock(&paths->mutex); + + if (!path->io_err_disable_reinstate) { + /* + * fail the path in the kernel for the time of the test to make + * the test more reliable + */ + io_err_stat_log(3, "%s: fail dm path %s before checking", + path->mpp->alias, path->dev); + path->io_err_disable_reinstate = 1; + dm_fail_path(path->mpp->alias, path->dev_t); + update_queue_mode_del_path(path->mpp); + + /* + * schedule path check as soon as possible to + * update path state to delayed state + */ + path->tick = 1; + 
+ } + io_err_stat_log(2, "%s: enqueue path %s to check", + path->mpp->alias, path->dev); + return 0; + +unlock_destroy: + pthread_mutex_unlock(&paths->mutex); + destroy_directio_ctx(p); +free_ioerr_path: + free_io_err_stat_path(p); + + return 1; +} + +int io_err_stat_handle_pathfail(struct path *path) +{ + struct timespec curr_time; + int res; + + if (path->io_err_disable_reinstate) { + io_err_stat_log(3, "%s: reinstate is already disabled", + path->dev); + return 1; + } + if (path->io_err_pathfail_cnt < 0) + return 1; + + if (!path->mpp) + return 1; + if (path->mpp->nr_active <= 1) + return 1; + if (path->mpp->marginal_path_double_failed_time <= 0 || + path->mpp->marginal_path_err_sample_time <= 0 || + path->mpp->marginal_path_err_recheck_gap_time <= 0 || + path->mpp->marginal_path_err_rate_threshold < 0) { + io_err_stat_log(4, "%s: parameter not set", path->mpp->alias); + return 1; + } + if (path->mpp->marginal_path_err_sample_time < (2 * IOTIMEOUT_SEC)) { + io_err_stat_log(2, "%s: marginal_path_err_sample_time should not less than %d", + path->mpp->alias, 2 * IOTIMEOUT_SEC); + return 1; + } + /* + * The test should only be started for paths that have failed + * repeatedly in a certain time frame, so that we have reason + * to assume they're flaky. Without bother the admin to configure + * the repeated count threshold and time frame, we assume a path + * which fails at least twice within 60 seconds is flaky. 
+ */ + if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0) + return 1; + if (path->io_err_pathfail_cnt == 0) { + path->io_err_pathfail_cnt++; + path->io_err_pathfail_starttime = curr_time.tv_sec; + io_err_stat_log(5, "%s: start path flakiness pre-checking", + path->dev); + return 0; + } + if ((curr_time.tv_sec - path->io_err_pathfail_starttime) > + path->mpp->marginal_path_double_failed_time) { + path->io_err_pathfail_cnt = 0; + path->io_err_pathfail_starttime = curr_time.tv_sec; + io_err_stat_log(5, "%s: restart path flakiness pre-checking", + path->dev); + } + path->io_err_pathfail_cnt++; + if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) { + res = enqueue_io_err_stat_by_path(path); + if (!res) + path->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING; + else + path->io_err_pathfail_cnt = 0; + } + + return 0; +} + +int hit_io_err_recheck_time(struct path *pp) +{ + struct timespec curr_time; + int r; + + if (pp->io_err_disable_reinstate == 0) + return 1; + if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0) + return 1; + if (pp->io_err_pathfail_cnt != PATH_IO_ERR_IN_POLLING_RECHECK) + return 1; + if (pp->mpp->nr_active <= 0) { + io_err_stat_log(2, "%s: recover path early", pp->dev); + goto recover; + } + if ((curr_time.tv_sec - pp->io_err_dis_reinstate_time) > + pp->mpp->marginal_path_err_recheck_gap_time) { + io_err_stat_log(4, "%s: reschedule checking after %d seconds", + pp->dev, + pp->mpp->marginal_path_err_recheck_gap_time); + /* + * to reschedule io error checking again + * if the path is good enough, we claim it is good + * and can be reinsated as soon as possible in the + * check_path routine. + */ + pp->io_err_dis_reinstate_time = curr_time.tv_sec; + r = enqueue_io_err_stat_by_path(pp); + /* + * Enqueue fails because of internal error. 
+ * In this case , we recover this path + * Or else, return 1 to set path state to PATH_SHAKY + */ + if (r == 1) { + io_err_stat_log(3, "%s: enqueue fails, to recover", + pp->dev); + goto recover; + } else if (!r) { + pp->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING; + } + } + + return 1; + +recover: + pp->io_err_pathfail_cnt = 0; + pp->io_err_disable_reinstate = 0; + pp->tick = 1; + return 0; +} + +static int delete_io_err_stat_by_addr(struct io_err_stat_path *p) +{ + int i; + + i = find_slot(paths->pathvec, p); + if (i != -1) + vector_del_slot(paths->pathvec, i); + + destroy_directio_ctx(p); + free_io_err_stat_path(p); + + return 0; +} + +static void account_async_io_state(struct io_err_stat_path *pp, int rc) +{ + switch (rc) { + case PATH_DOWN: + case PATH_TIMEOUT: + pp->io_err_nr++; + break; + case PATH_UNCHECKED: + case PATH_UP: + case PATH_PENDING: + break; + default: + break; + } +} + +static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp) +{ + struct timespec currtime, difftime; + struct path *path; + double err_rate; + + if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0) + return 1; + timespecsub(&currtime, &pp->start_time, &difftime); + if (difftime.tv_sec < pp->total_time) + return 0; + + io_err_stat_log(4, "%s: check end", pp->devname); + + err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) / pp->io_nr; + io_err_stat_log(3, "%s: IO error rate (%.1f/1000)", + pp->devname, err_rate); + pthread_cleanup_push(cleanup_lock, &vecs->lock); + lock(&vecs->lock); + pthread_testcancel(); + path = find_path_by_dev(vecs->pathvec, pp->devname); + if (!path) { + io_err_stat_log(4, "path %s not found'", pp->devname); + } else if (err_rate <= pp->err_rate_threshold) { + path->io_err_pathfail_cnt = 0; + path->io_err_disable_reinstate = 0; + io_err_stat_log(3, "%s: (%d/%d) good to enable reinstating", + pp->devname, pp->io_err_nr, pp->io_nr); + /* + * schedule path check as soon as possible to + * update path state. 
Do NOT reinstate dm path here + */ + path->tick = 1; + + } else if (path->mpp && path->mpp->nr_active > 1) { + io_err_stat_log(3, "%s: keep failing the dm path %s", + path->mpp->alias, path->dev); + path->io_err_pathfail_cnt = PATH_IO_ERR_IN_POLLING_RECHECK; + path->io_err_disable_reinstate = 1; + path->io_err_dis_reinstate_time = currtime.tv_sec; + io_err_stat_log(3, "%s: disable reinstating of %s", + path->mpp->alias, path->dev); + } else { + path->io_err_pathfail_cnt = 0; + path->io_err_disable_reinstate = 0; + io_err_stat_log(3, "%s: there is orphan path, enable reinstating", + pp->devname); + } + lock_cleanup_pop(vecs->lock); + + delete_io_err_stat_by_addr(pp); + + return 0; +} + +static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev) +{ + int rc = -1; + + if (ct->io_starttime.tv_nsec == 0 && + ct->io_starttime.tv_sec == 0) { + struct iocb *ios[1] = { &ct->io }; + + if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) { + ct->io_starttime.tv_sec = 0; + ct->io_starttime.tv_nsec = 0; + return rc; + } + io_prep_pread(&ct->io, fd, ct->buf, ct->blksize, 0); + if (io_submit(ioctx, 1, ios) != 1) { + io_err_stat_log(5, "%s: io_submit error %i", + dev, errno); + return rc; + } + rc = 0; + } + + return rc; +} + +static void send_batch_async_ios(struct io_err_stat_path *pp) +{ + int i; + struct dio_ctx *ct; + struct timespec currtime, difftime; + + if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0) + return; + /* + * Give a free time for all IO to complete or timeout + */ + if (pp->start_time.tv_sec != 0) { + timespecsub(&currtime, &pp->start_time, &difftime); + if (difftime.tv_sec + IOTIMEOUT_SEC >= pp->total_time) + return; + } + + for (i = 0; i < CONCUR_NR_EVENT; i++) { + ct = pp->dio_ctx_array + i; + if (!send_each_async_io(ct, pp->fd, pp->devname)) + pp->io_nr++; + } + if (pp->start_time.tv_sec == 0 && pp->start_time.tv_nsec == 0 && + clock_gettime(CLOCK_MONOTONIC, &pp->start_time)) { + pp->start_time.tv_sec = 0; + pp->start_time.tv_nsec = 0; + 
} +} + +static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t, + char *dev) +{ + struct timespec difftime; + struct io_event event; + int rc = PATH_UNCHECKED; + int r; + + if (ct->io_starttime.tv_sec == 0) + return rc; + timespecsub(t, &ct->io_starttime, &difftime); + if (difftime.tv_sec > IOTIMEOUT_SEC) { + struct iocb *ios[1] = { &ct->io }; + + io_err_stat_log(5, "%s: abort check on timeout", dev); + r = io_cancel(ioctx, ios[0], &event); + if (r) + io_err_stat_log(5, "%s: io_cancel error %i", + dev, errno); + ct->io_starttime.tv_sec = 0; + ct->io_starttime.tv_nsec = 0; + rc = PATH_TIMEOUT; + } else { + rc = PATH_PENDING; + } + + return rc; +} + +static void poll_async_io_timeout(void) +{ + struct io_err_stat_path *pp; + struct timespec curr_time; + int rc = PATH_UNCHECKED; + int i, j; + + if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0) + return; + vector_foreach_slot(paths->pathvec, pp, i) { + for (j = 0; j < CONCUR_NR_EVENT; j++) { + rc = try_to_cancel_timeout_io(pp->dio_ctx_array + j, + &curr_time, pp->devname); + account_async_io_state(pp, rc); + } + } +} + +static void cancel_inflight_io(struct io_err_stat_path *pp) +{ + struct io_event event; + int i, r; + + for (i = 0; i < CONCUR_NR_EVENT; i++) { + struct dio_ctx *ct = pp->dio_ctx_array + i; + struct iocb *ios[1] = { &ct->io }; + + if (ct->io_starttime.tv_sec == 0 + && ct->io_starttime.tv_nsec == 0) + continue; + io_err_stat_log(5, "%s: abort infligh io", + pp->devname); + r = io_cancel(ioctx, ios[0], &event); + if (r) + io_err_stat_log(5, "%s: io_cancel error %d, %i", + pp->devname, r, errno); + ct->io_starttime.tv_sec = 0; + ct->io_starttime.tv_nsec = 0; + } +} + +static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct io_event *ev) +{ + ct->io_starttime.tv_sec = 0; + ct->io_starttime.tv_nsec = 0; + return (ev->res == ct->blksize) ? 
PATH_UP : PATH_DOWN; +} + +static void handle_async_io_done_event(struct io_event *io_evt) +{ + struct io_err_stat_path *pp; + struct dio_ctx *ct; + int rc = PATH_UNCHECKED; + int i, j; + + vector_foreach_slot(paths->pathvec, pp, i) { + for (j = 0; j < CONCUR_NR_EVENT; j++) { + ct = pp->dio_ctx_array + j; + if (&ct->io == io_evt->obj) { + rc = handle_done_dio_ctx(ct, io_evt); + account_async_io_state(pp, rc); + return; + } + } + } +} + +static void process_async_ios_event(int timeout_nsecs, char *dev) +{ + struct io_event events[CONCUR_NR_EVENT]; + int i, n; + struct timespec timeout = { .tv_nsec = timeout_nsecs }; + + errno = 0; + n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout); + if (n < 0) { + io_err_stat_log(3, "%s: async io events returned %d (errno=%s)", + dev, n, strerror(errno)); + } else { + for (i = 0; i < n; i++) + handle_async_io_done_event(&events[i]); + } +} + +static void service_paths(void) +{ + struct io_err_stat_path *pp; + int i; + + pthread_mutex_lock(&paths->mutex); + vector_foreach_slot(paths->pathvec, pp, i) { + send_batch_async_ios(pp); + process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname); + poll_async_io_timeout(); + poll_io_err_stat(vecs, pp); + } + pthread_mutex_unlock(&paths->mutex); +} + +static void *io_err_stat_loop(void *data) +{ + vecs = (struct vectors *)data; + pthread_cleanup_push(rcu_unregister, NULL); + rcu_register_thread(); + + mlockall(MCL_CURRENT | MCL_FUTURE); + while (1) { + service_paths(); + usleep(100000); + } + + pthread_cleanup_pop(1); + return NULL; +} + +int start_io_err_stat_thread(void *data) +{ + if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) { + io_err_stat_log(4, "io_setup failed"); + return 1; + } + paths = alloc_pathvec(); + if (!paths) + goto destroy_ctx; + + if (pthread_create(&io_err_stat_thr, &io_err_stat_attr, + io_err_stat_loop, data)) { + io_err_stat_log(0, "cannot create io_error statistic thread"); + goto out_free; + } + io_err_stat_log(3, "thread started"); + return 0; + 
+out_free: + free_io_err_pathvec(paths); +destroy_ctx: + io_destroy(ioctx); + io_err_stat_log(0, "failed to start io_error statistic thread"); + return 1; +} + +void stop_io_err_stat_thread(void) +{ + pthread_cancel(io_err_stat_thr); + pthread_kill(io_err_stat_thr, SIGUSR2); + free_io_err_pathvec(paths); + io_destroy(ioctx); +} diff --git a/libmultipath/io_err_stat.h b/libmultipath/io_err_stat.h new file mode 100644 index 0000000..bbf31b4 --- /dev/null +++ b/libmultipath/io_err_stat.h @@ -0,0 +1,15 @@ +#ifndef _IO_ERR_STAT_H +#define _IO_ERR_STAT_H + +#include "vector.h" +#include "lock.h" + + +extern pthread_attr_t io_err_stat_attr; + +int start_io_err_stat_thread(void *data); +void stop_io_err_stat_thread(void); +int io_err_stat_handle_pathfail(struct path *path); +int hit_io_err_recheck_time(struct path *pp); + +#endif /* _IO_ERR_STAT_H */ diff --git a/libmultipath/prioritizers/path_latency.c b/libmultipath/prioritizers/path_latency.c index 9fc2dfc..9d5397e 100644 --- a/libmultipath/prioritizers/path_latency.c +++ b/libmultipath/prioritizers/path_latency.c @@ -15,39 +15,47 @@ * scale, the priority "rc" of each path can be provided. * * Author(s): Yang Feng + * Revised: Guan Junxiong * * This file is released under the GPL version 2, or any later version. */ +#define _GNU_SOURCE #include #include #include #include +#include +#include +#include +#include #include "debug.h" #include "prio.h" #include "structs.h" -#include "../checkers/libsg.h" +#include "util.h" #define pp_pl_log(prio, fmt, args...) condlog(prio, "path_latency prio: " fmt, ##args) #define MAX_IO_NUM 200 -#define MIN_IO_NUM 2 +#define MIN_IO_NUM 20 +#define DEF_IO_NUM 100 #define MAX_BASE_NUM 10 -#define MIN_BASE_NUM 2 +#define MIN_BASE_NUM 1.01 +#define DEF_BASE_NUM 1.5 #define MAX_AVG_LATENCY 100000000. /* Unit: us */ #define MIN_AVG_LATENCY 1. 
/* Unit: us */ #define DEFAULT_PRIORITY 0 -#define MAX_CHAR_SIZE 30 - #define USEC_PER_SEC 1000000LL #define NSEC_PER_USEC 1000LL -static long long path_latency[MAX_IO_NUM]; +#define DEF_BLK_SIZE 4096 + +static double lg_path_latency[MAX_IO_NUM]; static inline long long timeval_to_us(const struct timespec *tv) { @@ -55,18 +63,75 @@ static inline long long timeval_to_us(const struct timespec *tv) (tv->tv_nsec / NSEC_PER_USEC); } -static int do_readsector0(int fd, unsigned int timeout) +static int prepare_directio_read(int fd, int *blksz, char **pbuf, + int *restore_flags) +{ + unsigned long pgsize = getpagesize(); + long flags; + + if (ioctl(fd, BLKBSZGET, blksz) < 0) { + pp_pl_log(3,"catnnot get blocksize, set default"); + *blksz = DEF_BLK_SIZE; + } + if (posix_memalign((void **)pbuf, pgsize, *blksz)) + return -1; + + flags = fcntl(fd, F_GETFL); + if (flags < 0) + goto free_out; + if (!(flags & O_DIRECT)) { + flags |= O_DIRECT; + if (fcntl(fd, F_SETFL, flags) < 0) + goto free_out; + *restore_flags = 1; + } + + return 0; + +free_out: + free(*pbuf); + + return -1; +} + +static void cleanup_directio_read(int fd, char *buf, int restore_flags) { - unsigned char buf[4096]; - unsigned char sbuf[SENSE_BUFF_LEN]; + long flags; + + free(buf); + + if (!restore_flags) + return; + if ((flags = fcntl(fd, F_GETFL)) >= 0) { + int ret __attribute__ ((unused)); + flags &= ~O_DIRECT; + /* No point in checking for errors */ + ret = fcntl(fd, F_SETFL, flags); + } +} + +static int do_directio_read(int fd, unsigned int timeout, char *buf, int sz) +{ + fd_set read_fds; + struct timeval tm = { .tv_sec = timeout }; int ret; + int num_read; - ret = sg_read(fd, &buf[0], 4096, &sbuf[0], SENSE_BUFF_LEN, timeout); + if (lseek(fd, 0, SEEK_SET) == -1) + return -1; + FD_ZERO(&read_fds); + FD_SET(fd, &read_fds); + ret = select(fd+1, &read_fds, NULL, NULL, &tm); + if (ret <= 0) + return -1; + num_read = read(fd, buf, sz); + if (num_read != sz) + return -1; - return ret; + return 0; } -int 
check_args_valid(int io_num, int base_num) +int check_args_valid(int io_num, double base_num) { if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM)) { pp_pl_log(0, "args io_num is outside the valid range"); @@ -82,176 +147,200 @@ int check_args_valid(int io_num, int base_num) } /* - * In multipath.conf, args form: io_num|base_num. For example, - * args is "20|10", this function can get io_num value 20, and + * In multipath.conf, args form: io_num=n base_num=m. For example, args are + * "io_num=20 base_num=10", this function can get io_num value 20 and * base_num value 10. */ -static int get_ionum_and_basenum(char *args, int *ionum, int *basenum) +static int get_ionum_and_basenum(char *args, int *ionum, double *basenum) { - char source[MAX_CHAR_SIZE]; - char vertica = '|'; - char *endstrbefore = NULL; - char *endstrafter = NULL; - unsigned int size = strlen(args); + char split_char[] = " \t"; + char *arg, *temp; + char *str, *str_inval; + int i; + int flag_io = 0, flag_base = 0; if ((args == NULL) || (ionum == NULL) || (basenum == NULL)) { pp_pl_log(0, "args string is NULL"); return 0; } - if ((size < 1) || (size > MAX_CHAR_SIZE - 1)) { - pp_pl_log(0, "args string's size is too long"); + arg = temp = STRDUP(args); + if (!arg) return 0; - } - memcpy(source, args, size + 1); - - if (!isdigit(source[0])) { - pp_pl_log(0, "invalid prio_args format: %s", source); - return 0; - } - - *ionum = (int)strtoul(source, &endstrbefore, 10); - if (endstrbefore[0] != vertica) { - pp_pl_log(0, "invalid prio_args format: %s", source); - return 0; + for (i = 0; i < 2; i++) { + str = get_next_string(&temp, split_char); + if (!str) + goto out; + if (!strncmp(str, "io_num=", 7) && strlen(str) > 7) { + *ionum = (int)strtoul(str + 7, &str_inval, 10); + if (str == str_inval) + goto out; + flag_io = 1; + } + else if (!strncmp(str, "base_num=", 9) && strlen(str) > 9) { + *basenum = strtod(str + 9, &str_inval); + if (str == str_inval) + goto out; + flag_base = 1; + } } - if 
(!isdigit(endstrbefore[1])) { - pp_pl_log(0, "invalid prio_args format: %s", source); - return 0; - } - - *basenum = (long long)strtol(&endstrbefore[1], &endstrafter, 10); - if (check_args_valid(*ionum, *basenum) == 0) { - return 0; - } + if (!flag_io || !flag_base) + goto out; + if (check_args_valid(*ionum, *basenum) == 0) + goto out; + FREE(arg); return 1; +out: + FREE(arg); + return 0; } -long long calc_standard_deviation(long long *path_latency, int size, - long long avglatency) +double calc_standard_deviation(double *lg_path_latency, int size, + double lg_avglatency) { int index; - long long total = 0; + double sum = 0; for (index = 0; index < size; index++) { - total += - (path_latency[index] - avglatency) * (path_latency[index] - - avglatency); + sum += (lg_path_latency[index] - lg_avglatency) * + (lg_path_latency[index] - lg_avglatency); } - total /= (size - 1); + sum /= (size - 1); - return (long long)sqrt((double)total); + return sqrt(sum); } -int calcPrio(double avglatency, double max_avglatency, double min_avglatency, - double base_num) +/* + * Do not scale the prioriy in a certain range such as [0, 1024] + * because scaling will eliminate the effect of base_num. 
+ */ +int calcPrio(double lg_avglatency, double lg_maxavglatency, + double lg_minavglatency) { - double lavglatency = log(avglatency) / log(base_num); - double lmax_avglatency = log(max_avglatency) / log(base_num); - double lmin_avglatency = log(min_avglatency) / log(base_num); - - if (lavglatency <= lmin_avglatency) - return (int)(lmax_avglatency + 1.); + if (lg_avglatency <= lg_minavglatency) + return lg_maxavglatency - lg_minavglatency; - if (lavglatency > lmax_avglatency) + if (lg_avglatency >= lg_maxavglatency) return 0; - return (int)(lmax_avglatency - lavglatency + 1.); -} - -/* Calc the latency interval corresponding to the average latency */ -long long calc_latency_interval(double avglatency, double max_avglatency, - double min_avglatency, double base_num) -{ - double lavglatency = log(avglatency) / log(base_num); - double lmax_avglatency = log(max_avglatency) / log(base_num); - double lmin_avglatency = log(min_avglatency) / log(base_num); - - if ((lavglatency <= lmin_avglatency) - || (lavglatency > lmax_avglatency)) - return 0; /* Invalid value */ - - if ((double)((int)lavglatency) == lavglatency) - return (long long)(avglatency - (avglatency / base_num)); - else - return (long long)(pow(base_num, (double)((int)lavglatency + 1)) - - pow(base_num, (double)((int)lavglatency))); + return lg_maxavglatency - lg_avglatency; } int getprio(struct path *pp, char *args, unsigned int timeout) { int rc, temp; int index = 0; - int io_num; - int base_num; - long long avglatency; - long long latency_interval; - long long standard_deviation; - long long toldelay = 0; + int io_num = 0; + double base_num = 0; + double lg_avglatency, lg_maxavglatency, lg_minavglatency; + double standard_deviation; + double lg_toldelay = 0; long long before, after; struct timespec tv; + int blksize; + char *buf; + int restore_flags = 0; + double lg_base; + long long sum_latency = 0; + long long arith_mean_lat; if (pp->fd < 0) return -1; if (get_ionum_and_basenum(args, &io_num, &base_num) == 
0) { - pp_pl_log(0, "%s: get path_latency args fail", pp->dev); - return DEFAULT_PRIORITY; + io_num = DEF_IO_NUM; + base_num = DEF_BASE_NUM; + pp_pl_log(0, "%s: fails to get path_latency args, set default:" + "io_num=%d base_num=%.3lf", + pp->dev, io_num, base_num); } - memset(path_latency, 0, sizeof(path_latency)); + memset(lg_path_latency, 0, sizeof(lg_path_latency)); + lg_base = log(base_num); + lg_maxavglatency = log(MAX_AVG_LATENCY) / lg_base; + lg_minavglatency = log(MIN_AVG_LATENCY) / lg_base; + + prepare_directio_read(pp->fd, &blksize, &buf, &restore_flags); temp = io_num; while (temp-- > 0) { (void)clock_gettime(CLOCK_MONOTONIC, &tv); before = timeval_to_us(&tv); - if (do_readsector0(pp->fd, timeout) == 2) { + if (do_directio_read(pp->fd, timeout, buf, blksize)) { pp_pl_log(0, "%s: path down", pp->dev); + cleanup_directio_read(pp->fd, buf, restore_flags); return -1; } (void)clock_gettime(CLOCK_MONOTONIC, &tv); after = timeval_to_us(&tv); - - path_latency[index] = after - before; - toldelay += path_latency[index++]; + /* + * We assume that the latency complies with Log-normal + * distribution. The logarithm of latency is in normal + * distribution. 
+ */ + lg_path_latency[index] = log(after - before) / lg_base; + lg_toldelay += lg_path_latency[index++]; + sum_latency += after - before; } - avglatency = toldelay / (long long)io_num; - pp_pl_log(4, "%s: average latency is (%lld us)", pp->dev, avglatency); + cleanup_directio_read(pp->fd, buf, restore_flags); + + lg_avglatency = lg_toldelay / (long long)io_num; + arith_mean_lat = sum_latency / (long long)io_num; + pp_pl_log(4, "%s: arithmetic mean latency is (%lld us), geometric mean latency is (%lld us)", + pp->dev, arith_mean_lat, + (long long)pow(base_num, lg_avglatency)); - if (avglatency > MAX_AVG_LATENCY) { + if (lg_avglatency > lg_maxavglatency) { pp_pl_log(0, "%s: average latency (%lld us) is outside the thresold (%lld us)", - pp->dev, avglatency, (long long)MAX_AVG_LATENCY); + pp->dev, (long long)pow(base_num, lg_avglatency), + (long long)MAX_AVG_LATENCY); return DEFAULT_PRIORITY; } + standard_deviation = calc_standard_deviation(lg_path_latency, + index, lg_avglatency); /* - * Min average latency and max average latency are constant, the args - * base_num set can change latency_interval value corresponding to - * avglatency and is not constant. - * Warn the user if latency_interval is smaller than (2 * standard_deviation), - * or equal. + * In calPrio(), we let prio y = f(x) = log(max, base) - log (x, base); + * So if we want to let the priority of the latency outside 2 standard + * deviations can be distinguished from the latency inside 2 standard + * deviation, in others words at most 95% are the same and at least 5% + * are different according interval estimation of normal distribution, + * we should warn the user to set the base_num to be smaller if the + * log(x_threshold, base) is small than 2 standard deviation. 
+ * x_threshold is derived from: + * y + 1 = f(x) + 1 = f(x) + log(base, base), so x_threadshold = + * base_num; Note that we only can compare the logarithm of x_threshold + * with the standard deviation because the standard deviation is derived + * from logarithm of latency. + * + * therefore , we recommend the base_num to meet the condition : + * 1 <= 2 * standard_deviation */ - standard_deviation = - calc_standard_deviation(path_latency, index, avglatency); - latency_interval = - calc_latency_interval(avglatency, MAX_AVG_LATENCY, MIN_AVG_LATENCY, - base_num); - if ((latency_interval != 0) - && (latency_interval <= (2 * standard_deviation))) - pp_pl_log(3, - "%s: latency interval (%lld) according to average latency (%lld us) is smaller than " - "2 * standard deviation (%lld us), or equal, args base_num (%d) needs to be set bigger value", - pp->dev, latency_interval, avglatency, - standard_deviation, base_num); - - rc = calcPrio(avglatency, MAX_AVG_LATENCY, MIN_AVG_LATENCY, base_num); + pp_pl_log(5, "%s: standard deviation for logarithm of latency = %.6f", + pp->dev, standard_deviation); + if (standard_deviation <= 0.5) + pp_pl_log(3, "%s: the base_num(%.3lf) is too big to distinguish different priority " + "of two far-away latency. 
It is recommend to be set smaller", + pp->dev, base_num); + /* + * If the standard deviation is too large , we should also warn the user + */ + + if (standard_deviation > 4) + pp_pl_log(3, "%s: the base_num(%.3lf) is too small to avoid noise disturbance " + ".It is recommend to be set larger", + pp->dev, base_num); + + + rc = calcPrio(lg_avglatency, lg_maxavglatency, lg_minavglatency); + return rc; } diff --git a/libmultipath/prioritizers/weightedpath.c b/libmultipath/prioritizers/weightedpath.c index 34a43a8..e0f3efb 100644 --- a/libmultipath/prioritizers/weightedpath.c +++ b/libmultipath/prioritizers/weightedpath.c @@ -34,15 +34,7 @@ #include #include "structs_vec.h" #include "print.h" - -char *get_next_string(char **temp, char *split_char) -{ - char *token = NULL; - token = strsep(temp, split_char); - while (token != NULL && !strcmp(token, "")) - token = strsep(temp, split_char); - return token; -} +#include "util.h" #define CHECK_LEN \ do { \ diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c index 00adc0d..0d29ed2 100644 --- a/libmultipath/propsel.c +++ b/libmultipath/propsel.c @@ -367,16 +367,42 @@ out: return 0; } +/* + * Current RDAC (NetApp E-Series) firmware relies + * on periodic REPORT TARGET PORT GROUPS for + * internal load balancing. + * Using the sysfs priority checker defeats this purpose. + * + * Moreover, NetApp would also prefer the RDAC checker over ALUA. 
+ * (https://www.redhat.com/archives/dm-devel/2017-September/msg00326.html) + */ +static int +check_rdac(struct path * pp) +{ + int len; + char buff[44]; + + len = get_vpd_sgio(pp->fd, 0xC9, buff, 44); + if (len <= 0) + return 0; + return !(memcmp(buff + 4, "vac1", 4)); +} + int select_checker(struct config *conf, struct path *pp) { char *origin, *checker_name; struct checker * c = &pp->checker; - if (pp->detect_checker == DETECT_CHECKER_ON && pp->tpgs > 0) { - checker_name = TUR; + if (pp->detect_checker == DETECT_CHECKER_ON) { origin = "(setting: storage device autodetected)"; - goto out; - } + if (check_rdac(pp)) { + checker_name = RDAC; + goto out; + } else if (pp->tpgs > 0) { + checker_name = TUR; + goto out; + } + } do_set(checker_name, conf->overrides, checker_name, "(setting: multipath.conf overrides section)"); do_set(checker_name, pp->hwe, checker_name, "(setting: storage device configuration)"); do_set(checker_name, conf, checker_name, "(setting: multipath.conf defaults/devices section)"); @@ -427,24 +453,6 @@ out: return 0; } -/* - * Current RDAC (NetApp E-Series) firmware relies - * on periodic REPORT TARGET PORT GROUPS for - * internal load balancing. - * Using the sysfs priority checker defeats this purpose. 
- */ -static int -check_rdac(struct path * pp) -{ - int len; - char buff[44]; - - len = get_vpd_sgio(pp->fd, 0xC9, buff, 44); - if (len <= 0) - return 0; - return !(memcmp(buff + 4, "vac1", 4)); -} - void detect_prio(struct config *conf, struct path * pp) { @@ -754,51 +762,71 @@ out: return 0; } -int select_san_path_err_threshold(struct config *conf, struct multipath *mp) + +int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp) { char *origin, buff[12]; - mp_set_mpe(san_path_err_threshold); - mp_set_ovr(san_path_err_threshold); - mp_set_hwe(san_path_err_threshold); - mp_set_conf(san_path_err_threshold); - mp_set_default(san_path_err_threshold, DEFAULT_ERR_CHECKS); + mp_set_mpe(marginal_path_err_sample_time); + mp_set_ovr(marginal_path_err_sample_time); + mp_set_hwe(marginal_path_err_sample_time); + mp_set_conf(marginal_path_err_sample_time); + mp_set_default(marginal_path_err_sample_time, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, &mp->san_path_err_threshold); - condlog(3, "%s: san_path_err_threshold = %s %s", mp->alias, buff, origin); + print_off_int_undef(buff, 12, &mp->marginal_path_err_sample_time); + condlog(3, "%s: marginal_path_err_sample_time = %s %s", mp->alias, buff, + origin); return 0; } -int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp) +int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp) { char *origin, buff[12]; - mp_set_mpe(san_path_err_forget_rate); - mp_set_ovr(san_path_err_forget_rate); - mp_set_hwe(san_path_err_forget_rate); - mp_set_conf(san_path_err_forget_rate); - mp_set_default(san_path_err_forget_rate, DEFAULT_ERR_CHECKS); + mp_set_mpe(marginal_path_err_rate_threshold); + mp_set_ovr(marginal_path_err_rate_threshold); + mp_set_hwe(marginal_path_err_rate_threshold); + mp_set_conf(marginal_path_err_rate_threshold); + mp_set_default(marginal_path_err_rate_threshold, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, 
&mp->san_path_err_forget_rate); - condlog(3, "%s: san_path_err_forget_rate = %s %s", mp->alias, buff, origin); + print_off_int_undef(buff, 12, &mp->marginal_path_err_rate_threshold); + condlog(3, "%s: marginal_path_err_rate_threshold = %s %s", mp->alias, buff, + origin); return 0; - } -int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp) + +int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp) { char *origin, buff[12]; - mp_set_mpe(san_path_err_recovery_time); - mp_set_ovr(san_path_err_recovery_time); - mp_set_hwe(san_path_err_recovery_time); - mp_set_conf(san_path_err_recovery_time); - mp_set_default(san_path_err_recovery_time, DEFAULT_ERR_CHECKS); + mp_set_mpe(marginal_path_err_recheck_gap_time); + mp_set_ovr(marginal_path_err_recheck_gap_time); + mp_set_hwe(marginal_path_err_recheck_gap_time); + mp_set_conf(marginal_path_err_recheck_gap_time); + mp_set_default(marginal_path_err_recheck_gap_time, DEFAULT_ERR_CHECKS); out: - print_off_int_undef(buff, 12, &mp->san_path_err_recovery_time); - condlog(3, "%s: san_path_err_recovery_time = %s %s", mp->alias, buff, origin); + print_off_int_undef(buff, 12, &mp->marginal_path_err_recheck_gap_time); + condlog(3, "%s: marginal_path_err_recheck_gap_time = %s %s", mp->alias, buff, + origin); return 0; +} +int select_marginal_path_double_failed_time(struct config *conf, struct multipath *mp) +{ + char *origin, buff[12]; + + mp_set_mpe(marginal_path_double_failed_time); + mp_set_ovr(marginal_path_double_failed_time); + mp_set_hwe(marginal_path_double_failed_time); + mp_set_conf(marginal_path_double_failed_time); + mp_set_default(marginal_path_double_failed_time, DEFAULT_ERR_CHECKS); +out: + print_off_int_undef(buff, 12, &mp->marginal_path_double_failed_time); + condlog(3, "%s: marginal_path_double_failed_time = %s %s", mp->alias, buff, + origin); + return 0; } + int select_skip_kpartx (struct config *conf, struct multipath * mp) { char *origin; diff --git 
a/libmultipath/propsel.h b/libmultipath/propsel.h index f8e96d8..347cb32 100644 --- a/libmultipath/propsel.h +++ b/libmultipath/propsel.h @@ -25,9 +25,10 @@ int select_delay_watch_checks (struct config *conf, struct multipath * mp); int select_delay_wait_checks (struct config *conf, struct multipath * mp); int select_skip_kpartx (struct config *conf, struct multipath * mp); int select_max_sectors_kb (struct config *conf, struct multipath * mp); -int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp); -int select_san_path_err_threshold(struct config *conf, struct multipath *mp); -int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp); +int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp); +int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp); +int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp); +int select_marginal_path_double_failed_time(struct config *conf, struct multipath *mp); void reconcile_features_with_options(const char *id, char **features, int* no_path_retry, int *retain_hwhandler); diff --git a/libmultipath/structs.c b/libmultipath/structs.c index 828e790..3e057f5 100644 --- a/libmultipath/structs.c +++ b/libmultipath/structs.c @@ -99,6 +99,7 @@ alloc_path (void) pp->fd = -1; pp->tpgs = TPGS_UNDEF; pp->priority = PRIO_UNDEF; + checker_clear(&pp->checker); } return pp; } diff --git a/libmultipath/structs.h b/libmultipath/structs.h index f06824a..c2cf3fb 100644 --- a/libmultipath/structs.h +++ b/libmultipath/structs.h @@ -240,10 +240,10 @@ struct path { int initialized; int retriggers; int wwid_changed; - unsigned int path_failures; - time_t dis_reinstate_time; - int disable_reinstate; - int san_path_err_forget_rate; + time_t io_err_dis_reinstate_time; + int io_err_disable_reinstate; + int io_err_pathfail_cnt; + int io_err_pathfail_starttime; /* configlet pointers */ struct hwentry * hwe; }; @@ -275,9 
+275,10 @@ struct multipath { int deferred_remove; int delay_watch_checks; int delay_wait_checks; - int san_path_err_threshold; - int san_path_err_forget_rate; - int san_path_err_recovery_time; + int marginal_path_err_sample_time; + int marginal_path_err_rate_threshold; + int marginal_path_err_recheck_gap_time; + int marginal_path_double_failed_time; int skip_kpartx; int max_sectors_kb; int force_readonly; diff --git a/libmultipath/uevent.c b/libmultipath/uevent.c index 0cbcc59..80bf1dd 100644 --- a/libmultipath/uevent.c +++ b/libmultipath/uevent.c @@ -922,3 +922,35 @@ char *uevent_get_dm_name(struct uevent *uev) } return p; } + +char *uevent_get_dm_path(struct uevent *uev) +{ + char *p = NULL; + int i; + + for (i = 0; uev->envp[i] != NULL; i++) { + if (!strncmp(uev->envp[i], "DM_PATH", 7) && + strlen(uev->envp[i]) > 8) { + p = MALLOC(strlen(uev->envp[i] + 8) + 1); + strcpy(p, uev->envp[i] + 8); + break; + } + } + return p; +} + +char *uevent_get_dm_action(struct uevent *uev) +{ + char *p = NULL; + int i; + + for (i = 0; uev->envp[i] != NULL; i++) { + if (!strncmp(uev->envp[i], "DM_ACTION", 9) && + strlen(uev->envp[i]) > 10) { + p = MALLOC(strlen(uev->envp[i] + 10) + 1); + strcpy(p, uev->envp[i] + 10); + break; + } + } + return p; +} diff --git a/libmultipath/uevent.h b/libmultipath/uevent.h index 61a4207..6f5af0a 100644 --- a/libmultipath/uevent.h +++ b/libmultipath/uevent.h @@ -37,5 +37,7 @@ int uevent_get_major(struct uevent *uev); int uevent_get_minor(struct uevent *uev); int uevent_get_disk_ro(struct uevent *uev); char *uevent_get_dm_name(struct uevent *uev); +char *uevent_get_dm_path(struct uevent *uev); +char *uevent_get_dm_action(struct uevent *uev); #endif /* _UEVENT_H */ diff --git a/libmultipath/util.c b/libmultipath/util.c index 0800da5..0b43d29 100644 --- a/libmultipath/util.c +++ b/libmultipath/util.c @@ -65,6 +65,15 @@ filepresent (char * run) { return 0; } +char *get_next_string(char **temp, char *split_char) +{ + char *token = NULL; + token = 
strsep(temp, split_char); + while (token != NULL && !strcmp(token, "")) + token = strsep(temp, split_char); + return token; +} + int get_word (char * sentence, char ** word) { diff --git a/libmultipath/util.h b/libmultipath/util.h index 3dc048e..51a6d54 100644 --- a/libmultipath/util.h +++ b/libmultipath/util.h @@ -7,6 +7,7 @@ size_t strchop(char *); int basenamecpy (const char * src, char * dst, int); int filepresent (char * run); +char *get_next_string(char **temp, char *split_char); int get_word (char * sentence, char ** word); size_t strlcpy(char *dst, const char *src, size_t size); size_t strlcat(char *dst, const char *src, size_t size); diff --git a/libmultipath/version.h b/libmultipath/version.h index 0a0da9e..ca628b2 100644 --- a/libmultipath/version.h +++ b/libmultipath/version.h @@ -20,8 +20,8 @@ #ifndef _VERSION_H #define _VERSION_H -#define VERSION_CODE 0x000703 -#define DATE_CODE 0x090514 +#define VERSION_CODE 0x000704 +#define DATE_CODE 0x0b0f11 #define PROG "multipath-tools" diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5 index 5b6dde7..36551b4 100644 --- a/multipath/multipath.conf.5 +++ b/multipath/multipath.conf.5 @@ -351,7 +351,7 @@ these values can be looked up through sysfs or by running \fImultipathd show pat .RE .TP 12 .I path_latency -Needs a value of the form \fI"|"\fR +Needs a value of the form "io_num=\fI<20>\fR base_num=\fI<10>\fR" .RS .TP 8 .I io_num @@ -653,7 +653,7 @@ seconds, or 68 years. It will be automatically adjusted to the overall retry interval \fIno_path_retry\fR * \fIpolling_interval\fR if a number of retries is given with \fIno_path_retry\fR and the overall retry interval is longer than the specified \fIdev_loss_tmo\fR value. -The Linux kernel will cap this value to \fI300\fR if \fIfast_io_fail_tmo\fR +The Linux kernel will cap this value to \fI600\fR if \fIfast_io_fail_tmo\fR is not set. See KNOWN ISSUES. .RS .TP @@ -682,6 +682,17 @@ The default is: \fB/etc/multipath/wwids\fR . . 
.TP +.B prkeys_file +The full pathname of the prkeys file, which is used by multipathd to keep +track of the persistent reservation key used for a specific WWID, when +\fIreservation_key\fR is set to \fBfile\fR. +.RS +.TP +The default is: \fB/etc/multipath/prkeys\fR +.RE +. +. +.TP .B log_checker_err If set to .I once @@ -703,6 +714,12 @@ the same as the RESERVATION KEY field of the PERSISTENT RESERVE OUT parameter list which contains an 8-byte value provided by the application client to the device server to identify the I_T nexus. .RS +.PP +Alternatively, this can be set to \fBfile\fR, which will store the RESERVATION +KEY registered by mpathpersist in the \fIprkeys_file\fR. multipathd will then +use this key to register additional paths as they appear. When the +registration is removed, the RESERVATION KEY is removed from the +\fIprkeys_file\fR. .TP The default is: \fB\fR .RE @@ -824,13 +841,14 @@ The default is: \fB/etc/multipath/conf.d/\fR . . .TP -.B san_path_err_threshold -If set to a value greater than 0, multipathd will watch paths and check how many -times a path has been failed due to errors.If the number of failures on a particular -path is greater then the san_path_err_threshold then the path will not reinstante -till san_path_err_recovery_time.These path failures should occur within a -san_path_err_forget_rate checks, if not we will consider the path is good enough -to reinstantate. +.B marginal_path_double_failed_time +One of the four parameters of supporting path check based on accounting IO +error such as intermittent error. When a path fail event occurs twice in +\fImarginal_path_double_failed_time\fR seconds due to an IO error and all the +other three parameters are set, multipathd will fail the path and enqueue +this path into a queue of which members are sent a couple of continuous +direct reading asynchronous IOs at a fixed sample rate of 10HZ to start the IO +error accounting process.
.RS .TP The default is: \fBno\fR @@ -838,11 +856,21 @@ The default is: \fBno\fR . . .TP -.B san_path_err_forget_rate -If set to a value greater than 0, multipathd will check whether the path failures -has exceeded the san_path_err_threshold within this many checks i.e -san_path_err_forget_rate . If so we will not reinstante the path till -san_path_err_recovery_time. +.B marginal_path_err_sample_time +One of the four parameters of supporting path check based on accounting IO +error such as intermittent error. If it is set to a value no less than 120, +when a path fail event occurs twice in \fImarginal_path_double_failed_time\fR +seconds due to an IO error, multipathd will fail the path and enqueue this +path into a queue of which members are sent a couple of continuous direct +reading asynchronous IOs at a fixed sample rate of 10HZ to start the IO +accounting process for the path, which will last for +\fImarginal_path_err_sample_time\fR. +If the rate of IO error on a particular path is greater than the +\fImarginal_path_err_rate_threshold\fR, then the path will not reinstate for +\fImarginal_path_err_recheck_gap_time\fR seconds unless there is only one +active path. After \fImarginal_path_err_recheck_gap_time\fR expires, the path +will be requeued for rechecking. If checking result is good enough, the +path will be reinstated. .RS .TP The default is: \fBno\fR @@ -850,12 +878,30 @@ The default is: \fBno\fR . . .TP -.B san_path_err_recovery_time -If set to a value greater than 0, multipathd will make sure that when path failures -has exceeded the san_path_err_threshold within san_path_err_forget_rate then the path -will be placed in failed state for san_path_err_recovery_time duration.Once san_path_err_recovery_time -has timeout we will reinstante the failed path . -san_path_err_recovery_time value should be in secs. +.B marginal_path_err_rate_threshold +The error rate threshold as a permillage (1/1000). 
One of the four parameters +of supporting path check based on accounting IO error such as intermittent +error. Refer to \fImarginal_path_err_sample_time\fR. If the rate of IO errors +on a particular path is greater than this parameter, then the path will not +reinstate for \fImarginal_path_err_recheck_gap_time\fR seconds unless there is +only one active path. +.RS +.TP +The default is: \fBno\fR +.RE +. +. +.TP +.B marginal_path_err_recheck_gap_time +One of the four parameters of supporting path check based on accounting IO +error such as intermittent error. Refer to +\fImarginal_path_err_sample_time\fR. If this parameter is set to a positive +value, the failed path of which the IO error rate is larger than +\fImarginal_path_err_rate_threshold\fR will be kept in failed state for +\fImarginal_path_err_recheck_gap_time\fR seconds. When +\fImarginal_path_err_recheck_gap_time\fR seconds expire, the path will be +requeued for checking. If checking result is good enough, the path will be +reinstated, otherwise it will remain failed. 
.RS .TP The default is: \fBno\fR @@ -1127,11 +1173,13 @@ are taken from the \fIdefaults\fR or \fIdevices\fR section: .TP .B deferred_remove .TP -.B san_path_err_threshold +.B marginal_path_err_sample_time +.TP +.B marginal_path_err_rate_threshold .TP -.B san_path_err_forget_rate +.B marginal_path_err_recheck_gap_time .TP -.B san_path_err_recovery_time +.B marginal_path_double_failed_time .TP .B delay_watch_checks .TP @@ -1254,11 +1302,13 @@ section: .TP .B deferred_remove .TP -.B san_path_err_threshold +.B marginal_path_err_sample_time .TP -.B san_path_err_forget_rate +.B marginal_path_err_rate_threshold .TP -.B san_path_err_recovery_time +.B marginal_path_err_recheck_gap_time +.TP +.B marginal_path_double_failed_time .TP .B delay_watch_checks .TP @@ -1326,11 +1376,13 @@ the values are taken from the \fIdevices\fR or \fIdefaults\fR sections: .TP .B deferred_remove .TP -.B san_path_err_threshold +.B marginal_path_err_sample_time +.TP +.B marginal_path_err_rate_threshold .TP -.B san_path_err_forget_rate +.B marginal_path_err_recheck_gap_time .TP -.B san_path_err_recovery_time +.B marginal_path_double_failed_time .TP .B delay_watch_checks .TP diff --git a/multipathd/main.c b/multipathd/main.c index 8049da2..31ce923 100644 --- a/multipathd/main.c +++ b/multipathd/main.c @@ -84,6 +84,7 @@ int uxsock_timeout; #include "cli_handlers.h" #include "lock.h" #include "waiter.h" +#include "io_err_stat.h" #include "wwids.h" #include "../third-party/valgrind/drd.h" @@ -169,6 +170,7 @@ sd_notify_status(void) return NULL; } +#ifdef USE_SYSTEMD static void do_sd_notify(enum daemon_status old_state) { /* @@ -181,6 +183,7 @@ static void do_sd_notify(enum daemon_status old_state) return; sd_notify(0, sd_notify_status()); } +#endif static void config_cleanup(void *arg) { @@ -1066,6 +1069,42 @@ out: } static int +uev_pathfail_check(struct uevent *uev, struct vectors *vecs) +{ + char *action = NULL, *devt = NULL; + struct path *pp; + int r; + + action = uevent_get_dm_action(uev); + if 
(!action) + return 1; + if (strncmp(action, "PATH_FAILED", 11)) + goto out; + devt = uevent_get_dm_path(uev); + if (!devt) { + condlog(3, "%s: No DM_PATH in uevent", uev->kernel); + goto out; + } + + pthread_cleanup_push(cleanup_lock, &vecs->lock); + lock(&vecs->lock); + pthread_testcancel(); + pp = find_path_by_devt(vecs->pathvec, devt); + r = io_err_stat_handle_pathfail(pp); + lock_cleanup_pop(vecs->lock); + + if (r) + condlog(3, "io_err_stat: %s: cannot handle pathfail uevent", + pp->dev); + FREE(devt); + FREE(action); + return 0; +out: + FREE(action); + return 1; +} + +static int map_discovery (struct vectors * vecs) { struct multipath * mpp; @@ -1150,6 +1189,14 @@ uev_trigger (struct uevent * uev, void * trigger_data) if (!strncmp(uev->kernel, "dm-", 3)) { if (!strncmp(uev->action, "change", 6)) { r = uev_add_map(uev, vecs); + + /* + * the kernel-side dm-mpath issues a PATH_FAILED event + * when it encounters a path IO error. It is reason- + * able be the entry of path IO error accounting pro- + * cess. + */ + uev_pathfail_check(uev, vecs); goto out; } if (!strncmp(uev->action, "remove", 6)) { @@ -1507,83 +1554,6 @@ void repair_path(struct path * pp) LOG_MSG(1, checker_message(&pp->checker)); } -static int check_path_reinstate_state(struct path * pp) { - struct timespec curr_time; - if (!((pp->mpp->san_path_err_threshold > 0) && - (pp->mpp->san_path_err_forget_rate > 0) && - (pp->mpp->san_path_err_recovery_time >0))) { - return 0; - } - - if (pp->disable_reinstate) { - /* If we don't know how much time has passed, automatically - * reinstate the path, just to be safe. 
Also, if there are - * no other usable paths, reinstate the path - */ - if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 || - pp->mpp->nr_active == 0) { - condlog(2, "%s : reinstating path early", pp->dev); - goto reinstate_path; - } - if ((curr_time.tv_sec - pp->dis_reinstate_time ) > pp->mpp->san_path_err_recovery_time) { - condlog(2,"%s : reinstate the path after err recovery time", pp->dev); - goto reinstate_path; - } - return 1; - } - /* forget errors on a working path */ - if ((pp->state == PATH_UP || pp->state == PATH_GHOST) && - pp->path_failures > 0) { - if (pp->san_path_err_forget_rate > 0){ - pp->san_path_err_forget_rate--; - } else { - /* for every san_path_err_forget_rate number of - * successful path checks decrement path_failures by 1 - */ - pp->path_failures--; - pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate; - } - return 0; - } - - /* If the path isn't recovering from a failed state, do nothing */ - if (pp->state != PATH_DOWN && pp->state != PATH_SHAKY && - pp->state != PATH_TIMEOUT) - return 0; - - if (pp->path_failures == 0) - pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate; - - pp->path_failures++; - - /* if we don't know the currently time, we don't know how long to - * delay the path, so there's no point in checking if we should - */ - - if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0) - return 0; - /* when path failures has exceeded the san_path_err_threshold - * place the path in delayed state till san_path_err_recovery_time - * so that the cutomer can rectify the issue within this time. After - * the completion of san_path_err_recovery_time it should - * automatically reinstate the path - */ - if (pp->path_failures > pp->mpp->san_path_err_threshold) { - condlog(2, "%s : hit error threshold. 
Delaying path reinstatement", pp->dev); - pp->dis_reinstate_time = curr_time.tv_sec; - pp->disable_reinstate = 1; - return 1; - } else { - return 0; - } - -reinstate_path: - pp->path_failures = 0; - pp->disable_reinstate = 0; - pp->san_path_err_forget_rate = 0; - return 0; -} - /* * Returns '1' if the path has been checked, '-1' if it was blacklisted * and '0' otherwise @@ -1697,9 +1667,13 @@ check_path (struct vectors * vecs, struct path * pp, int ticks) if (!pp->mpp) return 0; - if ((newstate == PATH_UP || newstate == PATH_GHOST) && - check_path_reinstate_state(pp)) { - pp->state = PATH_DELAYED; + if (pp->io_err_disable_reinstate && hit_io_err_recheck_time(pp)) { + pp->state = PATH_SHAKY; + /* + * to reschedule as soon as possible,so that this path can + * be recoverd in time + */ + pp->tick = 1; return 1; } @@ -2396,6 +2370,7 @@ child (void * param) setup_thread_attr(&misc_attr, 64 * 1024, 0); setup_thread_attr(&uevent_attr, DEFAULT_UEVENT_STACKSIZE * 1024, 0); setup_thread_attr(&waiter_attr, 32 * 1024, 1); + setup_thread_attr(&io_err_stat_attr, 32 * 1024, 1); if (logsink == 1) { setup_thread_attr(&log_attr, 64 * 1024, 0); @@ -2518,6 +2493,10 @@ child (void * param) /* * start threads */ + rc = start_io_err_stat_thread(vecs); + if (rc) + goto failed; + if ((rc = pthread_create(&check_thr, &misc_attr, checkerloop, vecs))) { condlog(0,"failed to create checker loop thread: %d", rc); goto failed; @@ -2567,6 +2546,8 @@ child (void * param) remove_maps_and_stop_waiters(vecs); unlock(&vecs->lock); + stop_io_err_stat_thread(); + pthread_cancel(check_thr); pthread_cancel(uevent_thr); pthread_cancel(uxlsnr_thr); @@ -2612,6 +2593,7 @@ child (void * param) udev_unref(udev); udev = NULL; pthread_attr_destroy(&waiter_attr); + pthread_attr_destroy(&io_err_stat_attr); #ifdef _DEBUG_ dbg_free_final(NULL); #endif diff --git a/multipathd/multipathd.8 b/multipathd/multipathd.8 index 2615728..5c96680 100644 --- a/multipathd/multipathd.8 +++ b/multipathd/multipathd.8 @@ -247,6 
+247,22 @@ Disable persistent reservation management on $map. Get the current persistent reservation management status of $map. . .TP +.B map|multipath $map getprkey +Get the current persistent reservation key associated with $map. +. +.TP +.B map|multipath $map setprkey key $key +Set the persistent reservation key associated with $map to $key in the +\fIprkeys_file\fR. This key will only be used by multipathd if +\fIreservation_key\fR is set to \fBfile\fR in \fI/etc/multipath.conf\fR. +. +.TP +.B map|multipath $map unsetprkey +Remove the persistent reservation key associated with $map from the +\fIprkeys_file\fR. This will only unset the key used by multipathd if +\fIreservation_key\fR is set to \fBfile\fR in \fI/etc/multipath.conf\fR. +. +.TP .B quit|exit End interactive session. . -- 2.7.4