$(RM) $$file; \
done
$(RM) $(DESTDIR)$(man3dir)/libdmmp.h*
+ $(RM) $(DESTDIR)$(pkgconfdir)/$(PKGFILE)
clean:
$(RM) core *.a *.o *.gz *.so *.so.*
$(INSTALL_PROGRAM) -v -m 644 -D docs/$$file docs/man/$$file; \
done
cat $(HEADERS) | \
- perl docs/doc-preclean.pl > $(TEMPFILE)
- perl docs/kernel-doc -man $(TEMPFILE) | \
+ perl docs/doc-preclean.pl > "$(TEMPFILE)"
+ perl docs/kernel-doc -man "$(TEMPFILE)" | \
perl docs/split-man.pl docs/man
- -rm -f $(TEMPFILE)
+ -rm -f "$(TEMPFILE)"
@for file in docs/man/*.3; do \
gzip -f $$file; \
done
memcpy(&prkey, paramp->sa_key, 8);
if (mpp->prkey_source == PRKEY_SOURCE_FILE && prkey &&
- ((!get_be64(mpp->reservation_key) && MPATH_PROUT_REG_SA) ||
- MPATH_PROUT_REG_IGN_SA)) {
+ ((!get_be64(mpp->reservation_key) &&
+ rq_servact == MPATH_PROUT_REG_SA) ||
+ rq_servact == MPATH_PROUT_REG_IGN_SA)) {
memcpy(&mpp->reservation_key, paramp->sa_key, 8);
if (update_prkey(alias, get_be64(mpp->reservation_key))) {
condlog(0, "%s: failed to set prkey for multipathd.",
CFLAGS += $(LIB_CFLAGS) -I$(mpathcmddir)
-LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu
+LIBDEPS += -lpthread -ldl -ldevmapper -ludev -L$(mpathcmddir) -lmpathcmd -lurcu -laio
ifdef SYSTEMD
CFLAGS += -DUSE_SYSTEMD=$(SYSTEMD)
pgpolicies.o debug.o defaults.o uevent.o time-util.o \
switchgroup.o uxsock.o print.o alias.o log_pthread.o \
log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
- lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o
+ lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o prkey.o \
+ io_err_stat.o
all: $(LIBS)
"timeout",
"removed",
"delayed",
- "none",
};
static LIST_HEAD(checkers);
if (c) {
INIT_LIST_HEAD(&c->node);
c->refcount = 1;
+ c->fd = -1;
}
return c;
}
return 0;
}
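+
+/*
+ * Reset a checker to its initial state. The fd is set to -1 rather
+ * than left at 0, so a cleared checker cannot be mistaken for one
+ * holding a valid file descriptor 0.
+ */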
+void checker_clear (struct checker *c)
+{
+ memset(c, 0x0, sizeof(struct checker));
+ c->fd = -1;
+}
+
void checker_put (struct checker * dst)
{
struct checker * src;
src = checker_lookup(dst->name);
if (dst->free)
dst->free(dst);
- memset(dst, 0x0, sizeof(struct checker));
+ checker_clear(dst);
free_checker(src);
}
*
* PATH_WILD:
* - Use: None of the checkers (returned if we don't have an fd)
- * - Description: Corner case where "fd <= 0" for path fd (see checker_check())
+ * - Description: Corner case where "fd < 0" for path fd (see checker_check())
*
* PATH_UNCHECKED:
* - Use: Only in directio checker
struct checker * add_checker (char *, char *);
struct checker * checker_lookup (char *);
int checker_init (struct checker *, void **);
+void checker_clear (struct checker *);
void checker_put (struct checker *);
void checker_reset (struct checker *);
void checker_set_sync (struct checker *);
LogvolInfo_struct lvi; // logical "volume" info
IOCTL_Command_struct cic; // cciss ioctl command
- if ((c->fd) <= 0) {
+ if ((c->fd) < 0) {
MSG(c,"no usable fd");
ret = -1;
goto out;
merge_num(delay_wait_checks);
merge_num(skip_kpartx);
merge_num(max_sectors_kb);
- merge_num(san_path_err_threshold);
- merge_num(san_path_err_forget_rate);
- merge_num(san_path_err_recovery_time);
snprintf(id, sizeof(id), "%s/%s", dst->vendor, dst->product);
reconcile_features_with_options(id, &dst->features,
int deferred_remove;
int delay_watch_checks;
int delay_wait_checks;
- int san_path_err_threshold;
- int san_path_err_forget_rate;
- int san_path_err_recovery_time;
+ int marginal_path_err_sample_time;
+ int marginal_path_err_rate_threshold;
+ int marginal_path_err_recheck_gap_time;
+ int marginal_path_double_failed_time;
int skip_kpartx;
int max_sectors_kb;
char * bl_product;
int deferred_remove;
int delay_watch_checks;
int delay_wait_checks;
- int san_path_err_threshold;
- int san_path_err_forget_rate;
- int san_path_err_recovery_time;
+ int marginal_path_err_sample_time;
+ int marginal_path_err_rate_threshold;
+ int marginal_path_err_recheck_gap_time;
+ int marginal_path_double_failed_time;
int skip_kpartx;
int max_sectors_kb;
uid_t uid;
int processed_main_config;
int delay_watch_checks;
int delay_wait_checks;
- int san_path_err_threshold;
- int san_path_err_forget_rate;
- int san_path_err_recovery_time;
+ int marginal_path_err_sample_time;
+ int marginal_path_err_rate_threshold;
+ int marginal_path_err_recheck_gap_time;
+ int marginal_path_double_failed_time;
int uxsock_timeout;
int strict_timing;
int retrigger_tries;
select_deferred_remove(conf, mpp);
select_delay_watch_checks(conf, mpp);
select_delay_wait_checks(conf, mpp);
- select_san_path_err_threshold(conf, mpp);
- select_san_path_err_forget_rate(conf, mpp);
- select_san_path_err_recovery_time(conf, mpp);
+ select_marginal_path_err_sample_time(conf, mpp);
+ select_marginal_path_err_rate_threshold(conf, mpp);
+ select_marginal_path_err_recheck_gap_time(conf, mpp);
+ select_marginal_path_double_failed_time(conf, mpp);
select_skip_kpartx(conf, mpp);
select_max_sectors_kb(conf, mpp);
declare_hw_snprint(delay_wait_checks, print_off_int_undef)
declare_mp_handler(delay_wait_checks, set_off_int_undef)
declare_mp_snprint(delay_wait_checks, print_off_int_undef)
-declare_def_handler(san_path_err_threshold, set_off_int_undef)
-declare_def_snprint_defint(san_path_err_threshold, print_off_int_undef,
+declare_def_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_err_sample_time, print_off_int_undef,
DEFAULT_ERR_CHECKS)
-declare_ovr_handler(san_path_err_threshold, set_off_int_undef)
-declare_ovr_snprint(san_path_err_threshold, print_off_int_undef)
-declare_hw_handler(san_path_err_threshold, set_off_int_undef)
-declare_hw_snprint(san_path_err_threshold, print_off_int_undef)
-declare_mp_handler(san_path_err_threshold, set_off_int_undef)
-declare_mp_snprint(san_path_err_threshold, print_off_int_undef)
-declare_def_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_def_snprint_defint(san_path_err_forget_rate, print_off_int_undef,
+declare_ovr_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_ovr_snprint(marginal_path_err_sample_time, print_off_int_undef)
+declare_hw_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_hw_snprint(marginal_path_err_sample_time, print_off_int_undef)
+declare_mp_handler(marginal_path_err_sample_time, set_off_int_undef)
+declare_mp_snprint(marginal_path_err_sample_time, print_off_int_undef)
+declare_def_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_err_rate_threshold, print_off_int_undef,
DEFAULT_ERR_CHECKS)
-declare_ovr_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_ovr_snprint(san_path_err_forget_rate, print_off_int_undef)
-declare_hw_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_hw_snprint(san_path_err_forget_rate, print_off_int_undef)
-declare_mp_handler(san_path_err_forget_rate, set_off_int_undef)
-declare_mp_snprint(san_path_err_forget_rate, print_off_int_undef)
-declare_def_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_def_snprint_defint(san_path_err_recovery_time, print_off_int_undef,
+declare_ovr_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_ovr_snprint(marginal_path_err_rate_threshold, print_off_int_undef)
+declare_hw_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_hw_snprint(marginal_path_err_rate_threshold, print_off_int_undef)
+declare_mp_handler(marginal_path_err_rate_threshold, set_off_int_undef)
+declare_mp_snprint(marginal_path_err_rate_threshold, print_off_int_undef)
+declare_def_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_err_recheck_gap_time, print_off_int_undef,
DEFAULT_ERR_CHECKS)
-declare_ovr_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_ovr_snprint(san_path_err_recovery_time, print_off_int_undef)
-declare_hw_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_hw_snprint(san_path_err_recovery_time, print_off_int_undef)
-declare_mp_handler(san_path_err_recovery_time, set_off_int_undef)
-declare_mp_snprint(san_path_err_recovery_time, print_off_int_undef)
+declare_ovr_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_ovr_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef)
+declare_hw_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_hw_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef)
+declare_mp_handler(marginal_path_err_recheck_gap_time, set_off_int_undef)
+declare_mp_snprint(marginal_path_err_recheck_gap_time, print_off_int_undef)
+declare_def_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_def_snprint_defint(marginal_path_double_failed_time, print_off_int_undef,
+ DEFAULT_ERR_CHECKS)
+declare_ovr_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_ovr_snprint(marginal_path_double_failed_time, print_off_int_undef)
+declare_hw_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_hw_snprint(marginal_path_double_failed_time, print_off_int_undef)
+declare_mp_handler(marginal_path_double_failed_time, set_off_int_undef)
+declare_mp_snprint(marginal_path_double_failed_time, print_off_int_undef)
+
static int
def_uxsock_timeout_handler(struct config *conf, vector strvec)
{
install_keyword("config_dir", &def_config_dir_handler, &snprint_def_config_dir);
install_keyword("delay_watch_checks", &def_delay_watch_checks_handler, &snprint_def_delay_watch_checks);
install_keyword("delay_wait_checks", &def_delay_wait_checks_handler, &snprint_def_delay_wait_checks);
- install_keyword("san_path_err_threshold", &def_san_path_err_threshold_handler, &snprint_def_san_path_err_threshold);
- install_keyword("san_path_err_forget_rate", &def_san_path_err_forget_rate_handler, &snprint_def_san_path_err_forget_rate);
- install_keyword("san_path_err_recovery_time", &def_san_path_err_recovery_time_handler, &snprint_def_san_path_err_recovery_time);
+ install_keyword("marginal_path_err_sample_time", &def_marginal_path_err_sample_time_handler, &snprint_def_marginal_path_err_sample_time);
+ install_keyword("marginal_path_err_rate_threshold", &def_marginal_path_err_rate_threshold_handler, &snprint_def_marginal_path_err_rate_threshold);
+ install_keyword("marginal_path_err_recheck_gap_time", &def_marginal_path_err_recheck_gap_time_handler, &snprint_def_marginal_path_err_recheck_gap_time);
+ install_keyword("marginal_path_double_failed_time", &def_marginal_path_double_failed_time_handler, &snprint_def_marginal_path_double_failed_time);
install_keyword("find_multipaths", &def_find_multipaths_handler, &snprint_def_find_multipaths);
install_keyword("uxsock_timeout", &def_uxsock_timeout_handler, &snprint_def_uxsock_timeout);
install_keyword("deferred_remove", &hw_deferred_remove_handler, &snprint_hw_deferred_remove);
install_keyword("delay_watch_checks", &hw_delay_watch_checks_handler, &snprint_hw_delay_watch_checks);
install_keyword("delay_wait_checks", &hw_delay_wait_checks_handler, &snprint_hw_delay_wait_checks);
- install_keyword("san_path_err_threshold", &hw_san_path_err_threshold_handler, &snprint_hw_san_path_err_threshold);
- install_keyword("san_path_err_forget_rate", &hw_san_path_err_forget_rate_handler, &snprint_hw_san_path_err_forget_rate);
- install_keyword("san_path_err_recovery_time", &hw_san_path_err_recovery_time_handler, &snprint_hw_san_path_err_recovery_time);
+ install_keyword("marginal_path_err_sample_time", &hw_marginal_path_err_sample_time_handler, &snprint_hw_marginal_path_err_sample_time);
+ install_keyword("marginal_path_err_rate_threshold", &hw_marginal_path_err_rate_threshold_handler, &snprint_hw_marginal_path_err_rate_threshold);
+ install_keyword("marginal_path_err_recheck_gap_time", &hw_marginal_path_err_recheck_gap_time_handler, &snprint_hw_marginal_path_err_recheck_gap_time);
+ install_keyword("marginal_path_double_failed_time", &hw_marginal_path_double_failed_time_handler, &snprint_hw_marginal_path_double_failed_time);
install_keyword("skip_kpartx", &hw_skip_kpartx_handler, &snprint_hw_skip_kpartx);
install_keyword("max_sectors_kb", &hw_max_sectors_kb_handler, &snprint_hw_max_sectors_kb);
install_sublevel_end();
install_keyword("deferred_remove", &ovr_deferred_remove_handler, &snprint_ovr_deferred_remove);
install_keyword("delay_watch_checks", &ovr_delay_watch_checks_handler, &snprint_ovr_delay_watch_checks);
install_keyword("delay_wait_checks", &ovr_delay_wait_checks_handler, &snprint_ovr_delay_wait_checks);
- install_keyword("san_path_err_threshold", &ovr_san_path_err_threshold_handler, &snprint_ovr_san_path_err_threshold);
- install_keyword("san_path_err_forget_rate", &ovr_san_path_err_forget_rate_handler, &snprint_ovr_san_path_err_forget_rate);
- install_keyword("san_path_err_recovery_time", &ovr_san_path_err_recovery_time_handler, &snprint_ovr_san_path_err_recovery_time);
+ install_keyword("marginal_path_err_sample_time", &ovr_marginal_path_err_sample_time_handler, &snprint_ovr_marginal_path_err_sample_time);
+ install_keyword("marginal_path_err_rate_threshold", &ovr_marginal_path_err_rate_threshold_handler, &snprint_ovr_marginal_path_err_rate_threshold);
+ install_keyword("marginal_path_err_recheck_gap_time", &ovr_marginal_path_err_recheck_gap_time_handler, &snprint_ovr_marginal_path_err_recheck_gap_time);
+ install_keyword("marginal_path_double_failed_time", &ovr_marginal_path_double_failed_time_handler, &snprint_ovr_marginal_path_double_failed_time);
install_keyword("skip_kpartx", &ovr_skip_kpartx_handler, &snprint_ovr_skip_kpartx);
install_keyword("max_sectors_kb", &ovr_max_sectors_kb_handler, &snprint_ovr_max_sectors_kb);
install_keyword("deferred_remove", &mp_deferred_remove_handler, &snprint_mp_deferred_remove);
install_keyword("delay_watch_checks", &mp_delay_watch_checks_handler, &snprint_mp_delay_watch_checks);
install_keyword("delay_wait_checks", &mp_delay_wait_checks_handler, &snprint_mp_delay_wait_checks);
- install_keyword("san_path_err_threshold", &mp_san_path_err_threshold_handler, &snprint_mp_san_path_err_threshold);
- install_keyword("san_path_err_forget_rate", &mp_san_path_err_forget_rate_handler, &snprint_mp_san_path_err_forget_rate);
- install_keyword("san_path_err_recovery_time", &mp_san_path_err_recovery_time_handler, &snprint_mp_san_path_err_recovery_time);
+ install_keyword("marginal_path_err_sample_time", &mp_marginal_path_err_sample_time_handler, &snprint_mp_marginal_path_err_sample_time);
+ install_keyword("marginal_path_err_rate_threshold", &mp_marginal_path_err_rate_threshold_handler, &snprint_mp_marginal_path_err_rate_threshold);
+ install_keyword("marginal_path_err_recheck_gap_time", &mp_marginal_path_err_recheck_gap_time_handler, &snprint_mp_marginal_path_err_recheck_gap_time);
+ install_keyword("marginal_path_double_failed_time", &mp_marginal_path_double_failed_time_handler, &snprint_mp_marginal_path_double_failed_time);
install_keyword("skip_kpartx", &mp_skip_kpartx_handler, &snprint_mp_skip_kpartx);
install_keyword("max_sectors_kb", &mp_max_sectors_kb_handler, &snprint_mp_max_sectors_kb);
install_sublevel_end();
}
checker_set_fd(c, pp->fd);
if (checker_init(c, pp->mpp?&pp->mpp->mpcontext:NULL)) {
- memset(c, 0x0, sizeof(struct checker));
+ checker_clear(c);
condlog(3, "%s: checker init failed", pp->dev);
return PATH_UNCHECKED;
}
},
{
/* XIV Storage System / FlashSystem A9000/A9000R */
- .vendor = "IBM",
- .product = "2810XIV",
+ .vendor = "(XIV|IBM)",
+ .product = "(NEXTRA|2810XIV)",
.no_path_retry = NO_PATH_RETRY_QUEUE,
.pgpolicy = MULTIBUS,
},
{
- /* FlashSystem 710/720/810/820/840/900 */
- .vendor = "IBM",
- .product = "FlashSystem",
+ /* TMS RamSan / FlashSystem 710/720/810/820/840/900 */
+ .vendor = "(TMS|IBM)",
+ .product = "(RamSan|FlashSystem)",
.pgpolicy = MULTIBUS,
},
{
/* OceanStor V3 */
.vendor = "HUAWEI",
.product = "XSG1",
- .pgpolicy = MULTIBUS,
+ .pgpolicy = GROUP_BY_PRIO,
+ .prio_name = PRIO_ALUA,
},
/*
* Red Hat
.pgpolicy = MULTIBUS,
.no_path_retry = 30,
},
+ {
+ /* Magnitude family */
+ .vendor = "(XIOTECH|XIOtech)",
+ .product = "Magnitude",
+ .pgpolicy = MULTIBUS,
+ .no_path_retry = 30,
+ },
/*
* Violin Memory
*/
.prio_name = PRIO_ALUA,
.no_path_retry = 30,
},
+ /*
+ * AccelStor
+ */
+ {
+ /* NeoSapphire */
+ .vendor = "AStor",
+ .product = "NeoSapphire",
+ .pgpolicy = MULTIBUS,
+ .no_path_retry = 30,
+ },
/*
* EOL
*/
--- /dev/null
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
+ *
+ * io_err_stat.c
+ * version 1.0
+ *
+ * I/O error statistics process for path failure events from the kernel
+ *
+ * Author(s): Guan Junxiong 2017 <guanjunxiong@huawei.com>
+ *
+ * This file is released under the GPL version 2, or any later version.
+ */
+
+#include <unistd.h>
+#include <pthread.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <libaio.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "vector.h"
+#include "memory.h"
+#include "checkers.h"
+#include "config.h"
+#include "structs.h"
+#include "structs_vec.h"
+#include "devmapper.h"
+#include "debug.h"
+#include "lock.h"
+#include "time-util.h"
+#include "io_err_stat.h"
+
+#define IOTIMEOUT_SEC 60
+#define TIMEOUT_NO_IO_NSEC 10000000 /* 10ms = 10000000ns */
+#define FLAKY_PATHFAIL_THRESHOLD 2
+#define CONCUR_NR_EVENT 32
+
+#define PATH_IO_ERR_IN_CHECKING -1
+#define PATH_IO_ERR_IN_POLLING_RECHECK -2
+
+#define io_err_stat_log(prio, fmt, args...) \
+ condlog(prio, "io error statistic: " fmt, ##args)
+
+
+struct io_err_stat_pathvec {
+ pthread_mutex_t mutex;
+ vector pathvec;
+};
+
+struct dio_ctx {
+ struct timespec io_starttime;
+ int blksize;
+ void *buf;
+ struct iocb io;
+};
+
+struct io_err_stat_path {
+ char devname[FILE_NAME_SIZE];
+ int fd;
+ struct dio_ctx *dio_ctx_array;
+ int io_err_nr;
+ int io_nr;
+ struct timespec start_time;
+
+ int total_time;
+ int err_rate_threshold;
+};
+
+pthread_t io_err_stat_thr;
+pthread_attr_t io_err_stat_attr;
+
+static struct io_err_stat_pathvec *paths;
+struct vectors *vecs;
+io_context_t ioctx;
+
+static void cancel_inflight_io(struct io_err_stat_path *pp);
+
+static void rcu_unregister(void *param)
+{
+ rcu_unregister_thread();
+}
+
+struct io_err_stat_path *find_err_path_by_dev(vector pathvec, char *dev)
+{
+ int i;
+ struct io_err_stat_path *pp;
+
+ if (!pathvec)
+ return NULL;
+ vector_foreach_slot(pathvec, pp, i)
+ if (!strcmp(pp->devname, dev))
+ return pp;
+
+ io_err_stat_log(4, "%s: not found in check queue", dev);
+
+ return NULL;
+}
+
+static int init_each_dio_ctx(struct dio_ctx *ct, int blksize,
+ unsigned long pgsize)
+{
+ ct->blksize = blksize;
+ if (posix_memalign(&ct->buf, pgsize, blksize))
+ return 1;
+ memset(ct->buf, 0, blksize);
+ ct->io_starttime.tv_sec = 0;
+ ct->io_starttime.tv_nsec = 0;
+
+ return 0;
+}
+
+static void deinit_each_dio_ctx(struct dio_ctx *ct)
+{
+ if (ct->buf)
+ free(ct->buf);
+}
+
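+/*
+ * Open /dev/<devname> with O_DIRECT and allocate CONCUR_NR_EVENT
+ * direct-I/O contexts sized to the device block size (falling back
+ * to 512 if the BLKBSZGET ioctl fails).
+ */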
+static int setup_directio_ctx(struct io_err_stat_path *p)
+{
+ unsigned long pgsize = getpagesize();
+ char fpath[PATH_MAX];
+ int blksize = 0;
+ int i;
+
+ if (snprintf(fpath, PATH_MAX, "/dev/%s", p->devname) >= PATH_MAX)
+ return 1;
+ if (p->fd < 0)
+ p->fd = open(fpath, O_RDONLY | O_DIRECT);
+ if (p->fd < 0)
+ return 1;
+
+ p->dio_ctx_array = MALLOC(sizeof(struct dio_ctx) * CONCUR_NR_EVENT);
+ if (!p->dio_ctx_array)
+ goto fail_close;
+
+ if (ioctl(p->fd, BLKBSZGET, &blksize) < 0) {
+ io_err_stat_log(4, "%s:cannot get blocksize, set default 512",
+ p->devname);
+ blksize = 512;
+ }
+ if (!blksize)
+ goto free_pdctx;
+
+ for (i = 0; i < CONCUR_NR_EVENT; i++) {
+ if (init_each_dio_ctx(p->dio_ctx_array + i, blksize, pgsize))
+ goto deinit;
+ }
+ return 0;
+
+deinit:
+ for (i = 0; i < CONCUR_NR_EVENT; i++)
+ deinit_each_dio_ctx(p->dio_ctx_array + i);
+free_pdctx:
+ FREE(p->dio_ctx_array);
+fail_close:
+ close(p->fd);
+
+ return 1;
+}
+
+static void destroy_directio_ctx(struct io_err_stat_path *p)
+{
+ int i;
+
+ if (!p || !p->dio_ctx_array)
+ return;
+ cancel_inflight_io(p);
+
+ for (i = 0; i < CONCUR_NR_EVENT; i++)
+ deinit_each_dio_ctx(p->dio_ctx_array + i);
+ FREE(p->dio_ctx_array);
+
+	if (p->fd >= 0)
+ close(p->fd);
+}
+
+static struct io_err_stat_path *alloc_io_err_stat_path(void)
+{
+ struct io_err_stat_path *p;
+
+ p = (struct io_err_stat_path *)MALLOC(sizeof(*p));
+ if (!p)
+ return NULL;
+
+ memset(p->devname, 0, sizeof(p->devname));
+ p->io_err_nr = 0;
+ p->io_nr = 0;
+ p->total_time = 0;
+ p->start_time.tv_sec = 0;
+ p->start_time.tv_nsec = 0;
+ p->err_rate_threshold = 0;
+ p->fd = -1;
+
+ return p;
+}
+
+static void free_io_err_stat_path(struct io_err_stat_path *p)
+{
+ FREE(p);
+}
+
+static struct io_err_stat_pathvec *alloc_pathvec(void)
+{
+ struct io_err_stat_pathvec *p;
+ int r;
+
+ p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p));
+ if (!p)
+ return NULL;
+ p->pathvec = vector_alloc();
+ if (!p->pathvec)
+ goto out_free_struct_pathvec;
+ r = pthread_mutex_init(&p->mutex, NULL);
+ if (r)
+ goto out_free_member_pathvec;
+
+ return p;
+
+out_free_member_pathvec:
+ vector_free(p->pathvec);
+out_free_struct_pathvec:
+ FREE(p);
+ return NULL;
+}
+
+static void free_io_err_pathvec(struct io_err_stat_pathvec *p)
+{
+ struct io_err_stat_path *path;
+ int i;
+
+ if (!p)
+ return;
+ pthread_mutex_destroy(&p->mutex);
+	if (p->pathvec) {
+ vector_foreach_slot(p->pathvec, path, i) {
+ destroy_directio_ctx(path);
+ free_io_err_stat_path(path);
+ }
+ vector_free(p->pathvec);
+ }
+ FREE(p);
+}
+
+/*
+ * Return value:
+ * 0: enqueued OK
+ * 1: failed because of an internal error
+ * 2: failed because the path is already enqueued
+ */
+static int enqueue_io_err_stat_by_path(struct path *path)
+{
+ struct io_err_stat_path *p;
+
+ pthread_mutex_lock(&paths->mutex);
+ p = find_err_path_by_dev(paths->pathvec, path->dev);
+ if (p) {
+ pthread_mutex_unlock(&paths->mutex);
+ return 2;
+ }
+ pthread_mutex_unlock(&paths->mutex);
+
+ p = alloc_io_err_stat_path();
+ if (!p)
+ return 1;
+
+ memcpy(p->devname, path->dev, sizeof(p->devname));
+ p->total_time = path->mpp->marginal_path_err_sample_time;
+ p->err_rate_threshold = path->mpp->marginal_path_err_rate_threshold;
+
+ if (setup_directio_ctx(p))
+ goto free_ioerr_path;
+ pthread_mutex_lock(&paths->mutex);
+ if (!vector_alloc_slot(paths->pathvec))
+ goto unlock_destroy;
+ vector_set_slot(paths->pathvec, p);
+ pthread_mutex_unlock(&paths->mutex);
+
+ if (!path->io_err_disable_reinstate) {
+ /*
+		 * fail the path in the kernel for the duration of the test
+		 * to make the test more reliable
+ */
+ io_err_stat_log(3, "%s: fail dm path %s before checking",
+ path->mpp->alias, path->dev);
+ path->io_err_disable_reinstate = 1;
+ dm_fail_path(path->mpp->alias, path->dev_t);
+ update_queue_mode_del_path(path->mpp);
+
+ /*
+ * schedule path check as soon as possible to
+ * update path state to delayed state
+ */
+ path->tick = 1;
+ }
+ io_err_stat_log(2, "%s: enqueue path %s to check",
+ path->mpp->alias, path->dev);
+ return 0;
+
+unlock_destroy:
+ pthread_mutex_unlock(&paths->mutex);
+ destroy_directio_ctx(p);
+free_ioerr_path:
+ free_io_err_stat_path(p);
+
+ return 1;
+}
+
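+/*
+ * Entry point for path-failure events: count repeated failures of a
+ * path within marginal_path_double_failed_time seconds and, once the
+ * flakiness threshold is reached, fail the path and enqueue it for
+ * I/O error rate sampling.
+ */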
+int io_err_stat_handle_pathfail(struct path *path)
+{
+ struct timespec curr_time;
+ int res;
+
+ if (path->io_err_disable_reinstate) {
+ io_err_stat_log(3, "%s: reinstate is already disabled",
+ path->dev);
+ return 1;
+ }
+ if (path->io_err_pathfail_cnt < 0)
+ return 1;
+
+ if (!path->mpp)
+ return 1;
+ if (path->mpp->nr_active <= 1)
+ return 1;
+ if (path->mpp->marginal_path_double_failed_time <= 0 ||
+ path->mpp->marginal_path_err_sample_time <= 0 ||
+ path->mpp->marginal_path_err_recheck_gap_time <= 0 ||
+ path->mpp->marginal_path_err_rate_threshold < 0) {
+ io_err_stat_log(4, "%s: parameter not set", path->mpp->alias);
+ return 1;
+ }
+ if (path->mpp->marginal_path_err_sample_time < (2 * IOTIMEOUT_SEC)) {
+		io_err_stat_log(2, "%s: marginal_path_err_sample_time should not be less than %d",
+ path->mpp->alias, 2 * IOTIMEOUT_SEC);
+ return 1;
+ }
+ /*
+ * The test should only be started for paths that have failed
+ * repeatedly in a certain time frame, so that we have reason
+	 * to assume they're flaky. Rather than asking the admin to configure
+	 * the repeated count threshold and time frame, we assume a path
+	 * which fails at least twice within marginal_path_double_failed_time
+	 * seconds is flaky.
+ */
+ if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
+ return 1;
+ if (path->io_err_pathfail_cnt == 0) {
+ path->io_err_pathfail_cnt++;
+ path->io_err_pathfail_starttime = curr_time.tv_sec;
+ io_err_stat_log(5, "%s: start path flakiness pre-checking",
+ path->dev);
+ return 0;
+ }
+ if ((curr_time.tv_sec - path->io_err_pathfail_starttime) >
+ path->mpp->marginal_path_double_failed_time) {
+ path->io_err_pathfail_cnt = 0;
+ path->io_err_pathfail_starttime = curr_time.tv_sec;
+ io_err_stat_log(5, "%s: restart path flakiness pre-checking",
+ path->dev);
+ }
+ path->io_err_pathfail_cnt++;
+ if (path->io_err_pathfail_cnt >= FLAKY_PATHFAIL_THRESHOLD) {
+ res = enqueue_io_err_stat_by_path(path);
+ if (!res)
+ path->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
+ else
+ path->io_err_pathfail_cnt = 0;
+ }
+
+ return 0;
+}
+
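+/*
+ * Called from the checker loop for paths whose reinstatement is
+ * disabled. Returns 1 while the path should stay in the delayed
+ * state and 0 once it may be recovered; after
+ * marginal_path_err_recheck_gap_time has passed, the path is
+ * re-enqueued for another sampling round.
+ */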
+int hit_io_err_recheck_time(struct path *pp)
+{
+ struct timespec curr_time;
+ int r;
+
+ if (pp->io_err_disable_reinstate == 0)
+ return 1;
+ if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
+ return 1;
+ if (pp->io_err_pathfail_cnt != PATH_IO_ERR_IN_POLLING_RECHECK)
+ return 1;
+ if (pp->mpp->nr_active <= 0) {
+ io_err_stat_log(2, "%s: recover path early", pp->dev);
+ goto recover;
+ }
+ if ((curr_time.tv_sec - pp->io_err_dis_reinstate_time) >
+ pp->mpp->marginal_path_err_recheck_gap_time) {
+ io_err_stat_log(4, "%s: reschedule checking after %d seconds",
+ pp->dev,
+ pp->mpp->marginal_path_err_recheck_gap_time);
+ /*
+		 * Reschedule the I/O error checking; if the path turns out
+		 * to be good enough, it can be reinstated as soon as
+		 * possible in the check_path routine.
+ */
+ pp->io_err_dis_reinstate_time = curr_time.tv_sec;
+ r = enqueue_io_err_stat_by_path(pp);
+ /*
+		 * If the enqueue fails because of an internal error,
+		 * recover this path; otherwise return 1 to set the path
+		 * state to PATH_SHAKY.
+ */
+ if (r == 1) {
+ io_err_stat_log(3, "%s: enqueue fails, to recover",
+ pp->dev);
+ goto recover;
+ } else if (!r) {
+ pp->io_err_pathfail_cnt = PATH_IO_ERR_IN_CHECKING;
+ }
+ }
+
+ return 1;
+
+recover:
+ pp->io_err_pathfail_cnt = 0;
+ pp->io_err_disable_reinstate = 0;
+ pp->tick = 1;
+ return 0;
+}
+
+static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
+{
+ int i;
+
+ i = find_slot(paths->pathvec, p);
+ if (i != -1)
+ vector_del_slot(paths->pathvec, i);
+
+ destroy_directio_ctx(p);
+ free_io_err_stat_path(p);
+
+ return 0;
+}
+
+static void account_async_io_state(struct io_err_stat_path *pp, int rc)
+{
+ switch (rc) {
+ case PATH_DOWN:
+ case PATH_TIMEOUT:
+ pp->io_err_nr++;
+ break;
+ case PATH_UNCHECKED:
+ case PATH_UP:
+ case PATH_PENDING:
+ break;
+ default:
+ break;
+ }
+}
+
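+/*
+ * Once the sampling window of a path has elapsed, compute the error
+ * rate per 1000 I/Os and either re-enable reinstating (rate at or
+ * below the threshold), keep the path failed pending a later recheck,
+ * or re-enable it anyway if it is orphaned or the last active path.
+ */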
+static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
+{
+ struct timespec currtime, difftime;
+ struct path *path;
+ double err_rate;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
+ return 1;
+ timespecsub(&currtime, &pp->start_time, &difftime);
+ if (difftime.tv_sec < pp->total_time)
+ return 0;
+
+ io_err_stat_log(4, "%s: check end", pp->devname);
+
+ err_rate = pp->io_nr == 0 ? 0 : (pp->io_err_nr * 1000.0f) / pp->io_nr;
+ io_err_stat_log(3, "%s: IO error rate (%.1f/1000)",
+ pp->devname, err_rate);
+ pthread_cleanup_push(cleanup_lock, &vecs->lock);
+ lock(&vecs->lock);
+ pthread_testcancel();
+ path = find_path_by_dev(vecs->pathvec, pp->devname);
+ if (!path) {
+		io_err_stat_log(4, "path %s not found", pp->devname);
+ } else if (err_rate <= pp->err_rate_threshold) {
+ path->io_err_pathfail_cnt = 0;
+ path->io_err_disable_reinstate = 0;
+ io_err_stat_log(3, "%s: (%d/%d) good to enable reinstating",
+ pp->devname, pp->io_err_nr, pp->io_nr);
+ /*
+ * schedule path check as soon as possible to
+ * update path state. Do NOT reinstate dm path here
+ */
+ path->tick = 1;
+
+ } else if (path->mpp && path->mpp->nr_active > 1) {
+ io_err_stat_log(3, "%s: keep failing the dm path %s",
+ path->mpp->alias, path->dev);
+ path->io_err_pathfail_cnt = PATH_IO_ERR_IN_POLLING_RECHECK;
+ path->io_err_disable_reinstate = 1;
+ path->io_err_dis_reinstate_time = currtime.tv_sec;
+ io_err_stat_log(3, "%s: disable reinstating of %s",
+ path->mpp->alias, path->dev);
+ } else {
+ path->io_err_pathfail_cnt = 0;
+ path->io_err_disable_reinstate = 0;
+		io_err_stat_log(3, "%s: orphan path, enable reinstating",
+ pp->devname);
+ }
+ lock_cleanup_pop(vecs->lock);
+
+ delete_io_err_stat_by_addr(pp);
+
+ return 0;
+}
+
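+/*
+ * Submit one asynchronous direct read on the path fd if this slot is
+ * idle (start time of zero), recording the submission time for
+ * timeout accounting.
+ */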
+static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
+{
+ int rc = -1;
+
+ if (ct->io_starttime.tv_nsec == 0 &&
+ ct->io_starttime.tv_sec == 0) {
+ struct iocb *ios[1] = { &ct->io };
+
+ if (clock_gettime(CLOCK_MONOTONIC, &ct->io_starttime) != 0) {
+ ct->io_starttime.tv_sec = 0;
+ ct->io_starttime.tv_nsec = 0;
+ return rc;
+ }
+ io_prep_pread(&ct->io, fd, ct->buf, ct->blksize, 0);
+ if (io_submit(ioctx, 1, ios) != 1) {
+ io_err_stat_log(5, "%s: io_submit error %i",
+ dev, errno);
+ return rc;
+ }
+ rc = 0;
+ }
+
+ return rc;
+}
+
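+/*
+ * Issue a batch of CONCUR_NR_EVENT reads on the path, but stop
+ * issuing new I/O near the end of the sample window so that what is
+ * already in flight can complete or time out.
+ */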
+static void send_batch_async_ios(struct io_err_stat_path *pp)
+{
+ int i;
+ struct dio_ctx *ct;
+ struct timespec currtime, difftime;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
+ return;
+ /*
+	 * Leave time at the end of the sample window for in-flight I/O
+	 * to complete or time out
+ */
+ if (pp->start_time.tv_sec != 0) {
+ timespecsub(&currtime, &pp->start_time, &difftime);
+ if (difftime.tv_sec + IOTIMEOUT_SEC >= pp->total_time)
+ return;
+ }
+
+ for (i = 0; i < CONCUR_NR_EVENT; i++) {
+ ct = pp->dio_ctx_array + i;
+ if (!send_each_async_io(ct, pp->fd, pp->devname))
+ pp->io_nr++;
+ }
+ if (pp->start_time.tv_sec == 0 && pp->start_time.tv_nsec == 0 &&
+ clock_gettime(CLOCK_MONOTONIC, &pp->start_time)) {
+ pp->start_time.tv_sec = 0;
+ pp->start_time.tv_nsec = 0;
+ }
+}
+
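+/*
+ * Cancel an in-flight read that has exceeded IOTIMEOUT_SEC and
+ * account it as PATH_TIMEOUT; reads still within the limit are
+ * reported as PATH_PENDING.
+ */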
+static int try_to_cancel_timeout_io(struct dio_ctx *ct, struct timespec *t,
+ char *dev)
+{
+ struct timespec difftime;
+ struct io_event event;
+ int rc = PATH_UNCHECKED;
+ int r;
+
+ if (ct->io_starttime.tv_sec == 0)
+ return rc;
+ timespecsub(t, &ct->io_starttime, &difftime);
+ if (difftime.tv_sec > IOTIMEOUT_SEC) {
+ struct iocb *ios[1] = { &ct->io };
+
+ io_err_stat_log(5, "%s: abort check on timeout", dev);
+ r = io_cancel(ioctx, ios[0], &event);
+ if (r)
+ io_err_stat_log(5, "%s: io_cancel error %i",
+ dev, errno);
+ ct->io_starttime.tv_sec = 0;
+ ct->io_starttime.tv_nsec = 0;
+ rc = PATH_TIMEOUT;
+ } else {
+ rc = PATH_PENDING;
+ }
+
+ return rc;
+}
+
+static void poll_async_io_timeout(void)
+{
+ struct io_err_stat_path *pp;
+ struct timespec curr_time;
+ int rc = PATH_UNCHECKED;
+ int i, j;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
+ return;
+ vector_foreach_slot(paths->pathvec, pp, i) {
+ for (j = 0; j < CONCUR_NR_EVENT; j++) {
+ rc = try_to_cancel_timeout_io(pp->dio_ctx_array + j,
+ &curr_time, pp->devname);
+ account_async_io_state(pp, rc);
+ }
+ }
+}
+
+static void cancel_inflight_io(struct io_err_stat_path *pp)
+{
+ struct io_event event;
+ int i, r;
+
+ for (i = 0; i < CONCUR_NR_EVENT; i++) {
+ struct dio_ctx *ct = pp->dio_ctx_array + i;
+ struct iocb *ios[1] = { &ct->io };
+
+ if (ct->io_starttime.tv_sec == 0
+ && ct->io_starttime.tv_nsec == 0)
+ continue;
+		io_err_stat_log(5, "%s: abort inflight io",
+ pp->devname);
+ r = io_cancel(ioctx, ios[0], &event);
+ if (r)
+ io_err_stat_log(5, "%s: io_cancel error %d, %i",
+ pp->devname, r, errno);
+ ct->io_starttime.tv_sec = 0;
+ ct->io_starttime.tv_nsec = 0;
+ }
+}
+
+static inline int handle_done_dio_ctx(struct dio_ctx *ct, struct io_event *ev)
+{
+ ct->io_starttime.tv_sec = 0;
+ ct->io_starttime.tv_nsec = 0;
+ return (ev->res == ct->blksize) ? PATH_UP : PATH_DOWN;
+}
+
+static void handle_async_io_done_event(struct io_event *io_evt)
+{
+ struct io_err_stat_path *pp;
+ struct dio_ctx *ct;
+ int rc = PATH_UNCHECKED;
+ int i, j;
+
+ vector_foreach_slot(paths->pathvec, pp, i) {
+ for (j = 0; j < CONCUR_NR_EVENT; j++) {
+ ct = pp->dio_ctx_array + j;
+ if (&ct->io == io_evt->obj) {
+ rc = handle_done_dio_ctx(ct, io_evt);
+ account_async_io_state(pp, rc);
+ return;
+ }
+ }
+ }
+}
+
+static void process_async_ios_event(int timeout_nsecs, char *dev)
+{
+ struct io_event events[CONCUR_NR_EVENT];
+ int i, n;
+ struct timespec timeout = { .tv_nsec = timeout_nsecs };
+
+ errno = 0;
+ n = io_getevents(ioctx, 1L, CONCUR_NR_EVENT, events, &timeout);
+ if (n < 0) {
+ io_err_stat_log(3, "%s: async io events returned %d (errno=%s)",
+ dev, n, strerror(errno));
+ } else {
+ for (i = 0; i < n; i++)
+ handle_async_io_done_event(&events[i]);
+ }
+}
+
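+/*
+ * One pass of the statistics loop: for each monitored path, submit
+ * reads, reap completions (waiting at most TIMEOUT_NO_IO_NSEC),
+ * expire timed-out I/Os and finalize paths whose sample window has
+ * ended.
+ */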
+static void service_paths(void)
+{
+ struct io_err_stat_path *pp;
+ int i;
+
+ pthread_mutex_lock(&paths->mutex);
+ vector_foreach_slot(paths->pathvec, pp, i) {
+ send_batch_async_ios(pp);
+ process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname);
+ poll_async_io_timeout();
+ poll_io_err_stat(vecs, pp);
+ }
+ pthread_mutex_unlock(&paths->mutex);
+}
+
+static void *io_err_stat_loop(void *data)
+{
+ vecs = (struct vectors *)data;
+ pthread_cleanup_push(rcu_unregister, NULL);
+ rcu_register_thread();
+
+ mlockall(MCL_CURRENT | MCL_FUTURE);
+ while (1) {
+ service_paths();
+ usleep(100000);
+ }
+
+ pthread_cleanup_pop(1);
+ return NULL;
+}
+
+int start_io_err_stat_thread(void *data)
+{
+ if (io_setup(CONCUR_NR_EVENT, &ioctx) != 0) {
+ io_err_stat_log(4, "io_setup failed");
+ return 1;
+ }
+ paths = alloc_pathvec();
+ if (!paths)
+ goto destroy_ctx;
+
+ if (pthread_create(&io_err_stat_thr, &io_err_stat_attr,
+ io_err_stat_loop, data)) {
+ io_err_stat_log(0, "cannot create io_error statistic thread");
+ goto out_free;
+ }
+ io_err_stat_log(3, "thread started");
+ return 0;
+
+out_free:
+ free_io_err_pathvec(paths);
+destroy_ctx:
+ io_destroy(ioctx);
+ io_err_stat_log(0, "failed to start io_error statistic thread");
+ return 1;
+}
+
+void stop_io_err_stat_thread(void)
+{
+ pthread_cancel(io_err_stat_thr);
+ pthread_kill(io_err_stat_thr, SIGUSR2);
+ free_io_err_pathvec(paths);
+ io_destroy(ioctx);
+}
--- /dev/null
+#ifndef _IO_ERR_STAT_H
+#define _IO_ERR_STAT_H
+
+#include "vector.h"
+#include "lock.h"
+
+
+extern pthread_attr_t io_err_stat_attr;
+
+int start_io_err_stat_thread(void *data);
+void stop_io_err_stat_thread(void);
+int io_err_stat_handle_pathfail(struct path *path);
+int hit_io_err_recheck_time(struct path *pp);
+
+#endif /* _IO_ERR_STAT_H */
* scale, the priority "rc" of each path can be provided.
*
* Author(s): Yang Feng <philip.yang@huawei.com>
+ * Revised: Guan Junxiong <guanjunxiong@huawei.com>
*
* This file is released under the GPL version 2, or any later version.
*/
+#define _GNU_SOURCE
#include <stdio.h>
#include <math.h>
#include <ctype.h>
#include <time.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <unistd.h>
#include "debug.h"
#include "prio.h"
#include "structs.h"
-#include "../checkers/libsg.h"
+#include "util.h"
#define pp_pl_log(prio, fmt, args...) condlog(prio, "path_latency prio: " fmt, ##args)
#define MAX_IO_NUM 200
-#define MIN_IO_NUM 2
+#define MIN_IO_NUM 20
+#define DEF_IO_NUM 100
#define MAX_BASE_NUM 10
-#define MIN_BASE_NUM 2
+#define MIN_BASE_NUM 1.01
+#define DEF_BASE_NUM 1.5
#define MAX_AVG_LATENCY 100000000. /* Unit: us */
#define MIN_AVG_LATENCY 1. /* Unit: us */
#define DEFAULT_PRIORITY 0
-#define MAX_CHAR_SIZE 30
-
#define USEC_PER_SEC 1000000LL
#define NSEC_PER_USEC 1000LL
-static long long path_latency[MAX_IO_NUM];
+#define DEF_BLK_SIZE 4096
+
+static double lg_path_latency[MAX_IO_NUM];
static inline long long timeval_to_us(const struct timespec *tv)
{
(tv->tv_nsec / NSEC_PER_USEC);
}
-static int do_readsector0(int fd, unsigned int timeout)
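+/*
+ * Prepare the path fd for direct reads: query the block size (falling
+ * back to DEF_BLK_SIZE), allocate a page-aligned buffer, and set
+ * O_DIRECT if not already set, remembering whether the flags must be
+ * restored afterwards.
+ */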
+static int prepare_directio_read(int fd, int *blksz, char **pbuf,
+ int *restore_flags)
+{
+ unsigned long pgsize = getpagesize();
+ long flags;
+
+ if (ioctl(fd, BLKBSZGET, blksz) < 0) {
+		pp_pl_log(3, "cannot get blocksize, set default");
+ *blksz = DEF_BLK_SIZE;
+ }
+ if (posix_memalign((void **)pbuf, pgsize, *blksz))
+ return -1;
+
+ flags = fcntl(fd, F_GETFL);
+ if (flags < 0)
+ goto free_out;
+ if (!(flags & O_DIRECT)) {
+ flags |= O_DIRECT;
+ if (fcntl(fd, F_SETFL, flags) < 0)
+ goto free_out;
+ *restore_flags = 1;
+ }
+
+ return 0;
+
+free_out:
+ free(*pbuf);
+
+ return -1;
+}
+
+static void cleanup_directio_read(int fd, char *buf, int restore_flags)
{
- unsigned char buf[4096];
- unsigned char sbuf[SENSE_BUFF_LEN];
+ long flags;
+
+ free(buf);
+
+ if (!restore_flags)
+ return;
+ if ((flags = fcntl(fd, F_GETFL)) >= 0) {
+ int ret __attribute__ ((unused));
+ flags &= ~O_DIRECT;
+ /* No point in checking for errors */
+ ret = fcntl(fd, F_SETFL, flags);
+ }
+}
+
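+/*
+ * Perform one read of sz bytes from offset 0, waiting via select()
+ * for the fd to become readable for at most the checker timeout.
+ * Returns 0 on a complete read, -1 otherwise.
+ */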
+static int do_directio_read(int fd, unsigned int timeout, char *buf, int sz)
+{
+ fd_set read_fds;
+ struct timeval tm = { .tv_sec = timeout };
int ret;
+ int num_read;
- ret = sg_read(fd, &buf[0], 4096, &sbuf[0], SENSE_BUFF_LEN, timeout);
+ if (lseek(fd, 0, SEEK_SET) == -1)
+ return -1;
+ FD_ZERO(&read_fds);
+ FD_SET(fd, &read_fds);
+ ret = select(fd+1, &read_fds, NULL, NULL, &tm);
+ if (ret <= 0)
+ return -1;
+ num_read = read(fd, buf, sz);
+ if (num_read != sz)
+ return -1;
- return ret;
+ return 0;
}
-int check_args_valid(int io_num, int base_num)
+int check_args_valid(int io_num, double base_num)
{
if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM)) {
pp_pl_log(0, "args io_num is outside the valid range");
}
/*
- * In multipath.conf, args form: io_num|base_num. For example,
- * args is "20|10", this function can get io_num value 20, and
+ * In multipath.conf, args take the form "io_num=n base_num=m". For example,
+ * from args of "io_num=20 base_num=10" this function gets io_num value 20 and
* base_num value 10.
*/
-static int get_ionum_and_basenum(char *args, int *ionum, int *basenum)
+static int get_ionum_and_basenum(char *args, int *ionum, double *basenum)
{
- char source[MAX_CHAR_SIZE];
- char vertica = '|';
- char *endstrbefore = NULL;
- char *endstrafter = NULL;
- unsigned int size = strlen(args);
+ char split_char[] = " \t";
+ char *arg, *temp;
+ char *str, *str_inval;
+ int i;
+ int flag_io = 0, flag_base = 0;
if ((args == NULL) || (ionum == NULL) || (basenum == NULL)) {
pp_pl_log(0, "args string is NULL");
return 0;
}
- if ((size < 1) || (size > MAX_CHAR_SIZE - 1)) {
- pp_pl_log(0, "args string's size is too long");
+ arg = temp = STRDUP(args);
+ if (!arg)
return 0;
- }
- memcpy(source, args, size + 1);
-
- if (!isdigit(source[0])) {
- pp_pl_log(0, "invalid prio_args format: %s", source);
- return 0;
- }
-
- *ionum = (int)strtoul(source, &endstrbefore, 10);
- if (endstrbefore[0] != vertica) {
- pp_pl_log(0, "invalid prio_args format: %s", source);
- return 0;
+ for (i = 0; i < 2; i++) {
+ str = get_next_string(&temp, split_char);
+ if (!str)
+ goto out;
+ if (!strncmp(str, "io_num=", 7) && strlen(str) > 7) {
+ *ionum = (int)strtoul(str + 7, &str_inval, 10);
+ if (str == str_inval)
+ goto out;
+ flag_io = 1;
+ }
+ else if (!strncmp(str, "base_num=", 9) && strlen(str) > 9) {
+ *basenum = strtod(str + 9, &str_inval);
+ if (str == str_inval)
+ goto out;
+ flag_base = 1;
+ }
}
- if (!isdigit(endstrbefore[1])) {
- pp_pl_log(0, "invalid prio_args format: %s", source);
- return 0;
- }
-
- *basenum = (long long)strtol(&endstrbefore[1], &endstrafter, 10);
- if (check_args_valid(*ionum, *basenum) == 0) {
- return 0;
- }
+ if (!flag_io || !flag_base)
+ goto out;
+ if (check_args_valid(*ionum, *basenum) == 0)
+ goto out;
+ FREE(arg);
return 1;
+out:
+ FREE(arg);
+ return 0;
}
-long long calc_standard_deviation(long long *path_latency, int size,
- long long avglatency)
+double calc_standard_deviation(double *lg_path_latency, int size,
+ double lg_avglatency)
{
int index;
- long long total = 0;
+ double sum = 0;
for (index = 0; index < size; index++) {
- total +=
- (path_latency[index] - avglatency) * (path_latency[index] -
- avglatency);
+ sum += (lg_path_latency[index] - lg_avglatency) *
+ (lg_path_latency[index] - lg_avglatency);
}
- total /= (size - 1);
+ sum /= (size - 1);
- return (long long)sqrt((double)total);
+ return sqrt(sum);
}
-int calcPrio(double avglatency, double max_avglatency, double min_avglatency,
- double base_num)
+/*
+ * Do not scale the prioriy in a certain range such as [0, 1024]
+ * because scaling will eliminate the effect of base_num.
+ */
+int calcPrio(double lg_avglatency, double lg_maxavglatency,
+ double lg_minavglatency)
{
- double lavglatency = log(avglatency) / log(base_num);
- double lmax_avglatency = log(max_avglatency) / log(base_num);
- double lmin_avglatency = log(min_avglatency) / log(base_num);
-
- if (lavglatency <= lmin_avglatency)
- return (int)(lmax_avglatency + 1.);
+ if (lg_avglatency <= lg_minavglatency)
+ return lg_maxavglatency - lg_minavglatency;
- if (lavglatency > lmax_avglatency)
+ if (lg_avglatency >= lg_maxavglatency)
return 0;
- return (int)(lmax_avglatency - lavglatency + 1.);
-}
-
-/* Calc the latency interval corresponding to the average latency */
-long long calc_latency_interval(double avglatency, double max_avglatency,
- double min_avglatency, double base_num)
-{
- double lavglatency = log(avglatency) / log(base_num);
- double lmax_avglatency = log(max_avglatency) / log(base_num);
- double lmin_avglatency = log(min_avglatency) / log(base_num);
-
- if ((lavglatency <= lmin_avglatency)
- || (lavglatency > lmax_avglatency))
- return 0; /* Invalid value */
-
- if ((double)((int)lavglatency) == lavglatency)
- return (long long)(avglatency - (avglatency / base_num));
- else
- return (long long)(pow(base_num, (double)((int)lavglatency + 1))
- - pow(base_num, (double)((int)lavglatency)));
+ return lg_maxavglatency - lg_avglatency;
}
int getprio(struct path *pp, char *args, unsigned int timeout)
{
int rc, temp;
int index = 0;
- int io_num;
- int base_num;
- long long avglatency;
- long long latency_interval;
- long long standard_deviation;
- long long toldelay = 0;
+ int io_num = 0;
+ double base_num = 0;
+ double lg_avglatency, lg_maxavglatency, lg_minavglatency;
+ double standard_deviation;
+ double lg_toldelay = 0;
long long before, after;
struct timespec tv;
+ int blksize;
+ char *buf;
+ int restore_flags = 0;
+ double lg_base;
+ long long sum_latency = 0;
+ long long arith_mean_lat;
if (pp->fd < 0)
return -1;
if (get_ionum_and_basenum(args, &io_num, &base_num) == 0) {
- pp_pl_log(0, "%s: get path_latency args fail", pp->dev);
- return DEFAULT_PRIORITY;
+ io_num = DEF_IO_NUM;
+ base_num = DEF_BASE_NUM;
+		pp_pl_log(0, "%s: failed to get path_latency args, using defaults: "
+ "io_num=%d base_num=%.3lf",
+ pp->dev, io_num, base_num);
}
- memset(path_latency, 0, sizeof(path_latency));
+ memset(lg_path_latency, 0, sizeof(lg_path_latency));
+ lg_base = log(base_num);
+ lg_maxavglatency = log(MAX_AVG_LATENCY) / lg_base;
+ lg_minavglatency = log(MIN_AVG_LATENCY) / lg_base;
+
+	if (prepare_directio_read(pp->fd, &blksize, &buf, &restore_flags) < 0)
+		return -1;
temp = io_num;
while (temp-- > 0) {
(void)clock_gettime(CLOCK_MONOTONIC, &tv);
before = timeval_to_us(&tv);
- if (do_readsector0(pp->fd, timeout) == 2) {
+ if (do_directio_read(pp->fd, timeout, buf, blksize)) {
pp_pl_log(0, "%s: path down", pp->dev);
+ cleanup_directio_read(pp->fd, buf, restore_flags);
return -1;
}
(void)clock_gettime(CLOCK_MONOTONIC, &tv);
after = timeval_to_us(&tv);
-
- path_latency[index] = after - before;
- toldelay += path_latency[index++];
+ /*
+ * We assume that the latency complies with Log-normal
+ * distribution. The logarithm of latency is in normal
+ * distribution.
+ */
+ lg_path_latency[index] = log(after - before) / lg_base;
+ lg_toldelay += lg_path_latency[index++];
+ sum_latency += after - before;
}
- avglatency = toldelay / (long long)io_num;
- pp_pl_log(4, "%s: average latency is (%lld us)", pp->dev, avglatency);
+ cleanup_directio_read(pp->fd, buf, restore_flags);
+
+ lg_avglatency = lg_toldelay / (long long)io_num;
+ arith_mean_lat = sum_latency / (long long)io_num;
+ pp_pl_log(4, "%s: arithmetic mean latency is (%lld us), geometric mean latency is (%lld us)",
+ pp->dev, arith_mean_lat,
+ (long long)pow(base_num, lg_avglatency));
- if (avglatency > MAX_AVG_LATENCY) {
+ if (lg_avglatency > lg_maxavglatency) {
pp_pl_log(0,
"%s: average latency (%lld us) is outside the thresold (%lld us)",
- pp->dev, avglatency, (long long)MAX_AVG_LATENCY);
+ pp->dev, (long long)pow(base_num, lg_avglatency),
+ (long long)MAX_AVG_LATENCY);
return DEFAULT_PRIORITY;
}
+ standard_deviation = calc_standard_deviation(lg_path_latency,
+ index, lg_avglatency);
/*
- * Min average latency and max average latency are constant, the args
- * base_num set can change latency_interval value corresponding to
- * avglatency and is not constant.
- * Warn the user if latency_interval is smaller than (2 * standard_deviation),
- * or equal.
+	 * In calcPrio(), we let prio y = f(x) = log(max, base) - log(x, base).
+	 * If we want latencies that lie more than 2 standard deviations apart
+	 * to be distinguishable in priority (in other words, at most 95% are
+	 * the same and at least 5% differ, according to interval estimation
+	 * for a normal distribution), we should warn the user to set base_num
+	 * smaller whenever log(x_threshold, base) is less than 2 standard
+	 * deviations. x_threshold is derived from:
+	 * y + 1 = f(x) + 1 = f(x) + log(base, base), so x_threshold =
+	 * base_num. Note that we can only compare the logarithm of x_threshold
+	 * with the standard deviation, because the standard deviation is
+	 * derived from the logarithm of the latency.
+	 *
+	 * Therefore, we recommend base_num to meet the condition:
+	 * 1 <= 2 * standard_deviation
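+	 *
+	 * Illustrative example: with base_num = 1.5, average latencies of
+	 * 1000 us and 10000 us map to log(1000)/log(1.5) = 17.0 and
+	 * log(10000)/log(1.5) = 22.7, so the slower path gets a priority
+	 * about 5.7 lower than the faster one.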
*/
- standard_deviation =
- calc_standard_deviation(path_latency, index, avglatency);
- latency_interval =
- calc_latency_interval(avglatency, MAX_AVG_LATENCY, MIN_AVG_LATENCY,
- base_num);
- if ((latency_interval != 0)
- && (latency_interval <= (2 * standard_deviation)))
- pp_pl_log(3,
- "%s: latency interval (%lld) according to average latency (%lld us) is smaller than "
- "2 * standard deviation (%lld us), or equal, args base_num (%d) needs to be set bigger value",
- pp->dev, latency_interval, avglatency,
- standard_deviation, base_num);
-
- rc = calcPrio(avglatency, MAX_AVG_LATENCY, MIN_AVG_LATENCY, base_num);
+ pp_pl_log(5, "%s: standard deviation for logarithm of latency = %.6f",
+ pp->dev, standard_deviation);
+ if (standard_deviation <= 0.5)
+		pp_pl_log(3, "%s: base_num (%.3lf) is too big to distinguish the priority "
+			  "of two far-apart latencies; it is recommended to set it smaller",
+ pp->dev, base_num);
+	/*
+	 * If the standard deviation is too large, we should also warn the user.
+	 */
+	if (standard_deviation > 4)
+		pp_pl_log(3, "%s: base_num (%.3lf) is too small to filter out noise; "
+			  "it is recommended to set it larger",
+			  pp->dev, base_num);
+
+	rc = calcPrio(lg_avglatency, lg_maxavglatency, lg_minavglatency);
+
return rc;
}
#include <regex.h>
#include "structs_vec.h"
#include "print.h"
-
-char *get_next_string(char **temp, char *split_char)
-{
- char *token = NULL;
- token = strsep(temp, split_char);
- while (token != NULL && !strcmp(token, ""))
- token = strsep(temp, split_char);
- return token;
-}
+#include "util.h"
#define CHECK_LEN \
do { \
return 0;
}
+/*
+ * Current RDAC (NetApp E-Series) firmware relies
+ * on periodic REPORT TARGET PORT GROUPS for
+ * internal load balancing.
+ * Using the sysfs priority checker defeats this purpose.
+ *
+ * Moreover, NetApp would also prefer the RDAC checker over ALUA.
+ * (https://www.redhat.com/archives/dm-devel/2017-September/msg00326.html)
+ */
+static int
+check_rdac(struct path * pp)
+{
+ int len;
+ char buff[44];
+
+ len = get_vpd_sgio(pp->fd, 0xC9, buff, 44);
+ if (len <= 0)
+ return 0;
+ return !(memcmp(buff + 4, "vac1", 4));
+}
+
int select_checker(struct config *conf, struct path *pp)
{
char *origin, *checker_name;
struct checker * c = &pp->checker;
- if (pp->detect_checker == DETECT_CHECKER_ON && pp->tpgs > 0) {
- checker_name = TUR;
+ if (pp->detect_checker == DETECT_CHECKER_ON) {
origin = "(setting: storage device autodetected)";
- goto out;
- }
+ if (check_rdac(pp)) {
+ checker_name = RDAC;
+ goto out;
+ } else if (pp->tpgs > 0) {
+ checker_name = TUR;
+ goto out;
+ }
+ }
do_set(checker_name, conf->overrides, checker_name, "(setting: multipath.conf overrides section)");
do_set(checker_name, pp->hwe, checker_name, "(setting: storage device configuration)");
do_set(checker_name, conf, checker_name, "(setting: multipath.conf defaults/devices section)");
return 0;
}
-/*
- * Current RDAC (NetApp E-Series) firmware relies
- * on periodic REPORT TARGET PORT GROUPS for
- * internal load balancing.
- * Using the sysfs priority checker defeats this purpose.
- */
-static int
-check_rdac(struct path * pp)
-{
- int len;
- char buff[44];
-
- len = get_vpd_sgio(pp->fd, 0xC9, buff, 44);
- if (len <= 0)
- return 0;
- return !(memcmp(buff + 4, "vac1", 4));
-}
-
void
detect_prio(struct config *conf, struct path * pp)
{
return 0;
}
-int select_san_path_err_threshold(struct config *conf, struct multipath *mp)
+
+int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp)
{
char *origin, buff[12];
- mp_set_mpe(san_path_err_threshold);
- mp_set_ovr(san_path_err_threshold);
- mp_set_hwe(san_path_err_threshold);
- mp_set_conf(san_path_err_threshold);
- mp_set_default(san_path_err_threshold, DEFAULT_ERR_CHECKS);
+ mp_set_mpe(marginal_path_err_sample_time);
+ mp_set_ovr(marginal_path_err_sample_time);
+ mp_set_hwe(marginal_path_err_sample_time);
+ mp_set_conf(marginal_path_err_sample_time);
+ mp_set_default(marginal_path_err_sample_time, DEFAULT_ERR_CHECKS);
out:
- print_off_int_undef(buff, 12, &mp->san_path_err_threshold);
- condlog(3, "%s: san_path_err_threshold = %s %s", mp->alias, buff, origin);
+ print_off_int_undef(buff, 12, &mp->marginal_path_err_sample_time);
+ condlog(3, "%s: marginal_path_err_sample_time = %s %s", mp->alias, buff,
+ origin);
return 0;
}
-int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp)
+int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp)
{
char *origin, buff[12];
- mp_set_mpe(san_path_err_forget_rate);
- mp_set_ovr(san_path_err_forget_rate);
- mp_set_hwe(san_path_err_forget_rate);
- mp_set_conf(san_path_err_forget_rate);
- mp_set_default(san_path_err_forget_rate, DEFAULT_ERR_CHECKS);
+ mp_set_mpe(marginal_path_err_rate_threshold);
+ mp_set_ovr(marginal_path_err_rate_threshold);
+ mp_set_hwe(marginal_path_err_rate_threshold);
+ mp_set_conf(marginal_path_err_rate_threshold);
+ mp_set_default(marginal_path_err_rate_threshold, DEFAULT_ERR_CHECKS);
out:
- print_off_int_undef(buff, 12, &mp->san_path_err_forget_rate);
- condlog(3, "%s: san_path_err_forget_rate = %s %s", mp->alias, buff, origin);
+ print_off_int_undef(buff, 12, &mp->marginal_path_err_rate_threshold);
+ condlog(3, "%s: marginal_path_err_rate_threshold = %s %s", mp->alias, buff,
+ origin);
return 0;
-
}
-int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp)
+
+int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp)
{
char *origin, buff[12];
- mp_set_mpe(san_path_err_recovery_time);
- mp_set_ovr(san_path_err_recovery_time);
- mp_set_hwe(san_path_err_recovery_time);
- mp_set_conf(san_path_err_recovery_time);
- mp_set_default(san_path_err_recovery_time, DEFAULT_ERR_CHECKS);
+ mp_set_mpe(marginal_path_err_recheck_gap_time);
+ mp_set_ovr(marginal_path_err_recheck_gap_time);
+ mp_set_hwe(marginal_path_err_recheck_gap_time);
+ mp_set_conf(marginal_path_err_recheck_gap_time);
+ mp_set_default(marginal_path_err_recheck_gap_time, DEFAULT_ERR_CHECKS);
out:
- print_off_int_undef(buff, 12, &mp->san_path_err_recovery_time);
- condlog(3, "%s: san_path_err_recovery_time = %s %s", mp->alias, buff, origin);
+ print_off_int_undef(buff, 12, &mp->marginal_path_err_recheck_gap_time);
+ condlog(3, "%s: marginal_path_err_recheck_gap_time = %s %s", mp->alias, buff,
+ origin);
return 0;
+}
+int select_marginal_path_double_failed_time(struct config *conf, struct multipath *mp)
+{
+ char *origin, buff[12];
+
+ mp_set_mpe(marginal_path_double_failed_time);
+ mp_set_ovr(marginal_path_double_failed_time);
+ mp_set_hwe(marginal_path_double_failed_time);
+ mp_set_conf(marginal_path_double_failed_time);
+ mp_set_default(marginal_path_double_failed_time, DEFAULT_ERR_CHECKS);
+out:
+ print_off_int_undef(buff, 12, &mp->marginal_path_double_failed_time);
+ condlog(3, "%s: marginal_path_double_failed_time = %s %s", mp->alias, buff,
+ origin);
+ return 0;
}
+
int select_skip_kpartx (struct config *conf, struct multipath * mp)
{
char *origin;
int select_delay_wait_checks (struct config *conf, struct multipath * mp);
int select_skip_kpartx (struct config *conf, struct multipath * mp);
int select_max_sectors_kb (struct config *conf, struct multipath * mp);
-int select_san_path_err_forget_rate(struct config *conf, struct multipath *mp);
-int select_san_path_err_threshold(struct config *conf, struct multipath *mp);
-int select_san_path_err_recovery_time(struct config *conf, struct multipath *mp);
+int select_marginal_path_err_sample_time(struct config *conf, struct multipath *mp);
+int select_marginal_path_err_rate_threshold(struct config *conf, struct multipath *mp);
+int select_marginal_path_err_recheck_gap_time(struct config *conf, struct multipath *mp);
+int select_marginal_path_double_failed_time(struct config *conf, struct multipath *mp);
void reconcile_features_with_options(const char *id, char **features,
int* no_path_retry,
int *retain_hwhandler);
pp->fd = -1;
pp->tpgs = TPGS_UNDEF;
pp->priority = PRIO_UNDEF;
+ checker_clear(&pp->checker);
}
return pp;
}
int initialized;
int retriggers;
int wwid_changed;
- unsigned int path_failures;
- time_t dis_reinstate_time;
- int disable_reinstate;
- int san_path_err_forget_rate;
+ time_t io_err_dis_reinstate_time;
+ int io_err_disable_reinstate;
+ int io_err_pathfail_cnt;
+ int io_err_pathfail_starttime;
/* configlet pointers */
struct hwentry * hwe;
};
int deferred_remove;
int delay_watch_checks;
int delay_wait_checks;
- int san_path_err_threshold;
- int san_path_err_forget_rate;
- int san_path_err_recovery_time;
+ int marginal_path_err_sample_time;
+ int marginal_path_err_rate_threshold;
+ int marginal_path_err_recheck_gap_time;
+ int marginal_path_double_failed_time;
int skip_kpartx;
int max_sectors_kb;
int force_readonly;
}
return p;
}
+
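+/*
+ * Return a newly allocated copy of the DM_PATH value from the uevent
+ * environment, or NULL if it is absent or empty.
+ */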
+char *uevent_get_dm_path(struct uevent *uev)
+{
+ char *p = NULL;
+ int i;
+
+ for (i = 0; uev->envp[i] != NULL; i++) {
+ if (!strncmp(uev->envp[i], "DM_PATH", 7) &&
+ strlen(uev->envp[i]) > 8) {
+			p = MALLOC(strlen(uev->envp[i] + 8) + 1);
+			if (!p)
+				break;
+			strcpy(p, uev->envp[i] + 8);
+ break;
+ }
+ }
+ return p;
+}
+
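+/*
+ * Return a newly allocated copy of the DM_ACTION value from the
+ * uevent environment, or NULL if it is absent or empty.
+ */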
+char *uevent_get_dm_action(struct uevent *uev)
+{
+ char *p = NULL;
+ int i;
+
+ for (i = 0; uev->envp[i] != NULL; i++) {
+ if (!strncmp(uev->envp[i], "DM_ACTION", 9) &&
+ strlen(uev->envp[i]) > 10) {
+			p = MALLOC(strlen(uev->envp[i] + 10) + 1);
+			if (!p)
+				break;
+			strcpy(p, uev->envp[i] + 10);
+ break;
+ }
+ }
+ return p;
+}
int uevent_get_minor(struct uevent *uev);
int uevent_get_disk_ro(struct uevent *uev);
char *uevent_get_dm_name(struct uevent *uev);
+char *uevent_get_dm_path(struct uevent *uev);
+char *uevent_get_dm_action(struct uevent *uev);
#endif /* _UEVENT_H */
return 0;
}
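+
+/*
+ * Return the next non-empty token of *temp, using any character in
+ * split_char as a separator, or NULL once the string is exhausted.
+ */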
+char *get_next_string(char **temp, char *split_char)
+{
+ char *token = NULL;
+ token = strsep(temp, split_char);
+ while (token != NULL && !strcmp(token, ""))
+ token = strsep(temp, split_char);
+ return token;
+}
+
int
get_word (char * sentence, char ** word)
{
size_t strchop(char *);
int basenamecpy (const char * src, char * dst, int);
int filepresent (char * run);
+char *get_next_string(char **temp, char *split_char);
int get_word (char * sentence, char ** word);
size_t strlcpy(char *dst, const char *src, size_t size);
size_t strlcat(char *dst, const char *src, size_t size);
#ifndef _VERSION_H
#define _VERSION_H
-#define VERSION_CODE 0x000703
-#define DATE_CODE 0x090514
+#define VERSION_CODE 0x000704
+#define DATE_CODE 0x0b0f11
#define PROG "multipath-tools"
.RE
.TP 12
.I path_latency
-Needs a value of the form \fI"<io_num>|<base_num>"\fR
+Needs a value of the form "io_num=\fI<20>\fR base_num=\fI<10>\fR"
.RS
.TP 8
.I io_num
retry interval \fIno_path_retry\fR * \fIpolling_interval\fR
if a number of retries is given with \fIno_path_retry\fR and the
overall retry interval is longer than the specified \fIdev_loss_tmo\fR value.
-The Linux kernel will cap this value to \fI300\fR if \fIfast_io_fail_tmo\fR
+The Linux kernel will cap this value to \fI600\fR if \fIfast_io_fail_tmo\fR
is not set. See KNOWN ISSUES.
.RS
.TP
.
.
.TP
+.B prkeys_file
+The full pathname of the prkeys file, which is used by multipathd to keep
+track of the persistent reservation key used for a specific WWID, when
+\fIreservation_key\fR is set to \fBfile\fR.
+.RS
+.TP
+The default is \fB/etc/multipath/prkeys\fR
+.RE
+.
+.
+.TP
.B log_checker_err
If set to
.I once
list which contains an 8-byte value provided by the application client to the
device server to identify the I_T nexus.
.RS
+.PP
+Alternatively, this can be set to \fBfile\fR, which will store the RESERVATION
+KEY registered by mpathpersist in the \fIprkeys_file\fR. multipathd will then
+use this key to register additional paths as they appear. When the
+registration is removed, the RESERVATION KEY is removed from the
+\fIprkeys_file\fR.
.TP
The default is: \fB<unset>\fR
.RE
.
.
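Under this scheme, a minimal configuration enabling file-based key handling might look like the following sketch (illustrative only):

	defaults {
		prkeys_file     /etc/multipath/prkeys
		reservation_key file
	}

mpathpersist then records each registered key per WWID in the prkeys file, and multipathd reuses the stored key to register any paths for that WWID that appear later.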
.TP
-.B san_path_err_threshold
-If set to a value greater than 0, multipathd will watch paths and check how many
-times a path has been failed due to errors.If the number of failures on a particular
-path is greater then the san_path_err_threshold then the path will not reinstante
-till san_path_err_recovery_time.These path failures should occur within a
-san_path_err_forget_rate checks, if not we will consider the path is good enough
-to reinstantate.
+.B marginal_path_double_failed_time
+One of the four parameters controlling marginal path detection based on
+I/O error accounting (for example, of intermittent errors). When a path
+fails twice within \fImarginal_path_double_failed_time\fR seconds due to an
+I/O error, and the other three parameters are set, multipathd will fail the
+path and enqueue it for I/O error accounting: continuous direct-read
+asynchronous I/Os are issued to the path at a fixed sample rate of 10 Hz
+for the duration of the sampling period.
.RS
.TP
The default is: \fBno\fR
.
.
.TP
-.B san_path_err_forget_rate
-If set to a value greater than 0, multipathd will check whether the path failures
-has exceeded the san_path_err_threshold within this many checks i.e
-san_path_err_forget_rate . If so we will not reinstante the path till
-san_path_err_recovery_time.
+.B marginal_path_err_sample_time
+One of the four parameters controlling marginal path detection based on
+I/O error accounting (for example, of intermittent errors). It must be set
+to a value no less than 120. When a path fails twice within
+\fImarginal_path_double_failed_time\fR seconds due to an I/O error,
+multipathd will fail the path and enqueue it for I/O error accounting:
+continuous direct-read asynchronous I/Os are issued to the path at a fixed
+sample rate of 10 Hz, and the accounting lasts for
+\fImarginal_path_err_sample_time\fR seconds. If the I/O error rate on the
+path exceeds \fImarginal_path_err_rate_threshold\fR, the path will not be
+reinstated for \fImarginal_path_err_recheck_gap_time\fR seconds unless it
+is the only active path. After \fImarginal_path_err_recheck_gap_time\fR
+expires, the path will be requeued for rechecking; if the result is good
+enough, the path will be reinstated.
.RS
.TP
The default is: \fBno\fR
.
.
.TP
-.B san_path_err_recovery_time
-If set to a value greater than 0, multipathd will make sure that when path failures
-has exceeded the san_path_err_threshold within san_path_err_forget_rate then the path
-will be placed in failed state for san_path_err_recovery_time duration.Once san_path_err_recovery_time
-has timeout we will reinstante the failed path .
-san_path_err_recovery_time value should be in secs.
+.B marginal_path_err_rate_threshold
+The I/O error rate threshold, expressed as a permillage (errors per
+thousand I/Os). One of the four parameters controlling marginal path
+detection based on I/O error accounting. Refer to
+\fImarginal_path_err_sample_time\fR. If the I/O error rate on a particular
+path exceeds this parameter, the path will not be reinstated for
+\fImarginal_path_err_recheck_gap_time\fR seconds unless it is the only
+active path.
+.RS
+.TP
+The default is: \fBno\fR
+.RE
+.
+.
+.TP
+.B marginal_path_err_recheck_gap_time
+One of the four parameters controlling marginal path detection based on
+I/O error accounting. Refer to \fImarginal_path_err_sample_time\fR. If this
+parameter is set to a positive value, a failed path whose I/O error rate
+exceeded \fImarginal_path_err_rate_threshold\fR will be kept in the failed
+state for \fImarginal_path_err_recheck_gap_time\fR seconds. When that
+interval expires, the path will be requeued for checking; if the result is
+good enough, the path will be reinstated, otherwise it will remain failed.
.RS
.TP
The default is: \fBno\fR
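Taken together, the four marginal_path_* options might be combined as in this illustrative sketch (the values are examples, not recommendations):

	defaults {
		marginal_path_double_failed_time        5
		marginal_path_err_sample_time           120
		marginal_path_err_rate_threshold        10
		marginal_path_err_recheck_gap_time      30
	}

With these settings, two I/O-error path failures within 5 seconds start a 120-second sampling run; a path whose error rate exceeds 10 per thousand is then held down for 30 seconds before being rechecked.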
.TP
.B deferred_remove
.TP
-.B san_path_err_threshold
+.B marginal_path_err_sample_time
+.TP
+.B marginal_path_err_rate_threshold
.TP
-.B san_path_err_forget_rate
+.B marginal_path_err_recheck_gap_time
.TP
-.B san_path_err_recovery_time
+.B marginal_path_double_failed_time
.TP
.B delay_watch_checks
.TP
.TP
.B deferred_remove
.TP
-.B san_path_err_threshold
+.B marginal_path_err_sample_time
.TP
-.B san_path_err_forget_rate
+.B marginal_path_err_rate_threshold
.TP
-.B san_path_err_recovery_time
+.B marginal_path_err_recheck_gap_time
+.TP
+.B marginal_path_double_failed_time
.TP
.B delay_watch_checks
.TP
.TP
.B deferred_remove
.TP
-.B san_path_err_threshold
+.B marginal_path_err_sample_time
+.TP
+.B marginal_path_err_rate_threshold
.TP
-.B san_path_err_forget_rate
+.B marginal_path_err_recheck_gap_time
.TP
-.B san_path_err_recovery_time
+.B marginal_path_double_failed_time
.TP
.B delay_watch_checks
.TP
#include "cli_handlers.h"
#include "lock.h"
#include "waiter.h"
+#include "io_err_stat.h"
#include "wwids.h"
#include "../third-party/valgrind/drd.h"
return NULL;
}
+#ifdef USE_SYSTEMD
static void do_sd_notify(enum daemon_status old_state)
{
/*
return;
sd_notify(0, sd_notify_status());
}
+#endif
static void config_cleanup(void *arg)
{
return retval;
}
+static int
+uev_pathfail_check(struct uevent *uev, struct vectors *vecs)
+{
+ char *action = NULL, *devt = NULL;
+ struct path *pp;
+ int r;
+
+ action = uevent_get_dm_action(uev);
+ if (!action)
+ return 1;
+ if (strncmp(action, "PATH_FAILED", 11))
+ goto out;
+ devt = uevent_get_dm_path(uev);
+ if (!devt) {
+ condlog(3, "%s: No DM_PATH in uevent", uev->kernel);
+ goto out;
+ }
+
+ pthread_cleanup_push(cleanup_lock, &vecs->lock);
+ lock(&vecs->lock);
+ pthread_testcancel();
+ pp = find_path_by_devt(vecs->pathvec, devt);
+ r = io_err_stat_handle_pathfail(pp);
+ lock_cleanup_pop(vecs->lock);
+
+ if (r)
+ /* pp may be NULL here, so log the devt from the uevent */
+ condlog(3, "io_err_stat: %s: cannot handle pathfail uevent",
+ devt);
+ FREE(devt);
+ FREE(action);
+ return 0;
+out:
+ FREE(action);
+ return 1;
+}
+
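uev_pathfail_check() above takes the vectors lock under pthread_cleanup_push(), so the lock is released even if the uevent thread is cancelled while holding it. A self-contained sketch of that cancellation-safe locking pattern (all names below are illustrative, not multipathd's own):

	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

	static void cleanup_unlock(void *arg)
	{
		pthread_mutex_unlock((pthread_mutex_t *)arg);
	}

	static void *worker(void *arg)
	{
		pthread_cleanup_push(cleanup_unlock, &lock);
		pthread_mutex_lock(&lock);	/* not a cancellation point */
		pthread_testcancel();		/* safe: handler unlocks on cancel */
		/* ... look up and update shared state here ... */
		pthread_cleanup_pop(1);		/* pop and run handler: unlocks */
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, worker, NULL);
		pthread_join(t, NULL);
		return 0;
	}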
static int
map_discovery (struct vectors * vecs)
{
if (!strncmp(uev->kernel, "dm-", 3)) {
if (!strncmp(uev->action, "change", 6)) {
r = uev_add_map(uev, vecs);
+
+ /*
+ * The kernel-side dm-mpath target issues a PATH_FAILED event
+ * when it encounters a path I/O error, which makes this a
+ * natural entry point for the path I/O error accounting
+ * process.
+ */
+ uev_pathfail_check(uev, vecs);
goto out;
}
if (!strncmp(uev->action, "remove", 6)) {
LOG_MSG(1, checker_message(&pp->checker));
}
-static int check_path_reinstate_state(struct path * pp) {
- struct timespec curr_time;
- if (!((pp->mpp->san_path_err_threshold > 0) &&
- (pp->mpp->san_path_err_forget_rate > 0) &&
- (pp->mpp->san_path_err_recovery_time >0))) {
- return 0;
- }
-
- if (pp->disable_reinstate) {
- /* If we don't know how much time has passed, automatically
- * reinstate the path, just to be safe. Also, if there are
- * no other usable paths, reinstate the path
- */
- if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0 ||
- pp->mpp->nr_active == 0) {
- condlog(2, "%s : reinstating path early", pp->dev);
- goto reinstate_path;
- }
- if ((curr_time.tv_sec - pp->dis_reinstate_time ) > pp->mpp->san_path_err_recovery_time) {
- condlog(2,"%s : reinstate the path after err recovery time", pp->dev);
- goto reinstate_path;
- }
- return 1;
- }
- /* forget errors on a working path */
- if ((pp->state == PATH_UP || pp->state == PATH_GHOST) &&
- pp->path_failures > 0) {
- if (pp->san_path_err_forget_rate > 0){
- pp->san_path_err_forget_rate--;
- } else {
- /* for every san_path_err_forget_rate number of
- * successful path checks decrement path_failures by 1
- */
- pp->path_failures--;
- pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate;
- }
- return 0;
- }
-
- /* If the path isn't recovering from a failed state, do nothing */
- if (pp->state != PATH_DOWN && pp->state != PATH_SHAKY &&
- pp->state != PATH_TIMEOUT)
- return 0;
-
- if (pp->path_failures == 0)
- pp->san_path_err_forget_rate = pp->mpp->san_path_err_forget_rate;
-
- pp->path_failures++;
-
- /* if we don't know the currently time, we don't know how long to
- * delay the path, so there's no point in checking if we should
- */
-
- if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0)
- return 0;
- /* when path failures has exceeded the san_path_err_threshold
- * place the path in delayed state till san_path_err_recovery_time
- * so that the cutomer can rectify the issue within this time. After
- * the completion of san_path_err_recovery_time it should
- * automatically reinstate the path
- */
- if (pp->path_failures > pp->mpp->san_path_err_threshold) {
- condlog(2, "%s : hit error threshold. Delaying path reinstatement", pp->dev);
- pp->dis_reinstate_time = curr_time.tv_sec;
- pp->disable_reinstate = 1;
- return 1;
- } else {
- return 0;
- }
-
-reinstate_path:
- pp->path_failures = 0;
- pp->disable_reinstate = 0;
- pp->san_path_err_forget_rate = 0;
- return 0;
-}
-
/*
* Returns '1' if the path has been checked, '-1' if it was blacklisted
* and '0' otherwise
if (!pp->mpp)
return 0;
- if ((newstate == PATH_UP || newstate == PATH_GHOST) &&
- check_path_reinstate_state(pp)) {
- pp->state = PATH_DELAYED;
+ if (pp->io_err_disable_reinstate && hit_io_err_recheck_time(pp)) {
+ pp->state = PATH_SHAKY;
+ /*
+ * Reschedule as soon as possible so that this path can
+ * be recovered in time.
+ */
+ pp->tick = 1;
return 1;
}
setup_thread_attr(&misc_attr, 64 * 1024, 0);
setup_thread_attr(&uevent_attr, DEFAULT_UEVENT_STACKSIZE * 1024, 0);
setup_thread_attr(&waiter_attr, 32 * 1024, 1);
+ setup_thread_attr(&io_err_stat_attr, 32 * 1024, 1);
if (logsink == 1) {
setup_thread_attr(&log_attr, 64 * 1024, 0);
/*
* start threads
*/
+ rc = start_io_err_stat_thread(vecs);
+ if (rc)
+ goto failed;
+
if ((rc = pthread_create(&check_thr, &misc_attr, checkerloop, vecs))) {
condlog(0,"failed to create checker loop thread: %d", rc);
goto failed;
remove_maps_and_stop_waiters(vecs);
unlock(&vecs->lock);
+ stop_io_err_stat_thread();
+
pthread_cancel(check_thr);
pthread_cancel(uevent_thr);
pthread_cancel(uxlsnr_thr);
udev_unref(udev);
udev = NULL;
pthread_attr_destroy(&waiter_attr);
+ pthread_attr_destroy(&io_err_stat_attr);
#ifdef _DEBUG_
dbg_free_final(NULL);
#endif
Get the current persistent reservation management status of $map.
.
.TP
+.B map|multipath $map getprkey
+Get the current persistent reservation key associated with $map.
+.
+.TP
+.B map|multipath $map setprkey key $key
+Set the persistent reservation key associated with $map to $key in the
+\fIprkeys_file\fR. This key will only be used by multipathd if
+\fIreservation_key\fR is set to \fBfile\fR in \fI/etc/multipath.conf\fR.
+.
+.TP
+.B map|multipath $map unsetprkey
+Remove the persistent reservation key associated with $map from the
+\fIprkeys_file\fR. This will only unset the key used by multipathd if
+\fIreservation_key\fR is set to \fBfile\fR in \fI/etc/multipath.conf\fR.
+.
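For illustration, a possible interactive session exercising these commands (the map name and key value are hypothetical):

	multipathd> map mpatha setprkey key 0x123abc
	ok
	multipathd> map mpatha getprkey
	0x123abc
	multipathd> map mpatha unsetprkey
	ok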
+.TP
.B quit|exit
End interactive session.
.