5 #include <linux/unistd.h>
10 #include <libdevmapper.h>
15 #include <sys/mount.h>
21 #include <sysfs/libsysfs.h>
22 #include <sysfs/dlist.h>
28 #include <path_state.h>
39 #include <blacklist.h>
44 #include <devmapper.h>
46 #include <discovery.h>
50 #include <switchgroup.h>
54 #include "clone_platform.h"
57 #define FILE_NAME_SIZE 256
60 #define CALLOUT_DIR "/var/cache/multipathd"
62 #define LOG_MSG(a,b) \
64 log_safe(LOG_WARNING, "%s: %s", b, a); \
65 memset(a, 0, MAX_CHECKER_MSG_SIZE); \
70 fprintf(stderr, "%s:%s(%i) lock %p\n", __FILE__, __FUNCTION__, __LINE__, a); \
73 fprintf(stderr, "%s:%s(%i) unlock %p\n", __FILE__, __FUNCTION__, __LINE__, a); \
74 pthread_mutex_unlock(a)
76 #define lock(a) pthread_mutex_lock(a)
77 #define unlock(a) pthread_mutex_unlock(a)
84 pthread_mutex_t *lock;
92 char mapname[WWID_SIZE];
93 struct paths *allpaths;
100 struct event_thread * wp;
102 wp = MALLOC(sizeof(struct event_thread));
107 wp->thread = MALLOC(sizeof(pthread_t));
116 log_safe(LOG_ERR, "failed to alloc waiter");
121 set_paths_owner (struct paths * allpaths, struct multipath * mpp)
126 vector_foreach_slot (allpaths->pathvec, pp, i) {
127 if (!strncmp(mpp->wwid, pp->wwid, WWID_SIZE)) {
128 log_safe(LOG_DEBUG, "%s ownership set",
136 update_multipath_table (struct multipath *mpp, vector pathvec)
138 if (dm_get_map(mpp->alias, &mpp->size, mpp->params))
141 if(disassemble_map(pathvec, mpp->params, mpp))
148 update_multipath_status (struct multipath *mpp)
150 if(dm_get_status(mpp->alias, mpp->status))
153 if (disassemble_status(mpp->status, mpp))
160 update_multipath_strings (struct multipath *mpp, vector pathvec)
162 if (update_multipath_table(mpp, pathvec))
165 if (update_multipath_status(mpp))
172 setup_multipath (struct paths * allpaths, struct multipath * mpp)
176 wwid = get_mpe_wwid(mpp->alias);
179 strncpy(mpp->wwid, wwid, WWID_SIZE);
182 strncpy(mpp->wwid, mpp->alias, WWID_SIZE);
184 log_safe(LOG_DEBUG, "discovered map %s", mpp->alias);
186 if (update_multipath_strings(mpp, allpaths->pathvec))
189 set_paths_owner(allpaths, mpp);
190 mpp->mpe = find_mpe(mpp->wwid);
191 select_pgfailback(mpp);
195 free_multipath(mpp, KEEP_PATHS);
196 log_safe(LOG_ERR, "failed to setup multipath");
201 switch_pathgroup (struct multipath * mpp)
203 struct pathgroup * pgp;
207 if (!mpp || mpp->pgfailback == FAILBACK_MANUAL)
210 * Refresh path priority values
212 vector_foreach_slot (mpp->pg, pgp, i)
213 vector_foreach_slot (pgp->paths, pp, j)
214 pathinfo(pp, conf->hwtable, DI_PRIO);
216 select_path_group(mpp); /* sets mpp->nextpg */
217 pgp = VECTOR_SLOT(mpp->pg, mpp->nextpg - 1);
219 if (pgp && pgp->status != PGSTATE_ACTIVE) {
220 dm_switchgroup(mpp->alias, mpp->nextpg);
221 log_safe(LOG_NOTICE, "%s: switch to path group #%i",
222 mpp->alias, mpp->nextpg);
227 update_multipath (struct paths *allpaths, char *mapname)
229 struct multipath *mpp;
230 struct pathgroup *pgp;
235 lock(allpaths->lock);
236 mpp = find_mp(allpaths->mpvec, mapname);
241 free_pgvec(mpp->pg, KEEP_PATHS);
244 setup_multipath(allpaths, mpp);
247 * compare checkers states with DM states
249 vector_foreach_slot (mpp->pg, pgp, i) {
250 vector_foreach_slot (pgp->paths, pp, j) {
251 if (pp->dmstate != PSTATE_FAILED)
254 if (pp->state != PATH_DOWN) {
255 log_safe(LOG_NOTICE, "%s: mark as failed",
257 pp->state = PATH_DOWN;
261 * schedule the next check earlier
263 if (pp->tick > conf->checkint)
264 pp->tick = conf->checkint;
270 unlock(allpaths->lock);
273 log_safe(LOG_ERR, "failed to update multipath");
279 * returns the reschedule delay
280 * negative means *stop*
283 waiteventloop (struct event_thread * waiter)
287 int r = 1; /* upon problem reschedule 1s later */
289 if (!waiter->event_nr)
290 waiter->event_nr = dm_geteventnr(waiter->mapname);
292 if (!(dmt = dm_task_create(DM_DEVICE_WAITEVENT)))
295 if (!dm_task_set_name(dmt, waiter->mapname))
298 if (waiter->event_nr && !dm_task_set_event_nr(dmt, waiter->event_nr))
301 dm_task_no_open_count(dmt);
311 log_safe(LOG_NOTICE, "devmap event (%i) on %s",
312 waiter->event_nr, waiter->mapname);
317 * 1) a table reload, which means our mpp structure is
318 * obsolete : refresh it through update_multipath()
319 * 2) a path failed by DM : mark as such through
321 * 3) map has gone away : stop the thread.
322 * 4) a path reinstate : nothing to do
323 * 5) a switch group : nothing to do
325 if (update_multipath(waiter->allpaths, waiter->mapname)) {
326 r = -1; /* stop the thread */
329 event_nr = dm_geteventnr(waiter->mapname);
331 if (waiter->event_nr == event_nr)
334 waiter->event_nr = event_nr;
338 dm_task_destroy(dmt);
343 waitevent (void * et)
346 struct event_thread *waiter;
348 mlockall(MCL_CURRENT | MCL_FUTURE);
350 waiter = (struct event_thread *)et;
351 pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
354 r = waiteventloop(waiter);
362 pthread_exit(waiter->thread);
368 free_waiter (struct event_thread * wp)
375 stop_waiter_thread (struct multipath * mpp, struct paths * allpaths)
377 struct event_thread * wp;
382 wp = (struct event_thread *)mpp->waiter;
387 log_safe(LOG_NOTICE, "reap event checker : %s",
390 pthread_cancel(*wp->thread);
397 start_waiter_thread (struct multipath * mpp, struct paths * allpaths)
400 struct event_thread * wp;
405 if (pthread_attr_init(&attr))
408 pthread_attr_setstacksize(&attr, 32 * 1024);
414 mpp->waiter = (void *)wp;
415 strncpy(wp->mapname, mpp->alias, WWID_SIZE);
416 wp->allpaths = allpaths;
418 if (pthread_create(wp->thread, &attr, waitevent, wp)) {
419 log_safe(LOG_ERR, "%s: cannot create event checker",
423 log_safe(LOG_NOTICE, "%s: event checker started", wp->mapname);
429 log_safe(LOG_ERR, "failed to start waiter thread");
434 remove_map (struct multipath * mpp, struct paths * allpaths)
438 stop_waiter_thread(mpp, allpaths);
439 i = find_slot(allpaths->mpvec, (void *)mpp);
440 vector_del_slot(allpaths->mpvec, i);
441 free_multipath(mpp, KEEP_PATHS);
445 uev_add_map (char * devname, struct paths * allpaths)
448 char dev_t[BLK_DEV_SIZE];
450 struct multipath * mpp;
452 if (sysfs_get_dev(sysfs_path, devname, dev_t, BLK_DEV_SIZE))
455 if (sscanf(dev_t, "%d:%d", &major, &minor) != 2)
458 buff = dm_mapname(major, minor, "multipath");
463 mpp = find_mp(allpaths->mpvec, buff);
467 * devmap already in mpvec
468 * but DM remove uevents are somewhat unreliable
469 * so for now consider safer to remove and re-add the map
471 log_safe(LOG_NOTICE, "%s: remove dead config", mpp->alias);
472 remove_map(mpp, allpaths);
476 mpp = alloc_multipath();
482 mpp->alias = MALLOC(strlen(buff) + 1);
487 strncat(mpp->alias, buff, strlen(buff));
489 dm_get_map(mpp->alias, &mpp->size, mpp->params);
490 dm_get_status(mpp->alias, mpp->status);
492 if (setup_multipath(allpaths, mpp))
493 return 1; /* mpp freed in setup_multipath */
495 if (!vector_alloc_slot(allpaths->mpvec))
498 vector_set_slot(allpaths->mpvec, mpp);
499 set_paths_owner(allpaths, mpp);
501 if (start_waiter_thread(mpp, allpaths))
506 free_multipath(mpp, KEEP_PATHS);
511 uev_remove_map (char * devname, struct paths * allpaths)
514 struct multipath * mpp;
516 minor = atoi(devname + 3); /* devname is "dm-<minor>": skip the 3-char "dm-" prefix; mpp is not valid until the lookup below */
517 mpp = find_mp_by_minor(allpaths->mpvec, minor);
520 remove_map(mpp, allpaths);
526 uev_add_path (char * devname, struct paths * allpaths)
530 pp = find_path_by_dev(allpaths->pathvec, devname);
533 log_safe(LOG_INFO, "%s: already in pathvec", devname); /* %s needs its argument: the device being added */
536 log_safe(LOG_NOTICE, "add %s path checker", devname);
537 pp = store_pathinfo(allpaths->pathvec, conf->hwtable,
538 devname, DI_SYSFS | DI_WWID);
543 pp->mpp = find_mp_by_wwid(allpaths->mpvec, pp->wwid); /* NOTE(review): presumably NULL when no map owns this wwid yet -- confirm */
544 log_safe(LOG_DEBUG, "%s: ownership set to %s",
545 pp->dev_t, pp->mpp ? pp->mpp->alias : "(none)");
551 uev_remove_path (char * devname, struct paths * allpaths)
556 pp = find_path_by_dev(allpaths->pathvec, devname);
559 log_safe(LOG_INFO, "%s: not in pathvec", devname); /* %s needs its argument: the device being removed */
562 log_safe(LOG_NOTICE, "remove %s path checker", devname);
563 i = find_slot(allpaths->pathvec, (void *)pp);
564 vector_del_slot(allpaths->pathvec, i);
571 uev_trigger (struct uevent * uev, void * trigger_data)
575 struct paths * allpaths;
577 allpaths = (struct paths *)trigger_data;
578 lock(allpaths->lock);
580 if (strncmp(uev->devpath, "/block", 6))
583 basename(uev->devpath, devname);
586 * device map add/remove event
588 if (!strncmp(devname, "dm-", 3)) {
589 condlog(2, "%s %s devmap", uev->action, devname);
591 if (!strncmp(uev->action, "add", 3)) {
592 r = uev_add_map(devname, allpaths);
595 if (!strncmp(uev->action, "remove", 6)) {
596 r = uev_remove_map(devname, allpaths);
603 * path add/remove event
605 if (blacklist(conf->blist, devname))
608 if (!strncmp(uev->action, "add", 3)) {
609 r = uev_add_path(devname, allpaths);
612 if (!strncmp(uev->action, "remove", 6)) {
613 r = uev_remove_path(devname, allpaths);
619 unlock(allpaths->lock);
624 ueventloop (void * ap)
626 uevent_listen(&uev_trigger, ap);
632 strvec_free (vector vec)
637 vector_foreach_slot (vec, str, i)
645 exit_daemon (int status)
648 fprintf(stderr, "bad exit status. see daemon.log\n");
650 log_safe(LOG_INFO, "umount ramfs");
653 log_safe(LOG_INFO, "unlink pidfile");
654 unlink(DEFAULT_PIDFILE);
656 log_safe(LOG_NOTICE, "--------shut down-------");
662 * caller must have locked the path list before calling that function
665 get_dm_mpvec (struct paths * allpaths)
668 struct multipath * mpp;
670 if (dm_get_maps(allpaths->mpvec, "multipath"))
673 vector_foreach_slot (allpaths->mpvec, mpp, i) {
674 setup_multipath(allpaths, mpp);
675 mpp->minor = dm_get_minor(mpp->alias);
676 start_waiter_thread(mpp, allpaths);
683 fail_path (struct path * pp)
688 log_safe(LOG_NOTICE, "checker failed path %s in map %s",
689 pp->dev_t, pp->mpp->alias);
691 dm_fail_path(pp->mpp->alias, pp->dev_t);
695 * caller must have locked the path list before calling that function
698 reinstate_path (struct path * pp)
701 if (dm_reinstate(pp->mpp->alias, pp->dev_t))
702 log_safe(LOG_ERR, "%s: reinstate failed", pp->dev_t);
704 log_safe(LOG_NOTICE, "%s: reinstated", pp->dev_t);
709 checkerloop (void *ap)
711 struct paths *allpaths;
715 char checker_msg[MAX_CHECKER_MSG_SIZE];
717 mlockall(MCL_CURRENT | MCL_FUTURE);
719 memset(checker_msg, 0, MAX_CHECKER_MSG_SIZE);
720 allpaths = (struct paths *)ap;
722 log_safe(LOG_NOTICE, "path checkers start up");
725 lock(allpaths->lock);
726 log_safe(LOG_DEBUG, "tick");
728 vector_foreach_slot (allpaths->pathvec, pp, i) {
731 * don't check this path yet
738 * provision a next check soonest,
739 * in case we exit abnormally from here
741 pp->tick = conf->checkint;
744 pathinfo(pp, conf->hwtable, DI_SYSFS);
749 log_safe(LOG_ERR, "%s: checkfn is void",
753 newstate = pp->checkfn(pp->fd, checker_msg,
754 &pp->checker_context);
756 if (newstate != pp->state) {
757 pp->state = newstate;
758 LOG_MSG(checker_msg, pp->dev_t);
761 * upon state change, reset the checkint
762 * to the shortest delay
764 pp->checkint = conf->checkint;
766 if (newstate == PATH_DOWN ||
767 newstate == PATH_SHAKY) {
769 * proactively fail path in the DM
774 * cancel scheduled failback
776 pp->mpp->failback_tick = 0;
781 * reinstate this path
786 * need to switch group ?
788 update_multipath_strings(pp->mpp,
792 * schedule deferred failback
794 if (pp->mpp->pgfailback > 0)
795 pp->mpp->failback_tick =
798 if (pp->mpp->pgfailback == FAILBACK_IMMEDIATE)
799 switch_pathgroup(pp->mpp);
801 else if (newstate == PATH_UP) {
803 * PATH_UP for last two checks
804 * deferred failback getting sooner
806 if (pp->mpp->pgfailback > 0) {
807 if (pp->mpp->failback_tick > 0) {
808 pp->mpp->failback_tick--;
810 if (!pp->mpp->failback_tick)
811 switch_pathgroup(pp->mpp);
816 * and double the next check delay.
817 * max at conf->max_checkint
819 if (pp->checkint < (conf->max_checkint / 2))
820 pp->checkint = 2 * pp->checkint;
822 pp->checkint = conf->max_checkint;
824 pp->tick = pp->checkint;
825 log_safe(LOG_DEBUG, "%s: delay next check %is",
826 pp->dev_t, pp->tick);
828 pp->state = newstate;
830 unlock(allpaths->lock);
836 static struct paths *
839 struct paths *allpaths;
841 allpaths = MALLOC(sizeof(struct paths));
847 (pthread_mutex_t *)MALLOC(sizeof(pthread_mutex_t));
852 allpaths->pathvec = vector_alloc();
854 if (!allpaths->pathvec)
857 allpaths->mpvec = vector_alloc();
859 if (!allpaths->mpvec)
862 pthread_mutex_init(allpaths->lock, NULL);
867 vector_free(allpaths->pathvec);
869 FREE(allpaths->lock);
872 log_safe(LOG_ERR, "failed to init paths");
877 * this logic is all about keeping callouts working in case of
878 * system disk outage (think system over SAN)
879 * this needs the clone syscall, so don't bother if not present
884 prepare_namespace(void)
886 mode_t mode = S_IRWXU;
895 buf = MALLOC(sizeof(struct stat));
898 * create a temp mount point for ramfs
900 if (stat(CALLOUT_DIR, buf) < 0) {
901 if (mkdir(CALLOUT_DIR, mode) < 0) {
902 log_safe(LOG_ERR, "cannot create " CALLOUT_DIR);
905 log_safe(LOG_DEBUG, "created " CALLOUT_DIR);
909 * compute the optimal ramdisk size
911 vector_foreach_slot (conf->binvec, bin,i) {
912 if ((fd = open(bin, O_RDONLY)) < 0) {
913 log_safe(LOG_ERR, "cannot open %s", bin);
916 if (fstat(fd, &statbuf) < 0) {
917 log_safe(LOG_ERR, "cannot stat %s", bin);
920 size += statbuf.st_size;
923 log_safe(LOG_INFO, "ramfs maxsize is %u", (unsigned int) size);
928 if (safe_sprintf(ramfs_args, "maxsize=%u", (unsigned int) size)) {
929 fprintf(stderr, "ramfs_args too small\n");
932 if (mount(NULL, CALLOUT_DIR, "ramfs", MS_SYNCHRONOUS, ramfs_args) < 0) {
933 log_safe(LOG_ERR, "cannot mount ramfs on " CALLOUT_DIR);
936 log_safe(LOG_DEBUG, "mount ramfs on " CALLOUT_DIR);
939 * populate the ramfs with callout binaries
941 vector_foreach_slot (conf->binvec, bin,i) {
942 if (copytodir(bin, CALLOUT_DIR) < 0) {
943 log_safe(LOG_ERR, "cannot copy %s in ramfs", bin);
946 log_safe(LOG_DEBUG, "cp %s in ramfs", bin);
948 strvec_free(conf->binvec);
951 * bind the ramfs to :
952 * /sbin : default home of multipath ...
953 * /bin : default home of scsi_id ...
954 * /tmp : home of scsi_id temp files
956 if (mount(CALLOUT_DIR, "/sbin", NULL, MS_BIND, NULL) < 0) {
957 log_safe(LOG_ERR, "cannot bind ramfs on /sbin");
960 log_safe(LOG_DEBUG, "bind ramfs on /sbin");
961 if (mount(CALLOUT_DIR, "/bin", NULL, MS_BIND, NULL) < 0) {
962 log_safe(LOG_ERR, "cannot bind ramfs on /bin");
965 log_safe(LOG_DEBUG, "bind ramfs on /bin");
966 if (mount(CALLOUT_DIR, "/tmp", NULL, MS_BIND, NULL) < 0) {
967 log_safe(LOG_ERR, "cannot bind ramfs on /tmp");
970 log_safe(LOG_DEBUG, "bind ramfs on /tmp");
977 signal_set(int signo, void (*func) (int))
980 struct sigaction sig;
981 struct sigaction osig;
983 sig.sa_handler = func;
984 sigemptyset(&sig.sa_mask);
987 r = sigaction(signo, &sig, &osig);
992 return (osig.sa_handler);
998 log_safe(LOG_NOTICE, "SIGHUP received");
1001 dbg_free_final(NULL);
1014 signal_set(SIGHUP, sighup);
1015 signal_set(SIGINT, sigend);
1016 signal_set(SIGTERM, sigend); /* SIGKILL is deliberately not registered: it cannot be caught, so sigaction() would only fail */
1024 static struct sched_param sched_param = {
1028 res = sched_setscheduler (0, SCHED_RR, &sched_param);
1031 log_safe(LOG_WARNING, "Could not set SCHED_RR at priority 99");
1036 set_oom_adj (int val)
1040 fp = fopen("/proc/self/oom_adj", "w");
1045 fprintf(fp, "%i", val);
1050 child (void * param)
1052 pthread_t check_thr, uevent_thr;
1053 pthread_attr_t attr;
1054 struct paths * allpaths;
1056 mlockall(MCL_CURRENT | MCL_FUTURE);
1059 log_safe(LOG_NOTICE, "--------start up--------");
1060 log_safe(LOG_NOTICE, "read " DEFAULT_CONFIGFILE);
1062 if (load_config(DEFAULT_CONFIGFILE))
1065 setlogmask(LOG_UPTO(conf->verbosity + 3));
1068 * fill the voids left in the config file
1070 if (!conf->binvec) {
1071 conf->binvec = vector_alloc();
1072 push_callout("/sbin/scsi_id");
1074 if (!conf->multipath) {
1075 conf->multipath = MULTIPATH;
1076 push_callout(conf->multipath);
1078 if (!conf->checkint) {
1079 conf->checkint = CHECKINT;
1080 conf->max_checkint = MAX_CHECKINT;
1083 if (pidfile_create(DEFAULT_PIDFILE, getpid())) {
1090 allpaths = init_paths();
1095 if (sysfs_get_mnt_path(sysfs_path, FILE_NAME_SIZE)) {
1096 log_safe(LOG_ERR, "can not find sysfs mount point");
1101 if (prepare_namespace() < 0) {
1102 log_safe(LOG_ERR, "cannot prepare namespace");
1108 * fetch paths and multipaths lists
1109 * no paths and/or no multipaths are valid scenarios
1110 * vectors maintenance will be driven by events
1112 path_discovery(allpaths->pathvec, conf, DI_SYSFS | DI_WWID);
1113 get_dm_mpvec(allpaths);
1118 pthread_attr_init(&attr);
1119 pthread_attr_setstacksize(&attr, 64 * 1024);
1121 pthread_create(&check_thr, &attr, checkerloop, allpaths);
1122 pthread_create(&uevent_thr, &attr, ueventloop, allpaths);
1123 pthread_join(check_thr, NULL);
1124 pthread_join(uevent_thr, NULL);
1130 main (int argc, char *argv[])
1132 extern char *optarg;
1138 if (getuid() != 0) {
1139 fprintf(stderr, "need to be root\n");
1143 /* make sure we don't lock any path */
1145 umask(umask(077) | 022);
1147 child_stack = (void *)malloc(CHILD_STACK_SIZE);
1152 conf = alloc_config();
1157 while ((arg = getopt(argc, argv, ":v:")) != EOF ) {
1160 if (sizeof(optarg) > sizeof(char *) ||
1161 !isdigit(optarg[0]))
1164 conf->verbosity = atoi(optarg);
1171 #ifdef CLONE_NEWNS /* recent systems have clone() */
1173 # if defined(__hppa__) || defined(__powerpc64__)
1174 err = clone(child, child_stack, CLONE_NEWNS, NULL);
1175 # elif defined(__ia64__)
1176 err = clone2(child, child_stack,
1177 CHILD_STACK_SIZE, CLONE_NEWNS, NULL,
1180 err = clone(child, child_stack + CHILD_STACK_SIZE, CLONE_NEWNS, NULL);
1186 #else /* older system fallback to fork() */
1192 return (child(child_stack));