2 * (C) Copyright HUAWEI Technology Corp. 2017, All Rights Reserved.
6 * Prioritizer for device mapper multipath, where the corresponding priority
7 * values of specific paths are provided by a latency algorithm. And the
8 * latency algorithm depends on the arguments "io_num" and "base_num".
10 * The principle of the algorithm is as follows:
11 * 1. By sending a certain number "io_num" of read IOs to the current path
12 * continuously, the IOs' average latency can be calculated.
13 * 2. Max value and min value of average latency are constant. According to
14 * the average latency of each path and the "base_num" of logarithmic
15 * scale, the priority "rc" of each path can be provided.
17 * Author(s): Yang Feng <philip.yang@huawei.com>
18 * Revised: Guan Junxiong <guanjunxiong@huawei.com>
20 * This file is released under the GPL version 2, or any later version.
29 #include <sys/ioctl.h>
37 #include "time-util.h"
39 #define pp_pl_log(prio, fmt, args...) condlog(prio, "path_latency prio: " fmt, ##args)
41 #define MAX_IO_NUM 200
43 #define DEF_IO_NUM 100
45 #define MAX_BASE_NUM 10
46 #define MIN_BASE_NUM 1.1
47 // This is 10**(1/4). 4 prio steps correspond to a factor of 10.
48 #define DEF_BASE_NUM 1.77827941004
50 #define MAX_AVG_LATENCY 100000000. /* Unit: us */
51 #define MIN_AVG_LATENCY 1. /* Unit: us */
53 #define DEFAULT_PRIORITY 0
55 #define USEC_PER_SEC 1000000LL
56 #define NSEC_PER_USEC 1000LL
58 #define DEF_BLK_SIZE 4096
60 static int prepare_directio_read(int fd, int *blksz, char **pbuf,
63 unsigned long pgsize = getpagesize();
66 if (ioctl(fd, BLKBSZGET, blksz) < 0) {
67 pp_pl_log(3,"catnnot get blocksize, set default");
68 *blksz = DEF_BLK_SIZE;
70 if (posix_memalign((void **)pbuf, pgsize, *blksz))
73 flags = fcntl(fd, F_GETFL);
76 if (!(flags & O_DIRECT)) {
78 if (fcntl(fd, F_SETFL, flags) < 0)
91 static void cleanup_directio_read(int fd, char *buf, int restore_flags)
99 if ((flags = fcntl(fd, F_GETFL)) >= 0) {
100 int ret __attribute__ ((unused));
102 /* No point in checking for errors */
103 ret = fcntl(fd, F_SETFL, flags);
107 static int do_directio_read(int fd, unsigned int timeout, char *buf, int sz)
110 struct timeval tm = { .tv_sec = timeout };
114 if (lseek(fd, 0, SEEK_SET) == -1)
117 FD_SET(fd, &read_fds);
118 ret = select(fd+1, &read_fds, NULL, NULL, &tm);
121 num_read = read(fd, buf, sz);
128 int check_args_valid(int io_num, double base_num)
130 if ((io_num < MIN_IO_NUM) || (io_num > MAX_IO_NUM)) {
131 pp_pl_log(0, "args io_num is outside the valid range");
135 if ((base_num < MIN_BASE_NUM) || (base_num > MAX_BASE_NUM)) {
136 pp_pl_log(0, "args base_num is outside the valid range");
144 * In multipath.conf, args form: io_num=n base_num=m. For example, args are
145 * "io_num=20 base_num=10", this function can get io_num value 20 and
148 static int get_ionum_and_basenum(char *args, int *ionum, double *basenum)
150 char split_char[] = " \t";
152 char *str, *str_inval;
154 int flag_io = 0, flag_base = 0;
156 if ((args == NULL) || (ionum == NULL) || (basenum == NULL)) {
157 pp_pl_log(0, "args string is NULL");
161 arg = temp = strdup(args);
165 for (i = 0; i < 2; i++) {
166 str = get_next_string(&temp, split_char);
169 if (!strncmp(str, "io_num=", 7) && strlen(str) > 7) {
170 *ionum = (int)strtoul(str + 7, &str_inval, 10);
171 if (str == str_inval)
175 else if (!strncmp(str, "base_num=", 9) && strlen(str) > 9) {
176 *basenum = strtod(str + 9, &str_inval);
177 if (str == str_inval)
183 if (!flag_io || !flag_base)
185 if (check_args_valid(*ionum, *basenum) == 0)
196 * Do not scale the priority in a certain range such as [0, 1024]
197 * because scaling will eliminate the effect of base_num.
199 int calcPrio(double lg_avglatency, double lg_maxavglatency,
200 double lg_minavglatency)
202 if (lg_avglatency <= lg_minavglatency)
203 return lg_maxavglatency - lg_minavglatency;
205 if (lg_avglatency >= lg_maxavglatency)
208 return lg_maxavglatency - lg_avglatency;
211 int getprio(struct path *pp, char *args, unsigned int timeout)
216 double lg_avglatency, lg_maxavglatency, lg_minavglatency;
217 double standard_deviation;
218 double lg_toldelay = 0;
221 int restore_flags = 0;
223 double sum_squares = 0;
228 if (get_ionum_and_basenum(args, &io_num, &base_num) == 0) {
230 base_num = DEF_BASE_NUM;
231 pp_pl_log(0, "%s: fails to get path_latency args, set default:"
232 "io_num=%d base_num=%.3lf",
233 pp->dev, io_num, base_num);
236 lg_base = log(base_num);
237 lg_maxavglatency = log(MAX_AVG_LATENCY) / lg_base;
238 lg_minavglatency = log(MIN_AVG_LATENCY) / lg_base;
240 if (prepare_directio_read(pp->fd, &blksize, &buf, &restore_flags) < 0)
245 struct timespec tv_before, tv_after, tv_diff;
246 double diff, reldiff;
248 (void)clock_gettime(CLOCK_MONOTONIC, &tv_before);
250 if (do_directio_read(pp->fd, timeout, buf, blksize)) {
251 pp_pl_log(0, "%s: path down", pp->dev);
252 cleanup_directio_read(pp->fd, buf, restore_flags);
256 (void)clock_gettime(CLOCK_MONOTONIC, &tv_after);
258 timespecsub(&tv_after, &tv_before, &tv_diff);
259 diff = tv_diff.tv_sec * 1000 * 1000 + tv_diff.tv_nsec / 1000;
263 * Avoid taking log(0).
264 * This unlikely case is treated as minimum -
265 * the sums don't increase
269 /* we scale by lg_base here */
270 reldiff = log(diff) / lg_base;
273 * We assume that the latency follows a log-normal
274 * distribution. The logarithm of latency is in normal
277 lg_toldelay += reldiff;
278 sum_squares += reldiff * reldiff;
281 cleanup_directio_read(pp->fd, buf, restore_flags);
283 lg_avglatency = lg_toldelay / (long long)io_num;
285 if (lg_avglatency > lg_maxavglatency) {
287 "%s: average latency (%lld us) is outside the thresold (%lld us)",
288 pp->dev, (long long)pow(base_num, lg_avglatency),
289 (long long)MAX_AVG_LATENCY);
290 return DEFAULT_PRIORITY;
293 standard_deviation = sqrt((sum_squares - lg_toldelay * lg_avglatency)
296 rc = calcPrio(lg_avglatency, lg_maxavglatency, lg_minavglatency);
298 pp_pl_log(3, "%s: latency avg=%.2e uncertainty=%.1f prio=%d\n",
299 pp->dev, exp(lg_avglatency * lg_base),
300 exp(standard_deviation * lg_base), rc);