fs: dlm: remove waiter warnings
author Alexander Aring <aahringo@redhat.com>
Wed, 22 Jun 2022 18:45:20 +0000 (14:45 -0400)
committer David Teigland <teigland@redhat.com>
Fri, 24 Jun 2022 16:57:52 +0000 (11:57 -0500)
This patch removes warning messages that could be logged when
remote requests had been waiting on a reply message for some timeout
period (which could be set through configfs, but was rarely enabled).
The improved midcomms layer now carefully tracks all messages and
replies, and logs much more useful messages if there is an actual
problem.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
fs/dlm/config.c
fs/dlm/config.h
fs/dlm/dlm_internal.h
fs/dlm/lock.c
fs/dlm/lock.h
fs/dlm/lockspace.c

index 42eee278375620e86c6e6ae9a5844bf73bb2238c..081fd201e3a8554e40f7a212cb76457937f7d711 100644 (file)
@@ -76,7 +76,6 @@ struct dlm_cluster {
        unsigned int cl_protocol;
        unsigned int cl_mark;
        unsigned int cl_timewarn_cs;
-       unsigned int cl_waitwarn_us;
        unsigned int cl_new_rsb_count;
        unsigned int cl_recover_callbacks;
        char cl_cluster_name[DLM_LOCKSPACE_LEN];
@@ -103,7 +102,6 @@ enum {
        CLUSTER_ATTR_PROTOCOL,
        CLUSTER_ATTR_MARK,
        CLUSTER_ATTR_TIMEWARN_CS,
-       CLUSTER_ATTR_WAITWARN_US,
        CLUSTER_ATTR_NEW_RSB_COUNT,
        CLUSTER_ATTR_RECOVER_CALLBACKS,
        CLUSTER_ATTR_CLUSTER_NAME,
@@ -225,7 +223,6 @@ CLUSTER_ATTR(log_info, NULL);
 CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
 CLUSTER_ATTR(mark, NULL);
 CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
-CLUSTER_ATTR(waitwarn_us, NULL);
 CLUSTER_ATTR(new_rsb_count, NULL);
 CLUSTER_ATTR(recover_callbacks, NULL);
 
@@ -241,7 +238,6 @@ static struct configfs_attribute *cluster_attrs[] = {
        [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
        [CLUSTER_ATTR_MARK] = &cluster_attr_mark,
        [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
-       [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
        [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
        [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
        [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
@@ -433,7 +429,6 @@ static struct config_group *make_cluster(struct config_group *g,
        cl->cl_log_info = dlm_config.ci_log_info;
        cl->cl_protocol = dlm_config.ci_protocol;
        cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
-       cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
        cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
        cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
        memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
@@ -955,7 +950,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 #define DEFAULT_PROTOCOL           DLM_PROTO_TCP
 #define DEFAULT_MARK               0
 #define DEFAULT_TIMEWARN_CS      500 /* 5 sec = 500 centiseconds */
-#define DEFAULT_WAITWARN_US       0
 #define DEFAULT_NEW_RSB_COUNT    128
 #define DEFAULT_RECOVER_CALLBACKS  0
 #define DEFAULT_CLUSTER_NAME      ""
@@ -972,7 +966,6 @@ struct dlm_config_info dlm_config = {
        .ci_protocol = DEFAULT_PROTOCOL,
        .ci_mark = DEFAULT_MARK,
        .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
-       .ci_waitwarn_us = DEFAULT_WAITWARN_US,
        .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
        .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
        .ci_cluster_name = DEFAULT_CLUSTER_NAME
index df92b0a07fc6c5300940f27f2ba30eadbda651c1..cb23d018e863cdeccbacfaae458d5950058f707b 100644 (file)
@@ -38,7 +38,6 @@ struct dlm_config_info {
        int ci_protocol;
        int ci_mark;
        int ci_timewarn_cs;
-       int ci_waitwarn_us;
        int ci_new_rsb_count;
        int ci_recover_callbacks;
        char ci_cluster_name[DLM_LOCKSPACE_LEN];
index afec22b1a65fb078172b74f64401d50f548e0850..84dad619081ee1cf7935dde7f646516fca5c787b 100644 (file)
@@ -259,7 +259,6 @@ struct dlm_lkb {
        struct list_head        lkb_ownqueue;   /* list of locks for a process */
        struct list_head        lkb_time_list;
        ktime_t                 lkb_timestamp;
-       ktime_t                 lkb_wait_time;
        unsigned long           lkb_timeout_cs;
 
        struct mutex            lkb_cb_mutex;
index e80d42ba64ae75bc5d7c9d3bf2b71c8b677d41c6..080cd216a9a4d282ad440fd98e5edb7a39a1eff0 100644 (file)
@@ -1402,75 +1402,6 @@ static int msg_reply_type(int mstype)
        return -1;
 }
 
-static int nodeid_warned(int nodeid, int num_nodes, int *warned)
-{
-       int i;
-
-       for (i = 0; i < num_nodes; i++) {
-               if (!warned[i]) {
-                       warned[i] = nodeid;
-                       return 0;
-               }
-               if (warned[i] == nodeid)
-                       return 1;
-       }
-       return 0;
-}
-
-void dlm_scan_waiters(struct dlm_ls *ls)
-{
-       struct dlm_lkb *lkb;
-       s64 us;
-       s64 debug_maxus = 0;
-       u32 debug_scanned = 0;
-       u32 debug_expired = 0;
-       int num_nodes = 0;
-       int *warned = NULL;
-
-       if (!dlm_config.ci_waitwarn_us)
-               return;
-
-       mutex_lock(&ls->ls_waiters_mutex);
-
-       list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
-               if (!lkb->lkb_wait_time)
-                       continue;
-
-               debug_scanned++;
-
-               us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
-
-               if (us < dlm_config.ci_waitwarn_us)
-                       continue;
-
-               lkb->lkb_wait_time = 0;
-
-               debug_expired++;
-               if (us > debug_maxus)
-                       debug_maxus = us;
-
-               if (!num_nodes) {
-                       num_nodes = ls->ls_num_nodes;
-                       warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
-               }
-               if (!warned)
-                       continue;
-               if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
-                       continue;
-
-               log_error(ls, "waitwarn %x %lld %d us check connection to "
-                         "node %d", lkb->lkb_id, (long long)us,
-                         dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
-       }
-       mutex_unlock(&ls->ls_waiters_mutex);
-       kfree(warned);
-
-       if (debug_expired)
-               log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
-                         debug_scanned, debug_expired,
-                         dlm_config.ci_waitwarn_us, (long long)debug_maxus);
-}
-
 /* add/remove lkb from global waiters list of lkb's waiting for
    a reply from a remote node */
 
@@ -1514,7 +1445,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 
        lkb->lkb_wait_count++;
        lkb->lkb_wait_type = mstype;
-       lkb->lkb_wait_time = ktime_get();
        lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
        hold_lkb(lkb);
        list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
@@ -1962,16 +1892,6 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
        list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
                lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
        mutex_unlock(&ls->ls_timeout_mutex);
-
-       if (!dlm_config.ci_waitwarn_us)
-               return;
-
-       mutex_lock(&ls->ls_waiters_mutex);
-       list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
-               if (ktime_to_us(lkb->lkb_wait_time))
-                       lkb->lkb_wait_time = ktime_get();
-       }
-       mutex_unlock(&ls->ls_waiters_mutex);
 }
 
 /* lkb is master or local copy */
index 252a5898f9081de8786c1cf2bf17879a5da665d8..40781d9a24df30d1c8c418e59e20901c23682676 100644 (file)
@@ -24,7 +24,6 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
 void dlm_scan_rsbs(struct dlm_ls *ls);
 int dlm_lock_recovery_try(struct dlm_ls *ls);
 void dlm_unlock_recovery(struct dlm_ls *ls);
-void dlm_scan_waiters(struct dlm_ls *ls);
 void dlm_scan_timeout(struct dlm_ls *ls);
 void dlm_adjust_timeouts(struct dlm_ls *ls);
 int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
index 0c3613d09c5e3059328035733956f797ebecfe80..ca1eca0809d416fda93ffd70590ce164baf64c49 100644 (file)
@@ -275,7 +275,6 @@ static int dlm_scand(void *data)
                                ls->ls_scan_time = jiffies;
                                dlm_scan_rsbs(ls);
                                dlm_scan_timeout(ls);
-                               dlm_scan_waiters(ls);
                                dlm_unlock_recovery(ls);
                        } else {
                                ls->ls_scan_time += HZ;