staging/lustre/obdclass: Eliminate hash bucket scans in lu_cache_shrink

author Ann Koehler <amk@cray.com>

Mon, 14 Sep 2015 22:41:30 +0000 (18:41 -0400)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 15 Sep 2015 13:26:54 +0000 (06:26 -0700)
author Ann Koehler <amk@cray.com>
Mon, 14 Sep 2015 22:41:30 +0000 (18:41 -0400)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 15 Sep 2015 13:26:54 +0000 (06:26 -0700)
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h

index ea13a82..96e271d 100644 (file)
--- a/drivers/staging/lustre/lustre/include/lu_object.h
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -584,6 +584,7 @@ enum {
         LU_SS_CACHE_RACE,
         LU_SS_CACHE_DEATH_RACE,
         LU_SS_LRU_PURGED,
+       LU_SS_LRU_LEN,  /* # of objects in lsb_lru lists */
         LU_SS_LAST_STAT
  };
  
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c

index 4f7899f..c892e82 100644 (file)
--- a/drivers/staging/lustre/lustre/obdclass/lu_object.c
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -59,6 +59,7 @@
  #include <linux/list.h>
  
  static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx);
  
  /**
   * Decrease reference counter on object. If last reference is freed, return
@@ -126,6 +127,9 @@ void lu_object_put(const struct lu_env *env, struct lu_object *o)
                 LASSERT(list_empty(&top->loh_lru));
                 list_add_tail(&top->loh_lru, &bkt->lsb_lru);
                 bkt->lsb_lru_len++;
+               lprocfs_counter_incr(site->ls_stats, LU_SS_LRU_LEN);
+               CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p, lru_len: %ld\n",
+                      o, site->ls_obj_hash, bkt, bkt->lsb_lru_len);
                 cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
                 return;
         }
@@ -174,16 +178,18 @@ void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
         top = o->lo_header;
         set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags);
         if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) {
-               struct cfs_hash *obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+               struct lu_site *site = o->lo_dev->ld_site;
+               struct cfs_hash *obj_hash = site->ls_obj_hash;
                 struct cfs_hash_bd bd;
  
                 cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1);
                 if (!list_empty(&top->loh_lru)) {
                         struct lu_site_bkt_data *bkt;
  
-               list_del_init(&top->loh_lru);
+                       list_del_init(&top->loh_lru);
                         bkt = cfs_hash_bd_extra_get(obj_hash, &bd);
                         bkt->lsb_lru_len--;
+                       lprocfs_counter_decr(site->ls_stats, LU_SS_LRU_LEN);
                 }
                 cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash);
                 cfs_hash_bd_unlock(obj_hash, &bd, 1);
@@ -355,6 +361,7 @@ int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
                                                &bd2, &h->loh_hash);
                         list_move(&h->loh_lru, &dispose);
                         bkt->lsb_lru_len--;
+                       lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN);
                         if (did_sth == 0)
                                 did_sth = 1;
  
@@ -568,8 +575,9 @@ static struct lu_object *htable_lookup(struct lu_site *s,
                 cfs_hash_get(s->ls_obj_hash, hnode);
                 lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
                 if (!list_empty(&h->loh_lru)) {
-               list_del_init(&h->loh_lru);
+                       list_del_init(&h->loh_lru);
                         bkt->lsb_lru_len--;
+                       lprocfs_counter_decr(s->ls_stats, LU_SS_LRU_LEN);
                 }
                 return lu_object_top(h);
         }
@@ -1029,6 +1037,12 @@ int lu_site_init(struct lu_site *s, struct lu_device *top)
                              0, "cache_death_race", "cache_death_race");
         lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
                              0, "lru_purged", "lru_purged");
+       /*
+        * Unlike other counters, lru_len can be decremented so
+        * need lc_sum instead of just lc_count
+        */
+       lprocfs_counter_init(s->ls_stats, LU_SS_LRU_LEN,
+                            LPROCFS_CNTR_AVGMINMAX, "lru_len", "lru_len");
  
         INIT_LIST_HEAD(&s->ls_linkage);
         s->ls_top_dev = top;
@@ -1817,27 +1831,21 @@ static void lu_site_stats_get(struct cfs_hash *hs,
  
  
  /*
- * There exists a potential lock inversion deadlock scenario when using
- * Lustre on top of ZFS. This occurs between one of ZFS's
- * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
- * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
- * while thread B will take the ht_lock and sleep on the lu_sites_guard
- * lock. Obviously neither thread will wake and drop their respective hold
- * on their lock.
+ * lu_cache_shrink_count returns the number of cached objects that are
+ * candidates to be freed by shrink_slab(). A counter, which tracks
+ * the number of items in the site's lru, is maintained in the per cpu
+ * stats of each site. The counter is incremented when an object is added
+ * to a site's lru and decremented when one is removed. The number of
+ * free-able objects is the sum of all per cpu counters for all sites.
   *
- * To prevent this from happening we must ensure the lu_sites_guard lock is
- * not taken while down this code path. ZFS reliably does not set the
- * __GFP_FS bit in its code paths, so this can be used to determine if it
- * is safe to take the lu_sites_guard lock.
- *
- * Ideally we should accurately return the remaining number of cached
- * objects without taking the  lu_sites_guard lock, but this is not
- * possible in the current implementation.
+ * Using a per cpu counter is a compromise solution to concurrent access:
+ * lu_object_put() can update the counter without locking the site and
+ * lu_cache_shrink_count can sum the counters without locking each
+ * ls_obj_hash bucket.
   */
  static unsigned long lu_cache_shrink_count(struct shrinker *sk,
                                            struct shrink_control *sc)
  {
-       struct lu_site_stats stats;
         struct lu_site *s;
         struct lu_site *tmp;
         unsigned long cached = 0;
@@ -1847,14 +1855,14 @@ static unsigned long lu_cache_shrink_count(struct shrinker *sk,
  
         mutex_lock(&lu_sites_guard);
         list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
-               memset(&stats, 0, sizeof(stats));
-               lu_site_stats_get(s->ls_obj_hash, &stats, 0);
-               cached += stats.lss_total - stats.lss_busy;
+               cached += ls_stats_read(s->ls_stats, LU_SS_LRU_LEN);
         }
         mutex_unlock(&lu_sites_guard);
  
         cached = (cached / 100) * sysctl_vfs_cache_pressure;
-       CDEBUG(D_INODE, "%ld objects cached\n", cached);
+       CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n",
+              cached, sysctl_vfs_cache_pressure);
+
         return cached;
  }
  
@@ -1988,6 +1996,13 @@ static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
         struct lprocfs_counter ret;
  
         lprocfs_stats_collect(stats, idx, &ret);
+       if (idx == LU_SS_LRU_LEN)
+               /*
+                * protect against counter on cpu A being decremented
+                * before counter is incremented on cpu B; unlikely
+                */
+               return (__u32)((ret.lc_sum > 0) ? ret.lc_sum : 0);
+
         return (__u32)ret.lc_count;
  }
  
@@ -2002,7 +2017,7 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
         memset(&stats, 0, sizeof(stats));
         lu_site_stats_get(s->ls_obj_hash, &stats, 1);
  
-       seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n",
+       seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d %d\n",
                    stats.lss_busy,
                    stats.lss_total,
                    stats.lss_populated,
@@ -2013,7 +2028,8 @@ int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
                    ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
                    ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
                    ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
-                  ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+                  ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED),
+                  ls_stats_read(s->ls_stats, LU_SS_LRU_LEN));
         return 0;
  }
  EXPORT_SYMBOL(lu_site_stats_print);
author	Ann Koehler <amk@cray.com>
	Mon, 14 Sep 2015 22:41:30 +0000 (18:41 -0400)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 15 Sep 2015 13:26:54 +0000 (06:26 -0700)
drivers/staging/lustre/lustre/include/lu_object.h		patch \| blob \| history
drivers/staging/lustre/lustre/obdclass/lu_object.c		patch \| blob \| history