bcache: Write out full stripes
author Kent Overstreet <koverstreet@google.com>
Wed, 5 Jun 2013 13:24:39 +0000 (06:24 -0700)
committer Kent Overstreet <koverstreet@google.com>
Thu, 27 Jun 2013 04:58:04 +0000 (21:58 -0700)
Now that we're tracking dirty data per stripe, we can add two
optimizations for raid5/6:

 * If a stripe is already dirty, force writes to that stripe into
   writeback mode, to help build up full stripes of dirty data

 * When flushing dirty data, write out full stripes first, if there
   are any (a simplified sketch of both heuristics follows).
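
A minimal userspace sketch of the two heuristics, for reference. This is
a simplification rather than the patch itself: the kernel's per-device
atomic_t stripe bookkeeping is replaced by a plain array, the stripe
geometry is an assumed toy value, and the detaching, would_skip and
partial_stripes_expensive checks of the real should_writeback() are
omitted.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Assumed toy geometry: 64-sector stripes, 8 stripes total. */
  #define STRIPE_SIZE_BITS	6
  #define STRIPE_SECTORS	(1U << STRIPE_SIZE_BITS)
  #define NR_STRIPES		8

  /* Cutoffs as this patch moves them into writeback.h. */
  #define CUTOFF_WRITEBACK	40
  #define CUTOFF_WRITEBACK_SYNC	70

  /* Plain array standing in for the kernel's atomic_t
   * stripe_sectors_dirty[] counters. */
  static unsigned stripe_sectors_dirty[NR_STRIPES];

  /* True if any stripe touched by [offset, offset + nr_sectors)
   * already has dirty sectors (mirrors bcache_dev_stripe_dirty()). */
  static bool stripe_dirty(uint64_t offset, unsigned nr_sectors)
  {
  	uint64_t stripe = offset >> STRIPE_SIZE_BITS;

  	while (1) {
  		if (stripe_sectors_dirty[stripe])
  			return true;
  		if (nr_sectors <= STRIPE_SECTORS)
  			return false;
  		nr_sectors -= STRIPE_SECTORS;
  		stripe++;
  	}
  }

  /* Optimization 1: route a write to writeback if it lands on an
   * already-dirty stripe, so the stripe fills up instead of being
   * written through. */
  static bool should_writeback_sketch(uint64_t offset, unsigned nr_sectors,
  				    bool sync, unsigned in_use)
  {
  	if (in_use > CUTOFF_WRITEBACK_SYNC)
  		return false;
  	if (stripe_dirty(offset, nr_sectors))
  		return true;
  	return sync || in_use <= CUTOFF_WRITEBACK;
  }

  /* Optimization 2: when flushing, accept only keys whose stripes are
   * all completely dirty, i.e. writable as full stripes (mirrors
   * dirty_full_stripe_pred()). */
  static bool full_stripes_only(uint64_t offset, unsigned nr_sectors)
  {
  	uint64_t stripe = offset >> STRIPE_SIZE_BITS;

  	while (1) {
  		if (stripe_sectors_dirty[stripe] != STRIPE_SECTORS)
  			return false;
  		if (nr_sectors <= STRIPE_SECTORS)
  			return true;
  		nr_sectors -= STRIPE_SECTORS;
  		stripe++;
  	}
  }

  int main(void)
  {
  	stripe_sectors_dirty[1] = 10;		   /* partially dirty */
  	stripe_sectors_dirty[2] = STRIPE_SECTORS;  /* full stripe */

  	/* Async write onto the partially dirty stripe, cache 50% full:
  	 * goes to writeback only because the stripe is already dirty. */
  	printf("writeback? %d\n",
  	       should_writeback_sketch(1 * STRIPE_SECTORS, 16, false, 50));
  	/* Only the completely dirty stripe qualifies for preferential
  	 * flushing. */
  	printf("stripe 2 full? %d\n",
  	       full_stripes_only(2 * STRIPE_SECTORS, STRIPE_SECTORS));
  	printf("stripe 1 full? %d\n",
  	       full_stripes_only(1 * STRIPE_SECTORS, 10));
  	return 0;
  }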

Signed-off-by: Kent Overstreet <koverstreet@google.com>
drivers/md/bcache/bcache.h
drivers/md/bcache/btree.c
drivers/md/bcache/btree.h
drivers/md/bcache/debug.c
drivers/md/bcache/movinggc.c
drivers/md/bcache/request.c
drivers/md/bcache/sysfs.c
drivers/md/bcache/writeback.c
drivers/md/bcache/writeback.h

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dbddef0..342ba86 100644
@@ -387,8 +387,6 @@ struct keybuf_key {
 typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
 
 struct keybuf {
-       keybuf_pred_fn          *key_predicate;
-
        struct bkey             last_scanned;
        spinlock_t              lock;
 
@@ -532,6 +530,7 @@ struct cached_dev {
        unsigned                sequential_merge:1;
        unsigned                verify:1;
 
+       unsigned                partial_stripes_expensive:1;
        unsigned                writeback_metadata:1;
        unsigned                writeback_running:1;
        unsigned char           writeback_percent;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index b93cf56..09fb8a2 100644
@@ -2252,7 +2252,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
 }
 
 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
-                                  struct keybuf *buf, struct bkey *end)
+                                  struct keybuf *buf, struct bkey *end,
+                                  keybuf_pred_fn *pred)
 {
        struct btree_iter iter;
        bch_btree_iter_init(b, &iter, &buf->last_scanned);
@@ -2271,7 +2272,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
                        if (bkey_cmp(&buf->last_scanned, end) >= 0)
                                break;
 
-                       if (buf->key_predicate(buf, k)) {
+                       if (pred(buf, k)) {
                                struct keybuf_key *w;
 
                                spin_lock(&buf->lock);
@@ -2290,7 +2291,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
                        if (!k)
                                break;
 
-                       btree(refill_keybuf, k, b, op, buf, end);
+                       btree(refill_keybuf, k, b, op, buf, end, pred);
                        /*
                         * Might get an error here, but can't really do anything
                         * and it'll get logged elsewhere. Just read what we
@@ -2308,7 +2309,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
 }
 
 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
-                         struct bkey *end)
+                      struct bkey *end, keybuf_pred_fn *pred)
 {
        struct bkey start = buf->last_scanned;
        struct btree_op op;
@@ -2316,7 +2317,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
 
        cond_resched();
 
-       btree_root(refill_keybuf, c, &op, buf, end);
+       btree_root(refill_keybuf, c, &op, buf, end, pred);
        closure_sync(&op.cl);
 
        pr_debug("found %s keys from %llu:%llu to %llu:%llu",
@@ -2402,7 +2403,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
 
 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
                                             struct keybuf *buf,
-                                            struct bkey *end)
+                                            struct bkey *end,
+                                            keybuf_pred_fn *pred)
 {
        struct keybuf_key *ret;
 
@@ -2416,15 +2418,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
                        break;
                }
 
-               bch_refill_keybuf(c, buf, end);
+               bch_refill_keybuf(c, buf, end, pred);
        }
 
        return ret;
 }
 
-void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn)
+void bch_keybuf_init(struct keybuf *buf)
 {
-       buf->key_predicate      = fn;
        buf->last_scanned       = MAX_KEY;
        buf->keys               = RB_ROOT;
 
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 2b016b9..f66d69a 100644
@@ -391,13 +391,14 @@ void bch_moving_gc(struct closure *);
 int bch_btree_check(struct cache_set *, struct btree_op *);
 uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
 
-void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
-void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
+void bch_keybuf_init(struct keybuf *);
+void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
+                      keybuf_pred_fn *);
 bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
                                  struct bkey *);
 void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
 struct keybuf_key *bch_keybuf_next(struct keybuf *);
-struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
-                                         struct keybuf *, struct bkey *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
+                                         struct bkey *, keybuf_pred_fn *);
 
 #endif
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 82e3a07..1c8fd31 100644
@@ -357,7 +357,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
                if (i->bytes)
                        break;
 
-               w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
+               w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
                if (!w)
                        break;
 
@@ -380,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
 
        file->private_data = i;
        i->c = c;
-       bch_keybuf_init(&i->keys, dump_pred);
+       bch_keybuf_init(&i->keys);
        i->keys.last_scanned = KEY(0, 0, 0);
 
        return 0;
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 04f6b97..a241e9f 100644
@@ -136,7 +136,8 @@ static void read_moving(struct closure *cl)
        /* XXX: if we error, background writeback could stall indefinitely */
 
        while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
-               w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
+               w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
+                                          &MAX_KEY, moving_pred);
                if (!w)
                        break;
 
@@ -248,5 +249,5 @@ void bch_moving_gc(struct closure *cl)
 
 void bch_moving_init_cache_set(struct cache_set *c)
 {
-       bch_keybuf_init(&c->moving_gc_keys, moving_pred);
+       bch_keybuf_init(&c->moving_gc_keys);
 }
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 017c95f..17bd597 100644
@@ -22,8 +22,6 @@
 
 #define CUTOFF_CACHE_ADD       95
 #define CUTOFF_CACHE_READA     90
-#define CUTOFF_WRITEBACK       50
-#define CUTOFF_WRITEBACK_SYNC  75
 
 struct kmem_cache *bch_search_cache;
 
@@ -998,17 +996,6 @@ static void cached_dev_write_complete(struct closure *cl)
        cached_dev_bio_complete(cl);
 }
 
-static bool should_writeback(struct cached_dev *dc, struct bio *bio)
-{
-       unsigned threshold = (bio->bi_rw & REQ_SYNC)
-               ? CUTOFF_WRITEBACK_SYNC
-               : CUTOFF_WRITEBACK;
-
-       return !atomic_read(&dc->disk.detaching) &&
-               cache_mode(dc, bio) == CACHE_MODE_WRITEBACK &&
-               dc->disk.c->gc_stats.in_use < threshold;
-}
-
 static void request_write(struct cached_dev *dc, struct search *s)
 {
        struct closure *cl = &s->cl;
@@ -1030,12 +1017,16 @@ static void request_write(struct cached_dev *dc, struct search *s)
        if (bio->bi_rw & REQ_DISCARD)
                goto skip;
 
+       if (should_writeback(dc, s->orig_bio,
+                            cache_mode(dc, bio),
+                            s->op.skip)) {
+               s->op.skip = false;
+               s->writeback = true;
+       }
+
        if (s->op.skip)
                goto skip;
 
-       if (should_writeback(dc, s->orig_bio))
-               s->writeback = true;
-
        trace_bcache_write(s->orig_bio, s->writeback, s->op.skip);
 
        if (!s->writeback) {
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index cf8d91e..70c6dff 100644
@@ -81,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
 rw_attribute(writeback_rate_d_smooth);
 read_attribute(writeback_rate_debug);
 
+read_attribute(stripe_size);
+read_attribute(partial_stripes_expensive);
+
 rw_attribute(synchronous);
 rw_attribute(journal_delay_ms);
 rw_attribute(discard);
@@ -147,6 +150,9 @@ SHOW(__bch_cached_dev)
        sysfs_hprint(dirty_data,
                     bcache_dev_sectors_dirty(&dc->disk) << 9);
 
+       sysfs_hprint(stripe_size,       (1 << dc->disk.stripe_size_bits) << 9);
+       var_printf(partial_stripes_expensive,   "%u");
+
        var_printf(sequential_merge,    "%i");
        var_hprint(sequential_cutoff);
        var_hprint(readahead);
@@ -286,6 +292,8 @@ static struct attribute *bch_cached_dev_files[] = {
        &sysfs_writeback_rate_d_smooth,
        &sysfs_writeback_rate_debug,
        &sysfs_dirty_data,
+       &sysfs_stripe_size,
+       &sysfs_partial_stripes_expensive,
        &sysfs_sequential_cutoff,
        &sysfs_sequential_merge,
        &sysfs_clear_stats,
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index dd81547..d81ee5c 100644
@@ -108,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
        return KEY_DIRTY(k);
 }
 
+static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
+{
+       uint64_t stripe;
+       unsigned nr_sectors = KEY_SIZE(k);
+       struct cached_dev *dc = container_of(buf, struct cached_dev,
+                                            writeback_keys);
+       unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
+
+       if (!KEY_DIRTY(k))
+               return false;
+
+       stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
+       while (1) {
+               if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
+                   stripe_size)
+                       return false;
+
+               if (nr_sectors <= stripe_size)
+                       return true;
+
+               nr_sectors -= stripe_size;
+               stripe++;
+       }
+}
+
 static void dirty_init(struct keybuf_key *w)
 {
        struct dirty_io *io = w->private;
@@ -152,7 +177,22 @@ static void refill_dirty(struct closure *cl)
                searched_from_start = true;
        }
 
-       bch_refill_keybuf(dc->disk.c, buf, &end);
+       if (dc->partial_stripes_expensive) {
+               uint64_t i;
+
+               for (i = 0; i < dc->disk.nr_stripes; i++)
+                       if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
+                           1 << dc->disk.stripe_size_bits)
+                               goto full_stripes;
+
+               goto normal_refill;
+full_stripes:
+               bch_refill_keybuf(dc->disk.c, buf, &end,
+                                 dirty_full_stripe_pred);
+       } else {
+normal_refill:
+               bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
+       }
 
        if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
                /* Searched the entire btree  - delay awhile */
@@ -446,7 +486,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
        closure_init_unlocked(&dc->writeback);
        init_rwsem(&dc->writeback_lock);
 
-       bch_keybuf_init(&dc->writeback_keys, dirty_pred);
+       bch_keybuf_init(&dc->writeback_keys);
 
        dc->writeback_metadata          = true;
        dc->writeback_running           = true;
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 5ce9771..c91f61b 100644
@@ -1,6 +1,9 @@
 #ifndef _BCACHE_WRITEBACK_H
 #define _BCACHE_WRITEBACK_H
 
+#define CUTOFF_WRITEBACK       40
+#define CUTOFF_WRITEBACK_SYNC  70
+
 static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
 {
        uint64_t i, ret = 0;
@@ -11,6 +14,46 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
        return ret;
 }
 
+static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
+                                          uint64_t offset,
+                                          unsigned nr_sectors)
+{
+       uint64_t stripe = offset >> d->stripe_size_bits;
+
+       while (1) {
+               if (atomic_read(d->stripe_sectors_dirty + stripe))
+                       return true;
+
+               if (nr_sectors <= 1 << d->stripe_size_bits)
+                       return false;
+
+               nr_sectors -= 1 << d->stripe_size_bits;
+               stripe++;
+       }
+}
+
+static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
+                                   unsigned cache_mode, bool would_skip)
+{
+       unsigned in_use = dc->disk.c->gc_stats.in_use;
+
+       if (cache_mode != CACHE_MODE_WRITEBACK ||
+           atomic_read(&dc->disk.detaching) ||
+           in_use > CUTOFF_WRITEBACK_SYNC)
+               return false;
+
+       if (dc->partial_stripes_expensive &&
+           bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
+                                   bio_sectors(bio)))
+               return true;
+
+       if (would_skip)
+               return false;
+
+       return bio->bi_rw & REQ_SYNC ||
+               in_use <= CUTOFF_WRITEBACK;
+}
+
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 void bch_writeback_queue(struct cached_dev *);
 void bch_writeback_add(struct cached_dev *);