Merge remote-tracking branch 'stable/linux-5.15.y' into rpi-5.15.y
[platform/kernel/linux-rpi.git] / drivers / md / dm-era-target.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "dm.h"
3 #include "persistent-data/dm-transaction-manager.h"
4 #include "persistent-data/dm-bitset.h"
5 #include "persistent-data/dm-space-map.h"
6
7 #include <linux/dm-io.h>
8 #include <linux/dm-kcopyd.h>
9 #include <linux/init.h>
10 #include <linux/mempool.h>
11 #include <linux/module.h>
12 #include <linux/slab.h>
13 #include <linux/vmalloc.h>
14
/* Prefix for this target's log messages (e.g. those emitted via DMERR). */
#define DM_MSG_PREFIX "era"

/* Block number at which the superblock is locked, read and written. */
#define SUPERBLOCK_LOCATION 0
/* Value stored in, and expected from, the superblock's 'magic' field. */
#define SUPERBLOCK_MAGIC 2126579579
/* Constant handed to dm_bm_checksum() when checksumming the superblock. */
#define SUPERBLOCK_CSUM_XOR 146538381
/* Inclusive range of metadata versions accepted by check_metadata_version(). */
#define MIN_ERA_VERSION 1
#define MAX_ERA_VERSION 1
/*
 * Root value used to mark a writeset that currently has no on-disk bitset.
 * It aliases the superblock location.
 */
#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
/*
 * NOTE(review): not referenced in the part of the file visible here;
 * presumably the smallest accepted data block size - confirm at the user.
 */
#define MIN_BLOCK_SIZE 8
24
25 /*----------------------------------------------------------------
26  * Writeset
27  *--------------------------------------------------------------*/
/*
 * In-core form of a writeset's on-disk description (see ws_pack() and
 * ws_unpack() for the conversion to and from struct writeset_disk).
 */
struct writeset_metadata {
        uint32_t nr_bits;       /* number of bits in the on-disk bitset */
        dm_block_t root;        /* root block of the on-disk bitset */
};
32
/*
 * A writeset: the description of its on-disk bitset plus an in-core bitmap.
 */
struct writeset {
        /* Size and root of the on-disk bitset. */
        struct writeset_metadata md;

        /*
         * An in core copy of the bits to save constantly doing look ups on
         * disk.  Allocated by writeset_alloc(), released by writeset_free().
         */
        unsigned long *bits;
};
42
43 /*
44  * This does not free off the on disk bitset as this will normally be done
45  * after digesting into the era array.
46  */
47 static void writeset_free(struct writeset *ws)
48 {
49         vfree(ws->bits);
50         ws->bits = NULL;
51 }
52
53 static int setup_on_disk_bitset(struct dm_disk_bitset *info,
54                                 unsigned nr_bits, dm_block_t *root)
55 {
56         int r;
57
58         r = dm_bitset_empty(info, root);
59         if (r)
60                 return r;
61
62         return dm_bitset_resize(info, *root, 0, nr_bits, false, root);
63 }
64
65 static size_t bitset_size(unsigned nr_bits)
66 {
67         return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG);
68 }
69
70 /*
71  * Allocates memory for the in core bitset.
72  */
73 static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
74 {
75         ws->bits = vzalloc(bitset_size(nr_blocks));
76         if (!ws->bits) {
77                 DMERR("%s: couldn't allocate in memory bitset", __func__);
78                 return -ENOMEM;
79         }
80
81         return 0;
82 }
83
84 /*
85  * Wipes the in-core bitset, and creates a new on disk bitset.
86  */
87 static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws,
88                          dm_block_t nr_blocks)
89 {
90         int r;
91
92         memset(ws->bits, 0, bitset_size(nr_blocks));
93
94         ws->md.nr_bits = nr_blocks;
95         r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
96         if (r) {
97                 DMERR("%s: setup_on_disk_bitset failed", __func__);
98                 return r;
99         }
100
101         return 0;
102 }
103
104 static bool writeset_marked(struct writeset *ws, dm_block_t block)
105 {
106         return test_bit(block, ws->bits);
107 }
108
109 static int writeset_marked_on_disk(struct dm_disk_bitset *info,
110                                    struct writeset_metadata *m, dm_block_t block,
111                                    bool *result)
112 {
113         dm_block_t old = m->root;
114
115         /*
116          * The bitset was flushed when it was archived, so we know there'll
117          * be no change to the root.
118          */
119         int r = dm_bitset_test_bit(info, m->root, block, &m->root, result);
120         if (r) {
121                 DMERR("%s: dm_bitset_test_bit failed", __func__);
122                 return r;
123         }
124
125         BUG_ON(m->root != old);
126
127         return r;
128 }
129
130 /*
131  * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was.
132  */
133 static int writeset_test_and_set(struct dm_disk_bitset *info,
134                                  struct writeset *ws, uint32_t block)
135 {
136         int r;
137
138         if (!test_bit(block, ws->bits)) {
139                 r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
140                 if (r) {
141                         /* FIXME: fail mode */
142                         return r;
143                 }
144
145                 return 0;
146         }
147
148         return 1;
149 }
150
151 /*----------------------------------------------------------------
152  * On disk metadata layout
153  *--------------------------------------------------------------*/
/* Size in bytes of the buffers that hold a copy of the space map root. */
#define SPACE_MAP_ROOT_SIZE 128
/* Size in bytes of the superblock's uuid field. */
#define UUID_LEN 16

/*
 * On-disk (little-endian, packed) form of struct writeset_metadata.  It is
 * embedded in the superblock and is the value type of the writeset tree.
 */
struct writeset_disk {
        __le32 nr_bits;         /* number of bits in the bitset */
        __le64 root;            /* root block of the bitset */
} __packed;
161
/*
 * On-disk superblock layout (little-endian, packed).  Filled in by
 * prepare_superblock() and sb_prepare_for_write(), validated by sb_check().
 */
struct superblock_disk {
        /* Checksum of the block contents starting at 'flags'. */
        __le32 csum;
        /* Written as 0 by prepare_superblock(). */
        __le32 flags;
        /* Location of this block; compared with the real location on read. */
        __le64 blocknr;

        /* Zeroed on every write; currently unused. */
        __u8 uuid[UUID_LEN];
        /* Must equal SUPERBLOCK_MAGIC. */
        __le64 magic;
        /* Must lie within [MIN_ERA_VERSION, MAX_ERA_VERSION]. */
        __le32 version;

        /* Copy of the metadata space map root. */
        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

        /* Must match the data block size the metadata is opened with. */
        __le32 data_block_size;
        /* DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT. */
        __le32 metadata_block_size;
        __le32 nr_blocks;

        __le32 current_era;
        struct writeset_disk current_writeset;

        /*
         * Only these two fields are valid within the metadata snapshot.
         */
        __le64 writeset_tree_root;
        __le64 era_array_root;

        /* Stored from, and loaded into, era_metadata::metadata_snap. */
        __le64 metadata_snap;
} __packed;
188
189 /*----------------------------------------------------------------
190  * Superblock validation
191  *--------------------------------------------------------------*/
192 static void sb_prepare_for_write(struct dm_block_validator *v,
193                                  struct dm_block *b,
194                                  size_t sb_block_size)
195 {
196         struct superblock_disk *disk = dm_block_data(b);
197
198         disk->blocknr = cpu_to_le64(dm_block_location(b));
199         disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
200                                                 sb_block_size - sizeof(__le32),
201                                                 SUPERBLOCK_CSUM_XOR));
202 }
203
204 static int check_metadata_version(struct superblock_disk *disk)
205 {
206         uint32_t metadata_version = le32_to_cpu(disk->version);
207         if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) {
208                 DMERR("Era metadata version %u found, but only versions between %u and %u supported.",
209                       metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION);
210                 return -EINVAL;
211         }
212
213         return 0;
214 }
215
216 static int sb_check(struct dm_block_validator *v,
217                     struct dm_block *b,
218                     size_t sb_block_size)
219 {
220         struct superblock_disk *disk = dm_block_data(b);
221         __le32 csum_le;
222
223         if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) {
224                 DMERR("sb_check failed: blocknr %llu: wanted %llu",
225                       le64_to_cpu(disk->blocknr),
226                       (unsigned long long)dm_block_location(b));
227                 return -ENOTBLK;
228         }
229
230         if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) {
231                 DMERR("sb_check failed: magic %llu: wanted %llu",
232                       le64_to_cpu(disk->magic),
233                       (unsigned long long) SUPERBLOCK_MAGIC);
234                 return -EILSEQ;
235         }
236
237         csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags,
238                                              sb_block_size - sizeof(__le32),
239                                              SUPERBLOCK_CSUM_XOR));
240         if (csum_le != disk->csum) {
241                 DMERR("sb_check failed: csum %u: wanted %u",
242                       le32_to_cpu(csum_le), le32_to_cpu(disk->csum));
243                 return -EILSEQ;
244         }
245
246         return check_metadata_version(disk);
247 }
248
/*
 * Validator passed to the block manager by superblock_read_lock(),
 * superblock_lock_zero() and superblock_lock().
 */
static struct dm_block_validator sb_validator = {
        .name = "superblock",
        .prepare_for_write = sb_prepare_for_write,
        .check = sb_check
};
254
255 /*----------------------------------------------------------------
256  * Low level metadata handling
257  *--------------------------------------------------------------*/
/* Block size, in bytes, handed to dm_block_manager_create(). */
#define DM_ERA_METADATA_BLOCK_SIZE 4096
/* Lock count handed to dm_block_manager_create(). */
#define ERA_MAX_CONCURRENT_LOCKS 5

/*
 * In-core state of an era metadata device.
 */
struct era_metadata {
        /* Device the block manager is created on. */
        struct block_device *bdev;
        struct dm_block_manager *bm;
        struct dm_space_map *sm;
        struct dm_transaction_manager *tm;

        /* Data block size; must match the value stored in the superblock. */
        dm_block_t block_size;
        uint32_t nr_blocks;

        uint32_t current_era;

        /*
         * We preallocate 2 writesets.  When an era rolls over we
         * switch between them. This means the allocation is done at
         * preresume time, rather than on the io path.
         */
        struct writeset writesets[2];
        /* Published with rcu_assign_pointer() in swap_writeset(). */
        struct writeset *current_writeset;

        dm_block_t writeset_tree_root;
        dm_block_t era_array_root;

        struct dm_disk_bitset bitset_info;
        struct dm_btree_info writeset_tree_info;
        struct dm_array_info era_array_info;

        /* SUPERBLOCK_LOCATION is treated as "no snapshot" by metadata_take_snap(). */
        dm_block_t metadata_snap;

        /*
         * A flag that is set whenever a writeset has been archived.
         */
        bool archived_writesets;

        /*
         * Reading the space map root can fail, so we read it into this
         * buffer before the superblock is locked and updated.
         */
        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
300
301 static int superblock_read_lock(struct era_metadata *md,
302                                 struct dm_block **sblock)
303 {
304         return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION,
305                                &sb_validator, sblock);
306 }
307
308 static int superblock_lock_zero(struct era_metadata *md,
309                                 struct dm_block **sblock)
310 {
311         return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION,
312                                      &sb_validator, sblock);
313 }
314
315 static int superblock_lock(struct era_metadata *md,
316                            struct dm_block **sblock)
317 {
318         return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION,
319                                 &sb_validator, sblock);
320 }
321
322 /* FIXME: duplication with cache and thin */
323 static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
324 {
325         int r;
326         unsigned i;
327         struct dm_block *b;
328         __le64 *data_le, zero = cpu_to_le64(0);
329         unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
330
331         /*
332          * We can't use a validator here - it may be all zeroes.
333          */
334         r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b);
335         if (r)
336                 return r;
337
338         data_le = dm_block_data(b);
339         *result = true;
340         for (i = 0; i < sb_block_size; i++) {
341                 if (data_le[i] != zero) {
342                         *result = false;
343                         break;
344                 }
345         }
346
347         dm_bm_unlock(b);
348
349         return 0;
350 }
351
352 /*----------------------------------------------------------------*/
353
354 static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk)
355 {
356         disk->nr_bits = cpu_to_le32(core->nr_bits);
357         disk->root = cpu_to_le64(core->root);
358 }
359
360 static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core)
361 {
362         core->nr_bits = le32_to_cpu(disk->nr_bits);
363         core->root = le64_to_cpu(disk->root);
364 }
365
366 static void ws_inc(void *context, const void *value, unsigned count)
367 {
368         struct era_metadata *md = context;
369         struct writeset_disk ws_d;
370         dm_block_t b;
371         unsigned i;
372
373         for (i = 0; i < count; i++) {
374                 memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
375                 b = le64_to_cpu(ws_d.root);
376                 dm_tm_inc(md->tm, b);
377         }
378 }
379
380 static void ws_dec(void *context, const void *value, unsigned count)
381 {
382         struct era_metadata *md = context;
383         struct writeset_disk ws_d;
384         dm_block_t b;
385         unsigned i;
386
387         for (i = 0; i < count; i++) {
388                 memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
389                 b = le64_to_cpu(ws_d.root);
390                 dm_bitset_del(&md->bitset_info, b);
391         }
392 }
393
394 static int ws_eq(void *context, const void *value1, const void *value2)
395 {
396         return !memcmp(value1, value2, sizeof(struct writeset_disk));
397 }
398
399 /*----------------------------------------------------------------*/
400
401 static void setup_writeset_tree_info(struct era_metadata *md)
402 {
403         struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type;
404         md->writeset_tree_info.tm = md->tm;
405         md->writeset_tree_info.levels = 1;
406         vt->context = md;
407         vt->size = sizeof(struct writeset_disk);
408         vt->inc = ws_inc;
409         vt->dec = ws_dec;
410         vt->equal = ws_eq;
411 }
412
413 static void setup_era_array_info(struct era_metadata *md)
414
415 {
416         struct dm_btree_value_type vt;
417         vt.context = NULL;
418         vt.size = sizeof(__le32);
419         vt.inc = NULL;
420         vt.dec = NULL;
421         vt.equal = NULL;
422
423         dm_array_info_init(&md->era_array_info, md->tm, &vt);
424 }
425
426 static void setup_infos(struct era_metadata *md)
427 {
428         dm_disk_bitset_init(md->tm, &md->bitset_info);
429         setup_writeset_tree_info(md);
430         setup_era_array_info(md);
431 }
432
433 /*----------------------------------------------------------------*/
434
435 static int create_fresh_metadata(struct era_metadata *md)
436 {
437         int r;
438
439         r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION,
440                                  &md->tm, &md->sm);
441         if (r < 0) {
442                 DMERR("dm_tm_create_with_sm failed");
443                 return r;
444         }
445
446         setup_infos(md);
447
448         r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root);
449         if (r) {
450                 DMERR("couldn't create new writeset tree");
451                 goto bad;
452         }
453
454         r = dm_array_empty(&md->era_array_info, &md->era_array_root);
455         if (r) {
456                 DMERR("couldn't create era array");
457                 goto bad;
458         }
459
460         return 0;
461
462 bad:
463         dm_sm_destroy(md->sm);
464         dm_tm_destroy(md->tm);
465
466         return r;
467 }
468
469 static int save_sm_root(struct era_metadata *md)
470 {
471         int r;
472         size_t metadata_len;
473
474         r = dm_sm_root_size(md->sm, &metadata_len);
475         if (r < 0)
476                 return r;
477
478         return dm_sm_copy_root(md->sm, &md->metadata_space_map_root,
479                                metadata_len);
480 }
481
482 static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk)
483 {
484         memcpy(&disk->metadata_space_map_root,
485                &md->metadata_space_map_root,
486                sizeof(md->metadata_space_map_root));
487 }
488
489 /*
490  * Writes a superblock, including the static fields that don't get updated
491  * with every commit (possible optimisation here).  'md' should be fully
492  * constructed when this is called.
493  */
494 static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk)
495 {
496         disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
497         disk->flags = cpu_to_le32(0ul);
498
499         /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */
500         memset(disk->uuid, 0, sizeof(disk->uuid));
501         disk->version = cpu_to_le32(MAX_ERA_VERSION);
502
503         copy_sm_root(md, disk);
504
505         disk->data_block_size = cpu_to_le32(md->block_size);
506         disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
507         disk->nr_blocks = cpu_to_le32(md->nr_blocks);
508         disk->current_era = cpu_to_le32(md->current_era);
509
510         ws_pack(&md->current_writeset->md, &disk->current_writeset);
511         disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root);
512         disk->era_array_root = cpu_to_le64(md->era_array_root);
513         disk->metadata_snap = cpu_to_le64(md->metadata_snap);
514 }
515
516 static int write_superblock(struct era_metadata *md)
517 {
518         int r;
519         struct dm_block *sblock;
520         struct superblock_disk *disk;
521
522         r = save_sm_root(md);
523         if (r) {
524                 DMERR("%s: save_sm_root failed", __func__);
525                 return r;
526         }
527
528         r = superblock_lock_zero(md, &sblock);
529         if (r)
530                 return r;
531
532         disk = dm_block_data(sblock);
533         prepare_superblock(md, disk);
534
535         return dm_tm_commit(md->tm, sblock);
536 }
537
538 /*
539  * Assumes block_size and the infos are set.
540  */
541 static int format_metadata(struct era_metadata *md)
542 {
543         int r;
544
545         r = create_fresh_metadata(md);
546         if (r)
547                 return r;
548
549         r = write_superblock(md);
550         if (r) {
551                 dm_sm_destroy(md->sm);
552                 dm_tm_destroy(md->tm);
553                 return r;
554         }
555
556         return 0;
557 }
558
559 static int open_metadata(struct era_metadata *md)
560 {
561         int r;
562         struct dm_block *sblock;
563         struct superblock_disk *disk;
564
565         r = superblock_read_lock(md, &sblock);
566         if (r) {
567                 DMERR("couldn't read_lock superblock");
568                 return r;
569         }
570
571         disk = dm_block_data(sblock);
572
573         /* Verify the data block size hasn't changed */
574         if (le32_to_cpu(disk->data_block_size) != md->block_size) {
575                 DMERR("changing the data block size (from %u to %llu) is not supported",
576                       le32_to_cpu(disk->data_block_size), md->block_size);
577                 r = -EINVAL;
578                 goto bad;
579         }
580
581         r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
582                                disk->metadata_space_map_root,
583                                sizeof(disk->metadata_space_map_root),
584                                &md->tm, &md->sm);
585         if (r) {
586                 DMERR("dm_tm_open_with_sm failed");
587                 goto bad;
588         }
589
590         setup_infos(md);
591
592         md->nr_blocks = le32_to_cpu(disk->nr_blocks);
593         md->current_era = le32_to_cpu(disk->current_era);
594
595         ws_unpack(&disk->current_writeset, &md->current_writeset->md);
596         md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
597         md->era_array_root = le64_to_cpu(disk->era_array_root);
598         md->metadata_snap = le64_to_cpu(disk->metadata_snap);
599         md->archived_writesets = true;
600
601         dm_bm_unlock(sblock);
602
603         return 0;
604
605 bad:
606         dm_bm_unlock(sblock);
607         return r;
608 }
609
610 static int open_or_format_metadata(struct era_metadata *md,
611                                    bool may_format)
612 {
613         int r;
614         bool unformatted = false;
615
616         r = superblock_all_zeroes(md->bm, &unformatted);
617         if (r)
618                 return r;
619
620         if (unformatted)
621                 return may_format ? format_metadata(md) : -EPERM;
622
623         return open_metadata(md);
624 }
625
626 static int create_persistent_data_objects(struct era_metadata *md,
627                                           bool may_format)
628 {
629         int r;
630
631         md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
632                                          ERA_MAX_CONCURRENT_LOCKS);
633         if (IS_ERR(md->bm)) {
634                 DMERR("could not create block manager");
635                 return PTR_ERR(md->bm);
636         }
637
638         r = open_or_format_metadata(md, may_format);
639         if (r)
640                 dm_block_manager_destroy(md->bm);
641
642         return r;
643 }
644
645 static void destroy_persistent_data_objects(struct era_metadata *md)
646 {
647         dm_sm_destroy(md->sm);
648         dm_tm_destroy(md->tm);
649         dm_block_manager_destroy(md->bm);
650 }
651
652 /*
653  * This waits until all era_map threads have picked up the new filter.
654  */
655 static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset)
656 {
657         rcu_assign_pointer(md->current_writeset, new_writeset);
658         synchronize_rcu();
659 }
660
661 /*----------------------------------------------------------------
662  * Writesets get 'digested' into the main era array.
663  *
664  * We're using a coroutine here so the worker thread can do the digestion,
665  * thus avoiding synchronisation of the metadata.  Digesting a whole
666  * writeset in one go would cause too much latency.
667  *--------------------------------------------------------------*/
/*
 * State of an in-progress digestion of one archived writeset into the era
 * array.
 */
struct digest {
        /* Key (era) of the writeset being digested. */
        uint32_t era;
        /*
         * nr_bits: number of bits to transcribe (writeset size capped at
         * md->nr_blocks); current_bit: where the next step resumes.
         */
        unsigned nr_bits, current_bit;
        /* Unpacked description of the writeset's on-disk bitset. */
        struct writeset_metadata writeset;
        /* The era in on-disk form, written into the era array. */
        __le32 value;
        /* Bitset info private to the digest, re-initialised per writeset. */
        struct dm_disk_bitset info;

        /* Next step to run; NULL when no digestion is in progress. */
        int (*step)(struct era_metadata *, struct digest *);
};
677
678 static int metadata_digest_lookup_writeset(struct era_metadata *md,
679                                            struct digest *d);
680
681 static int metadata_digest_remove_writeset(struct era_metadata *md,
682                                            struct digest *d)
683 {
684         int r;
685         uint64_t key = d->era;
686
687         r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root,
688                             &key, &md->writeset_tree_root);
689         if (r) {
690                 DMERR("%s: dm_btree_remove failed", __func__);
691                 return r;
692         }
693
694         d->step = metadata_digest_lookup_writeset;
695         return 0;
696 }
697
698 #define INSERTS_PER_STEP 100
699
700 static int metadata_digest_transcribe_writeset(struct era_metadata *md,
701                                                struct digest *d)
702 {
703         int r;
704         bool marked;
705         unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits);
706
707         for (b = d->current_bit; b < e; b++) {
708                 r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked);
709                 if (r) {
710                         DMERR("%s: writeset_marked_on_disk failed", __func__);
711                         return r;
712                 }
713
714                 if (!marked)
715                         continue;
716
717                 __dm_bless_for_disk(&d->value);
718                 r = dm_array_set_value(&md->era_array_info, md->era_array_root,
719                                        b, &d->value, &md->era_array_root);
720                 if (r) {
721                         DMERR("%s: dm_array_set_value failed", __func__);
722                         return r;
723                 }
724         }
725
726         if (b == d->nr_bits)
727                 d->step = metadata_digest_remove_writeset;
728         else
729                 d->current_bit = b;
730
731         return 0;
732 }
733
734 static int metadata_digest_lookup_writeset(struct era_metadata *md,
735                                            struct digest *d)
736 {
737         int r;
738         uint64_t key;
739         struct writeset_disk disk;
740
741         r = dm_btree_find_lowest_key(&md->writeset_tree_info,
742                                      md->writeset_tree_root, &key);
743         if (r < 0)
744                 return r;
745
746         d->era = key;
747
748         r = dm_btree_lookup(&md->writeset_tree_info,
749                             md->writeset_tree_root, &key, &disk);
750         if (r) {
751                 if (r == -ENODATA) {
752                         d->step = NULL;
753                         return 0;
754                 }
755
756                 DMERR("%s: dm_btree_lookup failed", __func__);
757                 return r;
758         }
759
760         ws_unpack(&disk, &d->writeset);
761         d->value = cpu_to_le32(key);
762
763         /*
764          * We initialise another bitset info to avoid any caching side effects
765          * with the previous one.
766          */
767         dm_disk_bitset_init(md->tm, &d->info);
768
769         d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
770         d->current_bit = 0;
771         d->step = metadata_digest_transcribe_writeset;
772
773         return 0;
774 }
775
776 static int metadata_digest_start(struct era_metadata *md, struct digest *d)
777 {
778         if (d->step)
779                 return 0;
780
781         memset(d, 0, sizeof(*d));
782         d->step = metadata_digest_lookup_writeset;
783
784         return 0;
785 }
786
787 /*----------------------------------------------------------------
788  * High level metadata interface.  Target methods should use these, and not
789  * the lower level ones.
790  *--------------------------------------------------------------*/
791 static struct era_metadata *metadata_open(struct block_device *bdev,
792                                           sector_t block_size,
793                                           bool may_format)
794 {
795         int r;
796         struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL);
797
798         if (!md)
799                 return NULL;
800
801         md->bdev = bdev;
802         md->block_size = block_size;
803
804         md->writesets[0].md.root = INVALID_WRITESET_ROOT;
805         md->writesets[1].md.root = INVALID_WRITESET_ROOT;
806         md->current_writeset = &md->writesets[0];
807
808         r = create_persistent_data_objects(md, may_format);
809         if (r) {
810                 kfree(md);
811                 return ERR_PTR(r);
812         }
813
814         return md;
815 }
816
817 static void metadata_close(struct era_metadata *md)
818 {
819         writeset_free(&md->writesets[0]);
820         writeset_free(&md->writesets[1]);
821         destroy_persistent_data_objects(md);
822         kfree(md);
823 }
824
825 static bool valid_nr_blocks(dm_block_t n)
826 {
827         /*
828          * dm_bitset restricts us to 2^32.  test_bit & co. restrict us
829          * further to 2^31 - 1
830          */
831         return n < (1ull << 31);
832 }
833
834 static int metadata_resize(struct era_metadata *md, void *arg)
835 {
836         int r;
837         dm_block_t *new_size = arg;
838         __le32 value;
839
840         if (!valid_nr_blocks(*new_size)) {
841                 DMERR("Invalid number of origin blocks %llu",
842                       (unsigned long long) *new_size);
843                 return -EINVAL;
844         }
845
846         writeset_free(&md->writesets[0]);
847         writeset_free(&md->writesets[1]);
848
849         r = writeset_alloc(&md->writesets[0], *new_size);
850         if (r) {
851                 DMERR("%s: writeset_alloc failed for writeset 0", __func__);
852                 return r;
853         }
854
855         r = writeset_alloc(&md->writesets[1], *new_size);
856         if (r) {
857                 DMERR("%s: writeset_alloc failed for writeset 1", __func__);
858                 writeset_free(&md->writesets[0]);
859                 return r;
860         }
861
862         value = cpu_to_le32(0u);
863         __dm_bless_for_disk(&value);
864         r = dm_array_resize(&md->era_array_info, md->era_array_root,
865                             md->nr_blocks, *new_size,
866                             &value, &md->era_array_root);
867         if (r) {
868                 DMERR("%s: dm_array_resize failed", __func__);
869                 writeset_free(&md->writesets[0]);
870                 writeset_free(&md->writesets[1]);
871                 return r;
872         }
873
874         md->nr_blocks = *new_size;
875         return 0;
876 }
877
878 static int metadata_era_archive(struct era_metadata *md)
879 {
880         int r;
881         uint64_t keys[1];
882         struct writeset_disk value;
883
884         r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
885                             &md->current_writeset->md.root);
886         if (r) {
887                 DMERR("%s: dm_bitset_flush failed", __func__);
888                 return r;
889         }
890
891         ws_pack(&md->current_writeset->md, &value);
892
893         keys[0] = md->current_era;
894         __dm_bless_for_disk(&value);
895         r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root,
896                             keys, &value, &md->writeset_tree_root);
897         if (r) {
898                 DMERR("%s: couldn't insert writeset into btree", __func__);
899                 /* FIXME: fail mode */
900                 return r;
901         }
902
903         md->current_writeset->md.root = INVALID_WRITESET_ROOT;
904         md->archived_writesets = true;
905
906         return 0;
907 }
908
909 static struct writeset *next_writeset(struct era_metadata *md)
910 {
911         return (md->current_writeset == &md->writesets[0]) ?
912                 &md->writesets[1] : &md->writesets[0];
913 }
914
915 static int metadata_new_era(struct era_metadata *md)
916 {
917         int r;
918         struct writeset *new_writeset = next_writeset(md);
919
920         r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks);
921         if (r) {
922                 DMERR("%s: writeset_init failed", __func__);
923                 return r;
924         }
925
926         swap_writeset(md, new_writeset);
927         md->current_era++;
928
929         return 0;
930 }
931
932 static int metadata_era_rollover(struct era_metadata *md)
933 {
934         int r;
935
936         if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
937                 r = metadata_era_archive(md);
938                 if (r) {
939                         DMERR("%s: metadata_archive_era failed", __func__);
940                         /* FIXME: fail mode? */
941                         return r;
942                 }
943         }
944
945         r = metadata_new_era(md);
946         if (r) {
947                 DMERR("%s: new era failed", __func__);
948                 /* FIXME: fail mode */
949                 return r;
950         }
951
952         return 0;
953 }
954
955 static bool metadata_current_marked(struct era_metadata *md, dm_block_t block)
956 {
957         bool r;
958         struct writeset *ws;
959
960         rcu_read_lock();
961         ws = rcu_dereference(md->current_writeset);
962         r = writeset_marked(ws, block);
963         rcu_read_unlock();
964
965         return r;
966 }
967
968 static int metadata_commit(struct era_metadata *md)
969 {
970         int r;
971         struct dm_block *sblock;
972
973         if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
974                 r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
975                                     &md->current_writeset->md.root);
976                 if (r) {
977                         DMERR("%s: bitset flush failed", __func__);
978                         return r;
979                 }
980         }
981
982         r = dm_tm_pre_commit(md->tm);
983         if (r) {
984                 DMERR("%s: pre commit failed", __func__);
985                 return r;
986         }
987
988         r = save_sm_root(md);
989         if (r) {
990                 DMERR("%s: save_sm_root failed", __func__);
991                 return r;
992         }
993
994         r = superblock_lock(md, &sblock);
995         if (r) {
996                 DMERR("%s: superblock lock failed", __func__);
997                 return r;
998         }
999
1000         prepare_superblock(md, dm_block_data(sblock));
1001
1002         return dm_tm_commit(md->tm, sblock);
1003 }
1004
/*
 * Handle a checkpoint request.
 *
 * Currently this is an unconditional era rollover; the intention is to
 * skip the rollover later on when the filter is still pretty fresh.
 */
static int metadata_checkpoint(struct era_metadata *md)
{
	return metadata_era_rollover(md);
}
1013
1014 /*
1015  * Metadata snapshots allow userland to access era data.
1016  */
1017 static int metadata_take_snap(struct era_metadata *md)
1018 {
1019         int r, inc;
1020         struct dm_block *clone;
1021
1022         if (md->metadata_snap != SUPERBLOCK_LOCATION) {
1023                 DMERR("%s: metadata snapshot already exists", __func__);
1024                 return -EINVAL;
1025         }
1026
1027         r = metadata_era_rollover(md);
1028         if (r) {
1029                 DMERR("%s: era rollover failed", __func__);
1030                 return r;
1031         }
1032
1033         r = metadata_commit(md);
1034         if (r) {
1035                 DMERR("%s: pre commit failed", __func__);
1036                 return r;
1037         }
1038
1039         r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION);
1040         if (r) {
1041                 DMERR("%s: couldn't increment superblock", __func__);
1042                 return r;
1043         }
1044
1045         r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION,
1046                                &sb_validator, &clone, &inc);
1047         if (r) {
1048                 DMERR("%s: couldn't shadow superblock", __func__);
1049                 dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION);
1050                 return r;
1051         }
1052         BUG_ON(!inc);
1053
1054         r = dm_sm_inc_block(md->sm, md->writeset_tree_root);
1055         if (r) {
1056                 DMERR("%s: couldn't inc writeset tree root", __func__);
1057                 dm_tm_unlock(md->tm, clone);
1058                 return r;
1059         }
1060
1061         r = dm_sm_inc_block(md->sm, md->era_array_root);
1062         if (r) {
1063                 DMERR("%s: couldn't inc era tree root", __func__);
1064                 dm_sm_dec_block(md->sm, md->writeset_tree_root);
1065                 dm_tm_unlock(md->tm, clone);
1066                 return r;
1067         }
1068
1069         md->metadata_snap = dm_block_location(clone);
1070
1071         dm_tm_unlock(md->tm, clone);
1072
1073         return 0;
1074 }
1075
1076 static int metadata_drop_snap(struct era_metadata *md)
1077 {
1078         int r;
1079         dm_block_t location;
1080         struct dm_block *clone;
1081         struct superblock_disk *disk;
1082
1083         if (md->metadata_snap == SUPERBLOCK_LOCATION) {
1084                 DMERR("%s: no snap to drop", __func__);
1085                 return -EINVAL;
1086         }
1087
1088         r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone);
1089         if (r) {
1090                 DMERR("%s: couldn't read lock superblock clone", __func__);
1091                 return r;
1092         }
1093
1094         /*
1095          * Whatever happens now we'll commit with no record of the metadata
1096          * snap.
1097          */
1098         md->metadata_snap = SUPERBLOCK_LOCATION;
1099
1100         disk = dm_block_data(clone);
1101         r = dm_btree_del(&md->writeset_tree_info,
1102                          le64_to_cpu(disk->writeset_tree_root));
1103         if (r) {
1104                 DMERR("%s: error deleting writeset tree clone", __func__);
1105                 dm_tm_unlock(md->tm, clone);
1106                 return r;
1107         }
1108
1109         r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root));
1110         if (r) {
1111                 DMERR("%s: error deleting era array clone", __func__);
1112                 dm_tm_unlock(md->tm, clone);
1113                 return r;
1114         }
1115
1116         location = dm_block_location(clone);
1117         dm_tm_unlock(md->tm, clone);
1118
1119         return dm_sm_dec_block(md->sm, location);
1120 }
1121
1122 struct metadata_stats {
1123         dm_block_t used;
1124         dm_block_t total;
1125         dm_block_t snap;
1126         uint32_t era;
1127 };
1128
1129 static int metadata_get_stats(struct era_metadata *md, void *ptr)
1130 {
1131         int r;
1132         struct metadata_stats *s = ptr;
1133         dm_block_t nr_free, nr_total;
1134
1135         r = dm_sm_get_nr_free(md->sm, &nr_free);
1136         if (r) {
1137                 DMERR("dm_sm_get_nr_free returned %d", r);
1138                 return r;
1139         }
1140
1141         r = dm_sm_get_nr_blocks(md->sm, &nr_total);
1142         if (r) {
1143                 DMERR("dm_pool_get_metadata_dev_size returned %d", r);
1144                 return r;
1145         }
1146
1147         s->used = nr_total - nr_free;
1148         s->total = nr_total;
1149         s->snap = md->metadata_snap;
1150         s->era = md->current_era;
1151
1152         return 0;
1153 }
1154
1155 /*----------------------------------------------------------------*/
1156
1157 struct era {
1158         struct dm_target *ti;
1159
1160         struct dm_dev *metadata_dev;
1161         struct dm_dev *origin_dev;
1162
1163         dm_block_t nr_blocks;
1164         uint32_t sectors_per_block;
1165         int sectors_per_block_shift;
1166         struct era_metadata *md;
1167
1168         struct workqueue_struct *wq;
1169         struct work_struct worker;
1170
1171         spinlock_t deferred_lock;
1172         struct bio_list deferred_bios;
1173
1174         spinlock_t rpc_lock;
1175         struct list_head rpc_calls;
1176
1177         struct digest digest;
1178         atomic_t suspended;
1179 };
1180
1181 struct rpc {
1182         struct list_head list;
1183
1184         int (*fn0)(struct era_metadata *);
1185         int (*fn1)(struct era_metadata *, void *);
1186         void *arg;
1187         int result;
1188
1189         struct completion complete;
1190 };
1191
1192 /*----------------------------------------------------------------
1193  * Remapping.
1194  *---------------------------------------------------------------*/
1195 static bool block_size_is_power_of_two(struct era *era)
1196 {
1197         return era->sectors_per_block_shift >= 0;
1198 }
1199
1200 static dm_block_t get_block(struct era *era, struct bio *bio)
1201 {
1202         sector_t block_nr = bio->bi_iter.bi_sector;
1203
1204         if (!block_size_is_power_of_two(era))
1205                 (void) sector_div(block_nr, era->sectors_per_block);
1206         else
1207                 block_nr >>= era->sectors_per_block_shift;
1208
1209         return block_nr;
1210 }
1211
1212 static void remap_to_origin(struct era *era, struct bio *bio)
1213 {
1214         bio_set_dev(bio, era->origin_dev->bdev);
1215 }
1216
1217 /*----------------------------------------------------------------
1218  * Worker thread
1219  *--------------------------------------------------------------*/
1220 static void wake_worker(struct era *era)
1221 {
1222         if (!atomic_read(&era->suspended))
1223                 queue_work(era->wq, &era->worker);
1224 }
1225
1226 static void process_old_eras(struct era *era)
1227 {
1228         int r;
1229
1230         if (!era->digest.step)
1231                 return;
1232
1233         r = era->digest.step(era->md, &era->digest);
1234         if (r < 0) {
1235                 DMERR("%s: digest step failed, stopping digestion", __func__);
1236                 era->digest.step = NULL;
1237
1238         } else if (era->digest.step)
1239                 wake_worker(era);
1240 }
1241
1242 static void process_deferred_bios(struct era *era)
1243 {
1244         int r;
1245         struct bio_list deferred_bios, marked_bios;
1246         struct bio *bio;
1247         struct blk_plug plug;
1248         bool commit_needed = false;
1249         bool failed = false;
1250         struct writeset *ws = era->md->current_writeset;
1251
1252         bio_list_init(&deferred_bios);
1253         bio_list_init(&marked_bios);
1254
1255         spin_lock(&era->deferred_lock);
1256         bio_list_merge(&deferred_bios, &era->deferred_bios);
1257         bio_list_init(&era->deferred_bios);
1258         spin_unlock(&era->deferred_lock);
1259
1260         if (bio_list_empty(&deferred_bios))
1261                 return;
1262
1263         while ((bio = bio_list_pop(&deferred_bios))) {
1264                 r = writeset_test_and_set(&era->md->bitset_info, ws,
1265                                           get_block(era, bio));
1266                 if (r < 0) {
1267                         /*
1268                          * This is bad news, we need to rollback.
1269                          * FIXME: finish.
1270                          */
1271                         failed = true;
1272                 } else if (r == 0)
1273                         commit_needed = true;
1274
1275                 bio_list_add(&marked_bios, bio);
1276         }
1277
1278         if (commit_needed) {
1279                 r = metadata_commit(era->md);
1280                 if (r)
1281                         failed = true;
1282         }
1283
1284         if (failed)
1285                 while ((bio = bio_list_pop(&marked_bios)))
1286                         bio_io_error(bio);
1287         else {
1288                 blk_start_plug(&plug);
1289                 while ((bio = bio_list_pop(&marked_bios))) {
1290                         /*
1291                          * Only update the in-core writeset if the on-disk one
1292                          * was updated too.
1293                          */
1294                         if (commit_needed)
1295                                 set_bit(get_block(era, bio), ws->bits);
1296                         submit_bio_noacct(bio);
1297                 }
1298                 blk_finish_plug(&plug);
1299         }
1300 }
1301
1302 static void process_rpc_calls(struct era *era)
1303 {
1304         int r;
1305         bool need_commit = false;
1306         struct list_head calls;
1307         struct rpc *rpc, *tmp;
1308
1309         INIT_LIST_HEAD(&calls);
1310         spin_lock(&era->rpc_lock);
1311         list_splice_init(&era->rpc_calls, &calls);
1312         spin_unlock(&era->rpc_lock);
1313
1314         list_for_each_entry_safe(rpc, tmp, &calls, list) {
1315                 rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg);
1316                 need_commit = true;
1317         }
1318
1319         if (need_commit) {
1320                 r = metadata_commit(era->md);
1321                 if (r)
1322                         list_for_each_entry_safe(rpc, tmp, &calls, list)
1323                                 rpc->result = r;
1324         }
1325
1326         list_for_each_entry_safe(rpc, tmp, &calls, list)
1327                 complete(&rpc->complete);
1328 }
1329
1330 static void kick_off_digest(struct era *era)
1331 {
1332         if (era->md->archived_writesets) {
1333                 era->md->archived_writesets = false;
1334                 metadata_digest_start(era->md, &era->digest);
1335         }
1336 }
1337
1338 static void do_work(struct work_struct *ws)
1339 {
1340         struct era *era = container_of(ws, struct era, worker);
1341
1342         kick_off_digest(era);
1343         process_old_eras(era);
1344         process_deferred_bios(era);
1345         process_rpc_calls(era);
1346 }
1347
1348 static void defer_bio(struct era *era, struct bio *bio)
1349 {
1350         spin_lock(&era->deferred_lock);
1351         bio_list_add(&era->deferred_bios, bio);
1352         spin_unlock(&era->deferred_lock);
1353
1354         wake_worker(era);
1355 }
1356
1357 /*
1358  * Make an rpc call to the worker to change the metadata.
1359  */
1360 static int perform_rpc(struct era *era, struct rpc *rpc)
1361 {
1362         rpc->result = 0;
1363         init_completion(&rpc->complete);
1364
1365         spin_lock(&era->rpc_lock);
1366         list_add(&rpc->list, &era->rpc_calls);
1367         spin_unlock(&era->rpc_lock);
1368
1369         wake_worker(era);
1370         wait_for_completion(&rpc->complete);
1371
1372         return rpc->result;
1373 }
1374
1375 static int in_worker0(struct era *era, int (*fn)(struct era_metadata *))
1376 {
1377         struct rpc rpc;
1378         rpc.fn0 = fn;
1379         rpc.fn1 = NULL;
1380
1381         return perform_rpc(era, &rpc);
1382 }
1383
1384 static int in_worker1(struct era *era,
1385                       int (*fn)(struct era_metadata *, void *), void *arg)
1386 {
1387         struct rpc rpc;
1388         rpc.fn0 = NULL;
1389         rpc.fn1 = fn;
1390         rpc.arg = arg;
1391
1392         return perform_rpc(era, &rpc);
1393 }
1394
1395 static void start_worker(struct era *era)
1396 {
1397         atomic_set(&era->suspended, 0);
1398 }
1399
1400 static void stop_worker(struct era *era)
1401 {
1402         atomic_set(&era->suspended, 1);
1403         drain_workqueue(era->wq);
1404 }
1405
1406 /*----------------------------------------------------------------
1407  * Target methods
1408  *--------------------------------------------------------------*/
1409 static void era_destroy(struct era *era)
1410 {
1411         if (era->md)
1412                 metadata_close(era->md);
1413
1414         if (era->wq)
1415                 destroy_workqueue(era->wq);
1416
1417         if (era->origin_dev)
1418                 dm_put_device(era->ti, era->origin_dev);
1419
1420         if (era->metadata_dev)
1421                 dm_put_device(era->ti, era->metadata_dev);
1422
1423         kfree(era);
1424 }
1425
1426 static dm_block_t calc_nr_blocks(struct era *era)
1427 {
1428         return dm_sector_div_up(era->ti->len, era->sectors_per_block);
1429 }
1430
1431 static bool valid_block_size(dm_block_t block_size)
1432 {
1433         bool greater_than_zero = block_size > 0;
1434         bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0;
1435
1436         return greater_than_zero && multiple_of_min_block_size;
1437 }
1438
1439 /*
1440  * <metadata dev> <data dev> <data block size (sectors)>
1441  */
1442 static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
1443 {
1444         int r;
1445         char dummy;
1446         struct era *era;
1447         struct era_metadata *md;
1448
1449         if (argc != 3) {
1450                 ti->error = "Invalid argument count";
1451                 return -EINVAL;
1452         }
1453
1454         era = kzalloc(sizeof(*era), GFP_KERNEL);
1455         if (!era) {
1456                 ti->error = "Error allocating era structure";
1457                 return -ENOMEM;
1458         }
1459
1460         era->ti = ti;
1461
1462         r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
1463         if (r) {
1464                 ti->error = "Error opening metadata device";
1465                 era_destroy(era);
1466                 return -EINVAL;
1467         }
1468
1469         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
1470         if (r) {
1471                 ti->error = "Error opening data device";
1472                 era_destroy(era);
1473                 return -EINVAL;
1474         }
1475
1476         r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy);
1477         if (r != 1) {
1478                 ti->error = "Error parsing block size";
1479                 era_destroy(era);
1480                 return -EINVAL;
1481         }
1482
1483         r = dm_set_target_max_io_len(ti, era->sectors_per_block);
1484         if (r) {
1485                 ti->error = "could not set max io len";
1486                 era_destroy(era);
1487                 return -EINVAL;
1488         }
1489
1490         if (!valid_block_size(era->sectors_per_block)) {
1491                 ti->error = "Invalid block size";
1492                 era_destroy(era);
1493                 return -EINVAL;
1494         }
1495         if (era->sectors_per_block & (era->sectors_per_block - 1))
1496                 era->sectors_per_block_shift = -1;
1497         else
1498                 era->sectors_per_block_shift = __ffs(era->sectors_per_block);
1499
1500         md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true);
1501         if (IS_ERR(md)) {
1502                 ti->error = "Error reading metadata";
1503                 era_destroy(era);
1504                 return PTR_ERR(md);
1505         }
1506         era->md = md;
1507
1508         era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1509         if (!era->wq) {
1510                 ti->error = "could not create workqueue for metadata object";
1511                 era_destroy(era);
1512                 return -ENOMEM;
1513         }
1514         INIT_WORK(&era->worker, do_work);
1515
1516         spin_lock_init(&era->deferred_lock);
1517         bio_list_init(&era->deferred_bios);
1518
1519         spin_lock_init(&era->rpc_lock);
1520         INIT_LIST_HEAD(&era->rpc_calls);
1521
1522         ti->private = era;
1523         ti->num_flush_bios = 1;
1524         ti->flush_supported = true;
1525
1526         ti->num_discard_bios = 1;
1527
1528         return 0;
1529 }
1530
1531 static void era_dtr(struct dm_target *ti)
1532 {
1533         era_destroy(ti->private);
1534 }
1535
1536 static int era_map(struct dm_target *ti, struct bio *bio)
1537 {
1538         struct era *era = ti->private;
1539         dm_block_t block = get_block(era, bio);
1540
1541         /*
1542          * All bios get remapped to the origin device.  We do this now, but
1543          * it may not get issued until later.  Depending on whether the
1544          * block is marked in this era.
1545          */
1546         remap_to_origin(era, bio);
1547
1548         /*
1549          * REQ_PREFLUSH bios carry no data, so we're not interested in them.
1550          */
1551         if (!(bio->bi_opf & REQ_PREFLUSH) &&
1552             (bio_data_dir(bio) == WRITE) &&
1553             !metadata_current_marked(era->md, block)) {
1554                 defer_bio(era, bio);
1555                 return DM_MAPIO_SUBMITTED;
1556         }
1557
1558         return DM_MAPIO_REMAPPED;
1559 }
1560
1561 static void era_postsuspend(struct dm_target *ti)
1562 {
1563         int r;
1564         struct era *era = ti->private;
1565
1566         r = in_worker0(era, metadata_era_archive);
1567         if (r) {
1568                 DMERR("%s: couldn't archive current era", __func__);
1569                 /* FIXME: fail mode */
1570         }
1571
1572         stop_worker(era);
1573
1574         r = metadata_commit(era->md);
1575         if (r) {
1576                 DMERR("%s: metadata_commit failed", __func__);
1577                 /* FIXME: fail mode */
1578         }
1579 }
1580
1581 static int era_preresume(struct dm_target *ti)
1582 {
1583         int r;
1584         struct era *era = ti->private;
1585         dm_block_t new_size = calc_nr_blocks(era);
1586
1587         if (era->nr_blocks != new_size) {
1588                 r = metadata_resize(era->md, &new_size);
1589                 if (r) {
1590                         DMERR("%s: metadata_resize failed", __func__);
1591                         return r;
1592                 }
1593
1594                 r = metadata_commit(era->md);
1595                 if (r) {
1596                         DMERR("%s: metadata_commit failed", __func__);
1597                         return r;
1598                 }
1599
1600                 era->nr_blocks = new_size;
1601         }
1602
1603         start_worker(era);
1604
1605         r = in_worker0(era, metadata_era_rollover);
1606         if (r) {
1607                 DMERR("%s: metadata_era_rollover failed", __func__);
1608                 return r;
1609         }
1610
1611         return 0;
1612 }
1613
1614 /*
1615  * Status format:
1616  *
1617  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1618  * <current era> <held metadata root | '-'>
1619  */
1620 static void era_status(struct dm_target *ti, status_type_t type,
1621                        unsigned status_flags, char *result, unsigned maxlen)
1622 {
1623         int r;
1624         struct era *era = ti->private;
1625         ssize_t sz = 0;
1626         struct metadata_stats stats;
1627         char buf[BDEVNAME_SIZE];
1628
1629         switch (type) {
1630         case STATUSTYPE_INFO:
1631                 r = in_worker1(era, metadata_get_stats, &stats);
1632                 if (r)
1633                         goto err;
1634
1635                 DMEMIT("%u %llu/%llu %u",
1636                        (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
1637                        (unsigned long long) stats.used,
1638                        (unsigned long long) stats.total,
1639                        (unsigned) stats.era);
1640
1641                 if (stats.snap != SUPERBLOCK_LOCATION)
1642                         DMEMIT(" %llu", stats.snap);
1643                 else
1644                         DMEMIT(" -");
1645                 break;
1646
1647         case STATUSTYPE_TABLE:
1648                 format_dev_t(buf, era->metadata_dev->bdev->bd_dev);
1649                 DMEMIT("%s ", buf);
1650                 format_dev_t(buf, era->origin_dev->bdev->bd_dev);
1651                 DMEMIT("%s %u", buf, era->sectors_per_block);
1652                 break;
1653
1654         case STATUSTYPE_IMA:
1655                 *result = '\0';
1656                 break;
1657         }
1658
1659         return;
1660
1661 err:
1662         DMEMIT("Error");
1663 }
1664
1665 static int era_message(struct dm_target *ti, unsigned argc, char **argv,
1666                        char *result, unsigned maxlen)
1667 {
1668         struct era *era = ti->private;
1669
1670         if (argc != 1) {
1671                 DMERR("incorrect number of message arguments");
1672                 return -EINVAL;
1673         }
1674
1675         if (!strcasecmp(argv[0], "checkpoint"))
1676                 return in_worker0(era, metadata_checkpoint);
1677
1678         if (!strcasecmp(argv[0], "take_metadata_snap"))
1679                 return in_worker0(era, metadata_take_snap);
1680
1681         if (!strcasecmp(argv[0], "drop_metadata_snap"))
1682                 return in_worker0(era, metadata_drop_snap);
1683
1684         DMERR("unsupported message '%s'", argv[0]);
1685         return -EINVAL;
1686 }
1687
1688 static sector_t get_dev_size(struct dm_dev *dev)
1689 {
1690         return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1691 }
1692
1693 static int era_iterate_devices(struct dm_target *ti,
1694                                iterate_devices_callout_fn fn, void *data)
1695 {
1696         struct era *era = ti->private;
1697         return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
1698 }
1699
1700 static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
1701 {
1702         struct era *era = ti->private;
1703         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
1704
1705         /*
1706          * If the system-determined stacked limits are compatible with the
1707          * era device's blocksize (io_opt is a factor) do not override them.
1708          */
1709         if (io_opt_sectors < era->sectors_per_block ||
1710             do_div(io_opt_sectors, era->sectors_per_block)) {
1711                 blk_limits_io_min(limits, 0);
1712                 blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
1713         }
1714 }
1715
1716 /*----------------------------------------------------------------*/
1717
1718 static struct target_type era_target = {
1719         .name = "era",
1720         .version = {1, 0, 0},
1721         .module = THIS_MODULE,
1722         .ctr = era_ctr,
1723         .dtr = era_dtr,
1724         .map = era_map,
1725         .postsuspend = era_postsuspend,
1726         .preresume = era_preresume,
1727         .status = era_status,
1728         .message = era_message,
1729         .iterate_devices = era_iterate_devices,
1730         .io_hints = era_io_hints
1731 };
1732
1733 static int __init dm_era_init(void)
1734 {
1735         int r;
1736
1737         r = dm_register_target(&era_target);
1738         if (r) {
1739                 DMERR("era target registration failed: %d", r);
1740                 return r;
1741         }
1742
1743         return 0;
1744 }
1745
1746 static void __exit dm_era_exit(void)
1747 {
1748         dm_unregister_target(&era_target);
1749 }
1750
1751 module_init(dm_era_init);
1752 module_exit(dm_era_exit);
1753
1754 MODULE_DESCRIPTION(DM_NAME " era target");
1755 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
1756 MODULE_LICENSE("GPL");