Merge tag 'v6.3-p2' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
[platform/kernel/linux-rpi.git] / drivers / md / dm-era-target.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "dm.h"
3 #include "persistent-data/dm-transaction-manager.h"
4 #include "persistent-data/dm-bitset.h"
5 #include "persistent-data/dm-space-map.h"
6
7 #include <linux/dm-io.h>
8 #include <linux/dm-kcopyd.h>
9 #include <linux/init.h>
10 #include <linux/mempool.h>
11 #include <linux/module.h>
12 #include <linux/slab.h>
13 #include <linux/vmalloc.h>
14
/* NOTE(review): presumably the prefix the DM logging macros (DMERR) prepend — confirm. */
#define DM_MSG_PREFIX "era"

/* Block number used whenever the superblock is locked for reading or writing. */
#define SUPERBLOCK_LOCATION 0
/* Value written to, and required in, the superblock's 'magic' field. */
#define SUPERBLOCK_MAGIC 2126579579
/* Constant passed as the last argument of dm_bm_checksum() for the superblock. */
#define SUPERBLOCK_CSUM_XOR 146538381
/* Inclusive range of metadata versions accepted by check_metadata_version(). */
#define MIN_ERA_VERSION 1
#define MAX_ERA_VERSION 1
/* Sentinel writeset root meaning "no on-disk bitset"; aliases the superblock block. */
#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
/* NOTE(review): not referenced in this part of the file; presumably a minimum data block size — confirm units. */
#define MIN_BLOCK_SIZE 8
24
25 /*
26  *--------------------------------------------------------------
27  * Writeset
28  *--------------------------------------------------------------
29  */
/*
 * In-core description of a writeset's on-disk bitset.  Converted to and
 * from struct writeset_disk by ws_pack() and ws_unpack().
 */
struct writeset_metadata {
        uint32_t nr_bits;       /* number of bits in the bitset */
        dm_block_t root;        /* root of the on-disk bitset, or INVALID_WRITESET_ROOT */
};
34
/*
 * A writeset: the on-disk bitset description plus an in-core bitmap.
 */
struct writeset {
        /* Size and root of the on-disk bitset. */
        struct writeset_metadata md;

        /*
         * An in core copy of the bits to save constantly doing look ups on
         * disk.  Allocated by writeset_alloc() and released by
         * writeset_free(), which leaves the pointer NULL.
         */
        unsigned long *bits;
};
44
45 /*
46  * This does not free off the on disk bitset as this will normally be done
47  * after digesting into the era array.
48  */
49 static void writeset_free(struct writeset *ws)
50 {
51         vfree(ws->bits);
52         ws->bits = NULL;
53 }
54
55 static int setup_on_disk_bitset(struct dm_disk_bitset *info,
56                                 unsigned int nr_bits, dm_block_t *root)
57 {
58         int r;
59
60         r = dm_bitset_empty(info, root);
61         if (r)
62                 return r;
63
64         return dm_bitset_resize(info, *root, 0, nr_bits, false, root);
65 }
66
67 static size_t bitset_size(unsigned int nr_bits)
68 {
69         return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG);
70 }
71
72 /*
73  * Allocates memory for the in core bitset.
74  */
75 static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
76 {
77         ws->bits = vzalloc(bitset_size(nr_blocks));
78         if (!ws->bits) {
79                 DMERR("%s: couldn't allocate in memory bitset", __func__);
80                 return -ENOMEM;
81         }
82
83         return 0;
84 }
85
86 /*
87  * Wipes the in-core bitset, and creates a new on disk bitset.
88  */
89 static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws,
90                          dm_block_t nr_blocks)
91 {
92         int r;
93
94         memset(ws->bits, 0, bitset_size(nr_blocks));
95
96         ws->md.nr_bits = nr_blocks;
97         r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
98         if (r) {
99                 DMERR("%s: setup_on_disk_bitset failed", __func__);
100                 return r;
101         }
102
103         return 0;
104 }
105
106 static bool writeset_marked(struct writeset *ws, dm_block_t block)
107 {
108         return test_bit(block, ws->bits);
109 }
110
111 static int writeset_marked_on_disk(struct dm_disk_bitset *info,
112                                    struct writeset_metadata *m, dm_block_t block,
113                                    bool *result)
114 {
115         int r;
116         dm_block_t old = m->root;
117
118         /*
119          * The bitset was flushed when it was archived, so we know there'll
120          * be no change to the root.
121          */
122         r = dm_bitset_test_bit(info, m->root, block, &m->root, result);
123         if (r) {
124                 DMERR("%s: dm_bitset_test_bit failed", __func__);
125                 return r;
126         }
127
128         BUG_ON(m->root != old);
129
130         return r;
131 }
132
133 /*
134  * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was.
135  */
136 static int writeset_test_and_set(struct dm_disk_bitset *info,
137                                  struct writeset *ws, uint32_t block)
138 {
139         int r;
140
141         if (!test_bit(block, ws->bits)) {
142                 r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
143                 if (r) {
144                         /* FIXME: fail mode */
145                         return r;
146                 }
147
148                 return 0;
149         }
150
151         return 1;
152 }
153
154 /*
155  *--------------------------------------------------------------
156  * On disk metadata layout
157  *--------------------------------------------------------------
158  */
/* Size in bytes of the space map root buffer kept in the superblock and in era_metadata. */
#define SPACE_MAP_ROOT_SIZE 128
/* Size in bytes of the superblock's uuid field. */
#define UUID_LEN 16
161
/*
 * On-disk (little-endian, packed) form of struct writeset_metadata.  Used
 * as the value type of the writeset tree and embedded in the superblock.
 */
struct writeset_disk {
        __le32 nr_bits;         /* number of bits in the bitset */
        __le64 root;            /* root block of the on disk bitset */
} __packed;
166
/*
 * On-disk superblock layout (little-endian, packed).  Field order defines
 * the format and must not change.
 */
struct superblock_disk {
        /* Checksum of the block contents from 'flags' onwards. */
        __le32 csum;
        /* Written as 0 by prepare_superblock(). */
        __le32 flags;
        /* Location of this block; verified by sb_check(). */
        __le64 blocknr;

        /* Currently zeroed on every write (see prepare_superblock()). */
        __u8 uuid[UUID_LEN];
        /* Must equal SUPERBLOCK_MAGIC. */
        __le64 magic;
        /* Metadata version, checked by check_metadata_version(). */
        __le32 version;

        /* Copy of the metadata space map root. */
        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

        /* Must match the block size the metadata is opened with. */
        __le32 data_block_size;
        /* Written as DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT. */
        __le32 metadata_block_size;
        /* Saved copy of era_metadata.nr_blocks. */
        __le32 nr_blocks;

        /* Saved copy of era_metadata.current_era. */
        __le32 current_era;
        /* Packed metadata of the current writeset. */
        struct writeset_disk current_writeset;

        /*
         * Only these two fields are valid within the metadata snapshot.
         */
        __le64 writeset_tree_root;
        __le64 era_array_root;

        /* Saved copy of era_metadata.metadata_snap. */
        __le64 metadata_snap;
} __packed;
193
194 /*
195  *--------------------------------------------------------------
196  * Superblock validation
197  *--------------------------------------------------------------
198  */
199 static void sb_prepare_for_write(struct dm_block_validator *v,
200                                  struct dm_block *b,
201                                  size_t sb_block_size)
202 {
203         struct superblock_disk *disk = dm_block_data(b);
204
205         disk->blocknr = cpu_to_le64(dm_block_location(b));
206         disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
207                                                 sb_block_size - sizeof(__le32),
208                                                 SUPERBLOCK_CSUM_XOR));
209 }
210
211 static int check_metadata_version(struct superblock_disk *disk)
212 {
213         uint32_t metadata_version = le32_to_cpu(disk->version);
214
215         if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) {
216                 DMERR("Era metadata version %u found, but only versions between %u and %u supported.",
217                       metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION);
218                 return -EINVAL;
219         }
220
221         return 0;
222 }
223
224 static int sb_check(struct dm_block_validator *v,
225                     struct dm_block *b,
226                     size_t sb_block_size)
227 {
228         struct superblock_disk *disk = dm_block_data(b);
229         __le32 csum_le;
230
231         if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) {
232                 DMERR("%s failed: blocknr %llu: wanted %llu",
233                       __func__, le64_to_cpu(disk->blocknr),
234                       (unsigned long long)dm_block_location(b));
235                 return -ENOTBLK;
236         }
237
238         if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) {
239                 DMERR("%s failed: magic %llu: wanted %llu",
240                       __func__, le64_to_cpu(disk->magic),
241                       (unsigned long long) SUPERBLOCK_MAGIC);
242                 return -EILSEQ;
243         }
244
245         csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags,
246                                              sb_block_size - sizeof(__le32),
247                                              SUPERBLOCK_CSUM_XOR));
248         if (csum_le != disk->csum) {
249                 DMERR("%s failed: csum %u: wanted %u",
250                       __func__, le32_to_cpu(csum_le), le32_to_cpu(disk->csum));
251                 return -EILSEQ;
252         }
253
254         return check_metadata_version(disk);
255 }
256
257 static struct dm_block_validator sb_validator = {
258         .name = "superblock",
259         .prepare_for_write = sb_prepare_for_write,
260         .check = sb_check
261 };
262
263 /*
264  *--------------------------------------------------------------
265  * Low level metadata handling
266  *--------------------------------------------------------------
267  */
/* Block size handed to dm_block_manager_create(); shifted by SECTOR_SHIFT when stored in the superblock. */
#define DM_ERA_METADATA_BLOCK_SIZE 4096
/* Last argument to dm_block_manager_create(). */
#define ERA_MAX_CONCURRENT_LOCKS 5
270
/*
 * In-core state for one era metadata device.
 */
struct era_metadata {
        struct block_device *bdev;              /* device holding the metadata */
        struct dm_block_manager *bm;            /* block manager created on bdev */
        struct dm_space_map *sm;                /* metadata space map */
        struct dm_transaction_manager *tm;      /* transaction manager */

        dm_block_t block_size;                  /* data block size; compared with the superblock on open */
        uint32_t nr_blocks;                     /* current size of the era array */

        uint32_t current_era;                   /* incremented by metadata_new_era() */

        /*
         * We preallocate 2 writesets.  When an era rolls over we
         * switch between them. This means the allocation is done at
         * preresume time, rather than on the io path.
         */
        struct writeset writesets[2];
        /* Points at one of writesets[]; published via RCU in swap_writeset(). */
        struct writeset *current_writeset;

        dm_block_t writeset_tree_root;          /* root of the btree of archived writesets, keyed by era */
        dm_block_t era_array_root;              /* root of the era array */

        struct dm_disk_bitset bitset_info;
        struct dm_btree_info writeset_tree_info;
        struct dm_array_info era_array_info;

        /* Saved in the superblock; SUPERBLOCK_LOCATION means no snapshot exists. */
        dm_block_t metadata_snap;

        /*
         * A flag that is set whenever a writeset has been archived.
         */
        bool archived_writesets;

        /*
         * Reading the space map root can fail, so we read it into this
         * buffer before the superblock is locked and updated.
         */
        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
310
311 static int superblock_read_lock(struct era_metadata *md,
312                                 struct dm_block **sblock)
313 {
314         return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION,
315                                &sb_validator, sblock);
316 }
317
318 static int superblock_lock_zero(struct era_metadata *md,
319                                 struct dm_block **sblock)
320 {
321         return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION,
322                                      &sb_validator, sblock);
323 }
324
325 static int superblock_lock(struct era_metadata *md,
326                            struct dm_block **sblock)
327 {
328         return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION,
329                                 &sb_validator, sblock);
330 }
331
332 /* FIXME: duplication with cache and thin */
333 static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
334 {
335         int r;
336         unsigned int i;
337         struct dm_block *b;
338         __le64 *data_le, zero = cpu_to_le64(0);
339         unsigned int sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);
340
341         /*
342          * We can't use a validator here - it may be all zeroes.
343          */
344         r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b);
345         if (r)
346                 return r;
347
348         data_le = dm_block_data(b);
349         *result = true;
350         for (i = 0; i < sb_block_size; i++) {
351                 if (data_le[i] != zero) {
352                         *result = false;
353                         break;
354                 }
355         }
356
357         dm_bm_unlock(b);
358
359         return 0;
360 }
361
362 /*----------------------------------------------------------------*/
363
364 static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk)
365 {
366         disk->nr_bits = cpu_to_le32(core->nr_bits);
367         disk->root = cpu_to_le64(core->root);
368 }
369
370 static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core)
371 {
372         core->nr_bits = le32_to_cpu(disk->nr_bits);
373         core->root = le64_to_cpu(disk->root);
374 }
375
376 static void ws_inc(void *context, const void *value, unsigned int count)
377 {
378         struct era_metadata *md = context;
379         struct writeset_disk ws_d;
380         dm_block_t b;
381         unsigned int i;
382
383         for (i = 0; i < count; i++) {
384                 memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
385                 b = le64_to_cpu(ws_d.root);
386                 dm_tm_inc(md->tm, b);
387         }
388 }
389
390 static void ws_dec(void *context, const void *value, unsigned int count)
391 {
392         struct era_metadata *md = context;
393         struct writeset_disk ws_d;
394         dm_block_t b;
395         unsigned int i;
396
397         for (i = 0; i < count; i++) {
398                 memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
399                 b = le64_to_cpu(ws_d.root);
400                 dm_bitset_del(&md->bitset_info, b);
401         }
402 }
403
404 static int ws_eq(void *context, const void *value1, const void *value2)
405 {
406         return !memcmp(value1, value2, sizeof(struct writeset_disk));
407 }
408
409 /*----------------------------------------------------------------*/
410
411 static void setup_writeset_tree_info(struct era_metadata *md)
412 {
413         struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type;
414
415         md->writeset_tree_info.tm = md->tm;
416         md->writeset_tree_info.levels = 1;
417         vt->context = md;
418         vt->size = sizeof(struct writeset_disk);
419         vt->inc = ws_inc;
420         vt->dec = ws_dec;
421         vt->equal = ws_eq;
422 }
423
424 static void setup_era_array_info(struct era_metadata *md)
425 {
426         struct dm_btree_value_type vt;
427
428         vt.context = NULL;
429         vt.size = sizeof(__le32);
430         vt.inc = NULL;
431         vt.dec = NULL;
432         vt.equal = NULL;
433
434         dm_array_info_init(&md->era_array_info, md->tm, &vt);
435 }
436
437 static void setup_infos(struct era_metadata *md)
438 {
439         dm_disk_bitset_init(md->tm, &md->bitset_info);
440         setup_writeset_tree_info(md);
441         setup_era_array_info(md);
442 }
443
444 /*----------------------------------------------------------------*/
445
446 static int create_fresh_metadata(struct era_metadata *md)
447 {
448         int r;
449
450         r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION,
451                                  &md->tm, &md->sm);
452         if (r < 0) {
453                 DMERR("dm_tm_create_with_sm failed");
454                 return r;
455         }
456
457         setup_infos(md);
458
459         r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root);
460         if (r) {
461                 DMERR("couldn't create new writeset tree");
462                 goto bad;
463         }
464
465         r = dm_array_empty(&md->era_array_info, &md->era_array_root);
466         if (r) {
467                 DMERR("couldn't create era array");
468                 goto bad;
469         }
470
471         return 0;
472
473 bad:
474         dm_sm_destroy(md->sm);
475         dm_tm_destroy(md->tm);
476
477         return r;
478 }
479
480 static int save_sm_root(struct era_metadata *md)
481 {
482         int r;
483         size_t metadata_len;
484
485         r = dm_sm_root_size(md->sm, &metadata_len);
486         if (r < 0)
487                 return r;
488
489         return dm_sm_copy_root(md->sm, &md->metadata_space_map_root,
490                                metadata_len);
491 }
492
493 static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk)
494 {
495         memcpy(&disk->metadata_space_map_root,
496                &md->metadata_space_map_root,
497                sizeof(md->metadata_space_map_root));
498 }
499
500 /*
501  * Writes a superblock, including the static fields that don't get updated
502  * with every commit (possible optimisation here).  'md' should be fully
503  * constructed when this is called.
504  */
505 static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk)
506 {
507         disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
508         disk->flags = cpu_to_le32(0ul);
509
510         /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */
511         memset(disk->uuid, 0, sizeof(disk->uuid));
512         disk->version = cpu_to_le32(MAX_ERA_VERSION);
513
514         copy_sm_root(md, disk);
515
516         disk->data_block_size = cpu_to_le32(md->block_size);
517         disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
518         disk->nr_blocks = cpu_to_le32(md->nr_blocks);
519         disk->current_era = cpu_to_le32(md->current_era);
520
521         ws_pack(&md->current_writeset->md, &disk->current_writeset);
522         disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root);
523         disk->era_array_root = cpu_to_le64(md->era_array_root);
524         disk->metadata_snap = cpu_to_le64(md->metadata_snap);
525 }
526
527 static int write_superblock(struct era_metadata *md)
528 {
529         int r;
530         struct dm_block *sblock;
531         struct superblock_disk *disk;
532
533         r = save_sm_root(md);
534         if (r) {
535                 DMERR("%s: save_sm_root failed", __func__);
536                 return r;
537         }
538
539         r = superblock_lock_zero(md, &sblock);
540         if (r)
541                 return r;
542
543         disk = dm_block_data(sblock);
544         prepare_superblock(md, disk);
545
546         return dm_tm_commit(md->tm, sblock);
547 }
548
549 /*
550  * Assumes block_size and the infos are set.
551  */
552 static int format_metadata(struct era_metadata *md)
553 {
554         int r;
555
556         r = create_fresh_metadata(md);
557         if (r)
558                 return r;
559
560         r = write_superblock(md);
561         if (r) {
562                 dm_sm_destroy(md->sm);
563                 dm_tm_destroy(md->tm);
564                 return r;
565         }
566
567         return 0;
568 }
569
570 static int open_metadata(struct era_metadata *md)
571 {
572         int r;
573         struct dm_block *sblock;
574         struct superblock_disk *disk;
575
576         r = superblock_read_lock(md, &sblock);
577         if (r) {
578                 DMERR("couldn't read_lock superblock");
579                 return r;
580         }
581
582         disk = dm_block_data(sblock);
583
584         /* Verify the data block size hasn't changed */
585         if (le32_to_cpu(disk->data_block_size) != md->block_size) {
586                 DMERR("changing the data block size (from %u to %llu) is not supported",
587                       le32_to_cpu(disk->data_block_size), md->block_size);
588                 r = -EINVAL;
589                 goto bad;
590         }
591
592         r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
593                                disk->metadata_space_map_root,
594                                sizeof(disk->metadata_space_map_root),
595                                &md->tm, &md->sm);
596         if (r) {
597                 DMERR("dm_tm_open_with_sm failed");
598                 goto bad;
599         }
600
601         setup_infos(md);
602
603         md->nr_blocks = le32_to_cpu(disk->nr_blocks);
604         md->current_era = le32_to_cpu(disk->current_era);
605
606         ws_unpack(&disk->current_writeset, &md->current_writeset->md);
607         md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
608         md->era_array_root = le64_to_cpu(disk->era_array_root);
609         md->metadata_snap = le64_to_cpu(disk->metadata_snap);
610         md->archived_writesets = true;
611
612         dm_bm_unlock(sblock);
613
614         return 0;
615
616 bad:
617         dm_bm_unlock(sblock);
618         return r;
619 }
620
621 static int open_or_format_metadata(struct era_metadata *md,
622                                    bool may_format)
623 {
624         int r;
625         bool unformatted = false;
626
627         r = superblock_all_zeroes(md->bm, &unformatted);
628         if (r)
629                 return r;
630
631         if (unformatted)
632                 return may_format ? format_metadata(md) : -EPERM;
633
634         return open_metadata(md);
635 }
636
637 static int create_persistent_data_objects(struct era_metadata *md,
638                                           bool may_format)
639 {
640         int r;
641
642         md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
643                                          ERA_MAX_CONCURRENT_LOCKS);
644         if (IS_ERR(md->bm)) {
645                 DMERR("could not create block manager");
646                 return PTR_ERR(md->bm);
647         }
648
649         r = open_or_format_metadata(md, may_format);
650         if (r)
651                 dm_block_manager_destroy(md->bm);
652
653         return r;
654 }
655
/*
 * Destroys the space map, the transaction manager and the block manager,
 * in that order.
 */
static void destroy_persistent_data_objects(struct era_metadata *md)
{
        dm_sm_destroy(md->sm);
        dm_tm_destroy(md->tm);
        dm_block_manager_destroy(md->bm);
}
662
/*
 * Publishes 'new_writeset' as md->current_writeset and then blocks in
 * synchronize_rcu().
 *
 * This waits until all era_map threads have picked up the new filter.
 */
static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset)
{
        /* Readers such as metadata_current_marked() use rcu_dereference(). */
        rcu_assign_pointer(md->current_writeset, new_writeset);
        synchronize_rcu();
}
671
672 /*
673  *------------------------------------------------------------------------
674  * Writesets get 'digested' into the main era array.
675  *
676  * We're using a coroutine here so the worker thread can do the digestion,
677  * thus avoiding synchronisation of the metadata.  Digesting a whole
678  * writeset in one go would cause too much latency.
679  *------------------------------------------------------------------------
680  */
/*
 * State of an in-progress digestion, advanced one step at a time through
 * the 'step' callback.
 */
struct digest {
        /* Key (era) of the writeset being digested. */
        uint32_t era;
        /* Bits to transcribe (capped at md->nr_blocks) and the next bit to examine. */
        unsigned int nr_bits, current_bit;
        /* Unpacked metadata of the writeset being digested. */
        struct writeset_metadata writeset;
        /* The era in on-disk form; stored in the era array for each marked bit. */
        __le32 value;
        /* Separate bitset info, re-initialised for every writeset looked up. */
        struct dm_disk_bitset info;

        /* Next step to run; NULL when no digestion is in progress. */
        int (*step)(struct era_metadata *md, struct digest *d);
};
690
691 static int metadata_digest_lookup_writeset(struct era_metadata *md,
692                                            struct digest *d);
693
694 static int metadata_digest_remove_writeset(struct era_metadata *md,
695                                            struct digest *d)
696 {
697         int r;
698         uint64_t key = d->era;
699
700         r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root,
701                             &key, &md->writeset_tree_root);
702         if (r) {
703                 DMERR("%s: dm_btree_remove failed", __func__);
704                 return r;
705         }
706
707         d->step = metadata_digest_lookup_writeset;
708         return 0;
709 }
710
711 #define INSERTS_PER_STEP 100
712
713 static int metadata_digest_transcribe_writeset(struct era_metadata *md,
714                                                struct digest *d)
715 {
716         int r;
717         bool marked;
718         unsigned int b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits);
719
720         for (b = d->current_bit; b < e; b++) {
721                 r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked);
722                 if (r) {
723                         DMERR("%s: writeset_marked_on_disk failed", __func__);
724                         return r;
725                 }
726
727                 if (!marked)
728                         continue;
729
730                 __dm_bless_for_disk(&d->value);
731                 r = dm_array_set_value(&md->era_array_info, md->era_array_root,
732                                        b, &d->value, &md->era_array_root);
733                 if (r) {
734                         DMERR("%s: dm_array_set_value failed", __func__);
735                         return r;
736                 }
737         }
738
739         if (b == d->nr_bits)
740                 d->step = metadata_digest_remove_writeset;
741         else
742                 d->current_bit = b;
743
744         return 0;
745 }
746
747 static int metadata_digest_lookup_writeset(struct era_metadata *md,
748                                            struct digest *d)
749 {
750         int r;
751         uint64_t key;
752         struct writeset_disk disk;
753
754         r = dm_btree_find_lowest_key(&md->writeset_tree_info,
755                                      md->writeset_tree_root, &key);
756         if (r < 0)
757                 return r;
758
759         d->era = key;
760
761         r = dm_btree_lookup(&md->writeset_tree_info,
762                             md->writeset_tree_root, &key, &disk);
763         if (r) {
764                 if (r == -ENODATA) {
765                         d->step = NULL;
766                         return 0;
767                 }
768
769                 DMERR("%s: dm_btree_lookup failed", __func__);
770                 return r;
771         }
772
773         ws_unpack(&disk, &d->writeset);
774         d->value = cpu_to_le32(key);
775
776         /*
777          * We initialise another bitset info to avoid any caching side effects
778          * with the previous one.
779          */
780         dm_disk_bitset_init(md->tm, &d->info);
781
782         d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
783         d->current_bit = 0;
784         d->step = metadata_digest_transcribe_writeset;
785
786         return 0;
787 }
788
789 static int metadata_digest_start(struct era_metadata *md, struct digest *d)
790 {
791         if (d->step)
792                 return 0;
793
794         memset(d, 0, sizeof(*d));
795         d->step = metadata_digest_lookup_writeset;
796
797         return 0;
798 }
799
800 /*
801  *-----------------------------------------------------------------
802  * High level metadata interface.  Target methods should use these,
803  * and not the lower level ones.
804  *-----------------------------------------------------------------
805  */
806 static struct era_metadata *metadata_open(struct block_device *bdev,
807                                           sector_t block_size,
808                                           bool may_format)
809 {
810         int r;
811         struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL);
812
813         if (!md)
814                 return NULL;
815
816         md->bdev = bdev;
817         md->block_size = block_size;
818
819         md->writesets[0].md.root = INVALID_WRITESET_ROOT;
820         md->writesets[1].md.root = INVALID_WRITESET_ROOT;
821         md->current_writeset = &md->writesets[0];
822
823         r = create_persistent_data_objects(md, may_format);
824         if (r) {
825                 kfree(md);
826                 return ERR_PTR(r);
827         }
828
829         return md;
830 }
831
832 static void metadata_close(struct era_metadata *md)
833 {
834         writeset_free(&md->writesets[0]);
835         writeset_free(&md->writesets[1]);
836         destroy_persistent_data_objects(md);
837         kfree(md);
838 }
839
840 static bool valid_nr_blocks(dm_block_t n)
841 {
842         /*
843          * dm_bitset restricts us to 2^32.  test_bit & co. restrict us
844          * further to 2^31 - 1
845          */
846         return n < (1ull << 31);
847 }
848
849 static int metadata_resize(struct era_metadata *md, void *arg)
850 {
851         int r;
852         dm_block_t *new_size = arg;
853         __le32 value;
854
855         if (!valid_nr_blocks(*new_size)) {
856                 DMERR("Invalid number of origin blocks %llu",
857                       (unsigned long long) *new_size);
858                 return -EINVAL;
859         }
860
861         writeset_free(&md->writesets[0]);
862         writeset_free(&md->writesets[1]);
863
864         r = writeset_alloc(&md->writesets[0], *new_size);
865         if (r) {
866                 DMERR("%s: writeset_alloc failed for writeset 0", __func__);
867                 return r;
868         }
869
870         r = writeset_alloc(&md->writesets[1], *new_size);
871         if (r) {
872                 DMERR("%s: writeset_alloc failed for writeset 1", __func__);
873                 writeset_free(&md->writesets[0]);
874                 return r;
875         }
876
877         value = cpu_to_le32(0u);
878         __dm_bless_for_disk(&value);
879         r = dm_array_resize(&md->era_array_info, md->era_array_root,
880                             md->nr_blocks, *new_size,
881                             &value, &md->era_array_root);
882         if (r) {
883                 DMERR("%s: dm_array_resize failed", __func__);
884                 writeset_free(&md->writesets[0]);
885                 writeset_free(&md->writesets[1]);
886                 return r;
887         }
888
889         md->nr_blocks = *new_size;
890         return 0;
891 }
892
893 static int metadata_era_archive(struct era_metadata *md)
894 {
895         int r;
896         uint64_t keys[1];
897         struct writeset_disk value;
898
899         r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
900                             &md->current_writeset->md.root);
901         if (r) {
902                 DMERR("%s: dm_bitset_flush failed", __func__);
903                 return r;
904         }
905
906         ws_pack(&md->current_writeset->md, &value);
907
908         keys[0] = md->current_era;
909         __dm_bless_for_disk(&value);
910         r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root,
911                             keys, &value, &md->writeset_tree_root);
912         if (r) {
913                 DMERR("%s: couldn't insert writeset into btree", __func__);
914                 /* FIXME: fail mode */
915                 return r;
916         }
917
918         md->current_writeset->md.root = INVALID_WRITESET_ROOT;
919         md->archived_writesets = true;
920
921         return 0;
922 }
923
924 static struct writeset *next_writeset(struct era_metadata *md)
925 {
926         return (md->current_writeset == &md->writesets[0]) ?
927                 &md->writesets[1] : &md->writesets[0];
928 }
929
930 static int metadata_new_era(struct era_metadata *md)
931 {
932         int r;
933         struct writeset *new_writeset = next_writeset(md);
934
935         r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks);
936         if (r) {
937                 DMERR("%s: writeset_init failed", __func__);
938                 return r;
939         }
940
941         swap_writeset(md, new_writeset);
942         md->current_era++;
943
944         return 0;
945 }
946
947 static int metadata_era_rollover(struct era_metadata *md)
948 {
949         int r;
950
951         if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
952                 r = metadata_era_archive(md);
953                 if (r) {
954                         DMERR("%s: metadata_archive_era failed", __func__);
955                         /* FIXME: fail mode? */
956                         return r;
957                 }
958         }
959
960         r = metadata_new_era(md);
961         if (r) {
962                 DMERR("%s: new era failed", __func__);
963                 /* FIXME: fail mode */
964                 return r;
965         }
966
967         return 0;
968 }
969
970 static bool metadata_current_marked(struct era_metadata *md, dm_block_t block)
971 {
972         bool r;
973         struct writeset *ws;
974
975         rcu_read_lock();
976         ws = rcu_dereference(md->current_writeset);
977         r = writeset_marked(ws, block);
978         rcu_read_unlock();
979
980         return r;
981 }
982
983 static int metadata_commit(struct era_metadata *md)
984 {
985         int r;
986         struct dm_block *sblock;
987
988         if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
989                 r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
990                                     &md->current_writeset->md.root);
991                 if (r) {
992                         DMERR("%s: bitset flush failed", __func__);
993                         return r;
994                 }
995         }
996
997         r = dm_tm_pre_commit(md->tm);
998         if (r) {
999                 DMERR("%s: pre commit failed", __func__);
1000                 return r;
1001         }
1002
1003         r = save_sm_root(md);
1004         if (r) {
1005                 DMERR("%s: save_sm_root failed", __func__);
1006                 return r;
1007         }
1008
1009         r = superblock_lock(md, &sblock);
1010         if (r) {
1011                 DMERR("%s: superblock lock failed", __func__);
1012                 return r;
1013         }
1014
1015         prepare_superblock(md, dm_block_data(sblock));
1016
1017         return dm_tm_commit(md->tm, sblock);
1018 }
1019
/*
 * Backs the "checkpoint" target message: at present this is an
 * unconditional era rollover.
 */
static int metadata_checkpoint(struct era_metadata *md)
{
	/*
	 * Possible future refinement noted by the original author: skip
	 * the rollover when the current filter is still pretty fresh.
	 */
	return metadata_era_rollover(md);
}
1028
1029 /*
1030  * Metadata snapshots allow userland to access era data.
1031  */
1032 static int metadata_take_snap(struct era_metadata *md)
1033 {
1034         int r, inc;
1035         struct dm_block *clone;
1036
1037         if (md->metadata_snap != SUPERBLOCK_LOCATION) {
1038                 DMERR("%s: metadata snapshot already exists", __func__);
1039                 return -EINVAL;
1040         }
1041
1042         r = metadata_era_rollover(md);
1043         if (r) {
1044                 DMERR("%s: era rollover failed", __func__);
1045                 return r;
1046         }
1047
1048         r = metadata_commit(md);
1049         if (r) {
1050                 DMERR("%s: pre commit failed", __func__);
1051                 return r;
1052         }
1053
1054         r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION);
1055         if (r) {
1056                 DMERR("%s: couldn't increment superblock", __func__);
1057                 return r;
1058         }
1059
1060         r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION,
1061                                &sb_validator, &clone, &inc);
1062         if (r) {
1063                 DMERR("%s: couldn't shadow superblock", __func__);
1064                 dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION);
1065                 return r;
1066         }
1067         BUG_ON(!inc);
1068
1069         r = dm_sm_inc_block(md->sm, md->writeset_tree_root);
1070         if (r) {
1071                 DMERR("%s: couldn't inc writeset tree root", __func__);
1072                 dm_tm_unlock(md->tm, clone);
1073                 return r;
1074         }
1075
1076         r = dm_sm_inc_block(md->sm, md->era_array_root);
1077         if (r) {
1078                 DMERR("%s: couldn't inc era tree root", __func__);
1079                 dm_sm_dec_block(md->sm, md->writeset_tree_root);
1080                 dm_tm_unlock(md->tm, clone);
1081                 return r;
1082         }
1083
1084         md->metadata_snap = dm_block_location(clone);
1085
1086         dm_tm_unlock(md->tm, clone);
1087
1088         return 0;
1089 }
1090
1091 static int metadata_drop_snap(struct era_metadata *md)
1092 {
1093         int r;
1094         dm_block_t location;
1095         struct dm_block *clone;
1096         struct superblock_disk *disk;
1097
1098         if (md->metadata_snap == SUPERBLOCK_LOCATION) {
1099                 DMERR("%s: no snap to drop", __func__);
1100                 return -EINVAL;
1101         }
1102
1103         r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone);
1104         if (r) {
1105                 DMERR("%s: couldn't read lock superblock clone", __func__);
1106                 return r;
1107         }
1108
1109         /*
1110          * Whatever happens now we'll commit with no record of the metadata
1111          * snap.
1112          */
1113         md->metadata_snap = SUPERBLOCK_LOCATION;
1114
1115         disk = dm_block_data(clone);
1116         r = dm_btree_del(&md->writeset_tree_info,
1117                          le64_to_cpu(disk->writeset_tree_root));
1118         if (r) {
1119                 DMERR("%s: error deleting writeset tree clone", __func__);
1120                 dm_tm_unlock(md->tm, clone);
1121                 return r;
1122         }
1123
1124         r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root));
1125         if (r) {
1126                 DMERR("%s: error deleting era array clone", __func__);
1127                 dm_tm_unlock(md->tm, clone);
1128                 return r;
1129         }
1130
1131         location = dm_block_location(clone);
1132         dm_tm_unlock(md->tm, clone);
1133
1134         return dm_sm_dec_block(md->sm, location);
1135 }
1136
/*
 * Snapshot of metadata state, filled in by metadata_get_stats() and
 * reported by the target's STATUSTYPE_INFO status line.
 */
struct metadata_stats {
	dm_block_t used;	/* total minus free blocks in the space map */
	dm_block_t total;	/* block count reported by the space map */
	dm_block_t snap;	/* md->metadata_snap; SUPERBLOCK_LOCATION means no snapshot */
	uint32_t era;		/* md->current_era */
};
1143
1144 static int metadata_get_stats(struct era_metadata *md, void *ptr)
1145 {
1146         int r;
1147         struct metadata_stats *s = ptr;
1148         dm_block_t nr_free, nr_total;
1149
1150         r = dm_sm_get_nr_free(md->sm, &nr_free);
1151         if (r) {
1152                 DMERR("dm_sm_get_nr_free returned %d", r);
1153                 return r;
1154         }
1155
1156         r = dm_sm_get_nr_blocks(md->sm, &nr_total);
1157         if (r) {
1158                 DMERR("dm_pool_get_metadata_dev_size returned %d", r);
1159                 return r;
1160         }
1161
1162         s->used = nr_total - nr_free;
1163         s->total = nr_total;
1164         s->snap = md->metadata_snap;
1165         s->era = md->current_era;
1166
1167         return 0;
1168 }
1169
1170 /*----------------------------------------------------------------*/
1171
/*
 * Per-target state for one era device, allocated in era_ctr() and stored
 * in ti->private.
 */
struct era {
	struct dm_target *ti;			/* owning target */

	struct dm_dev *metadata_dev;		/* first constructor argument */
	struct dm_dev *origin_dev;		/* second constructor argument; bios are remapped here */

	dm_block_t nr_blocks;			/* size last applied in era_preresume(), in blocks */
	uint32_t sectors_per_block;		/* third constructor argument */
	int sectors_per_block_shift;		/* log2 of sectors_per_block, or -1 if not a power of two */
	struct era_metadata *md;		/* handle returned by metadata_open() */

	struct workqueue_struct *wq;		/* ordered workqueue that runs @worker */
	struct work_struct worker;		/* executes do_work() */

	spinlock_t deferred_lock;		/* taken around accesses to deferred_bios */
	struct bio_list deferred_bios;		/* writes queued by era_map() for the worker */

	spinlock_t rpc_lock;			/* taken around accesses to rpc_calls */
	struct list_head rpc_calls;		/* struct rpc requests queued for the worker */

	struct digest digest;			/* state for digesting archived writesets */
	atomic_t suspended;			/* non-zero stops wake_worker() queueing work */
};
1195
/*
 * A request for the worker to run a metadata function on behalf of
 * another context; see perform_rpc() and process_rpc_calls().
 */
struct rpc {
	struct list_head list;			/* link in era->rpc_calls */

	int (*fn0)(struct era_metadata *md);	/* called when non-NULL ... */
	int (*fn1)(struct era_metadata *md, void *ref);	/* ... otherwise this is called with @arg */
	void *arg;				/* argument for fn1 */
	int result;				/* return value of the call, or the commit error */

	struct completion complete;		/* completed by the worker when done */
};
1206
1207 /*
1208  *---------------------------------------------------------------
1209  * Remapping.
1210  *---------------------------------------------------------------
1211  */
1212 static bool block_size_is_power_of_two(struct era *era)
1213 {
1214         return era->sectors_per_block_shift >= 0;
1215 }
1216
1217 static dm_block_t get_block(struct era *era, struct bio *bio)
1218 {
1219         sector_t block_nr = bio->bi_iter.bi_sector;
1220
1221         if (!block_size_is_power_of_two(era))
1222                 (void) sector_div(block_nr, era->sectors_per_block);
1223         else
1224                 block_nr >>= era->sectors_per_block_shift;
1225
1226         return block_nr;
1227 }
1228
1229 static void remap_to_origin(struct era *era, struct bio *bio)
1230 {
1231         bio_set_dev(bio, era->origin_dev->bdev);
1232 }
1233
1234 /*
1235  *--------------------------------------------------------------
1236  * Worker thread
1237  *--------------------------------------------------------------
1238  */
1239 static void wake_worker(struct era *era)
1240 {
1241         if (!atomic_read(&era->suspended))
1242                 queue_work(era->wq, &era->worker);
1243 }
1244
1245 static void process_old_eras(struct era *era)
1246 {
1247         int r;
1248
1249         if (!era->digest.step)
1250                 return;
1251
1252         r = era->digest.step(era->md, &era->digest);
1253         if (r < 0) {
1254                 DMERR("%s: digest step failed, stopping digestion", __func__);
1255                 era->digest.step = NULL;
1256
1257         } else if (era->digest.step)
1258                 wake_worker(era);
1259 }
1260
1261 static void process_deferred_bios(struct era *era)
1262 {
1263         int r;
1264         struct bio_list deferred_bios, marked_bios;
1265         struct bio *bio;
1266         struct blk_plug plug;
1267         bool commit_needed = false;
1268         bool failed = false;
1269         struct writeset *ws = era->md->current_writeset;
1270
1271         bio_list_init(&deferred_bios);
1272         bio_list_init(&marked_bios);
1273
1274         spin_lock(&era->deferred_lock);
1275         bio_list_merge(&deferred_bios, &era->deferred_bios);
1276         bio_list_init(&era->deferred_bios);
1277         spin_unlock(&era->deferred_lock);
1278
1279         if (bio_list_empty(&deferred_bios))
1280                 return;
1281
1282         while ((bio = bio_list_pop(&deferred_bios))) {
1283                 r = writeset_test_and_set(&era->md->bitset_info, ws,
1284                                           get_block(era, bio));
1285                 if (r < 0) {
1286                         /*
1287                          * This is bad news, we need to rollback.
1288                          * FIXME: finish.
1289                          */
1290                         failed = true;
1291                 } else if (r == 0)
1292                         commit_needed = true;
1293
1294                 bio_list_add(&marked_bios, bio);
1295         }
1296
1297         if (commit_needed) {
1298                 r = metadata_commit(era->md);
1299                 if (r)
1300                         failed = true;
1301         }
1302
1303         if (failed)
1304                 while ((bio = bio_list_pop(&marked_bios)))
1305                         bio_io_error(bio);
1306         else {
1307                 blk_start_plug(&plug);
1308                 while ((bio = bio_list_pop(&marked_bios))) {
1309                         /*
1310                          * Only update the in-core writeset if the on-disk one
1311                          * was updated too.
1312                          */
1313                         if (commit_needed)
1314                                 set_bit(get_block(era, bio), ws->bits);
1315                         submit_bio_noacct(bio);
1316                 }
1317                 blk_finish_plug(&plug);
1318         }
1319 }
1320
1321 static void process_rpc_calls(struct era *era)
1322 {
1323         int r;
1324         bool need_commit = false;
1325         struct list_head calls;
1326         struct rpc *rpc, *tmp;
1327
1328         INIT_LIST_HEAD(&calls);
1329         spin_lock(&era->rpc_lock);
1330         list_splice_init(&era->rpc_calls, &calls);
1331         spin_unlock(&era->rpc_lock);
1332
1333         list_for_each_entry_safe(rpc, tmp, &calls, list) {
1334                 rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg);
1335                 need_commit = true;
1336         }
1337
1338         if (need_commit) {
1339                 r = metadata_commit(era->md);
1340                 if (r)
1341                         list_for_each_entry_safe(rpc, tmp, &calls, list)
1342                                 rpc->result = r;
1343         }
1344
1345         list_for_each_entry_safe(rpc, tmp, &calls, list)
1346                 complete(&rpc->complete);
1347 }
1348
1349 static void kick_off_digest(struct era *era)
1350 {
1351         if (era->md->archived_writesets) {
1352                 era->md->archived_writesets = false;
1353                 metadata_digest_start(era->md, &era->digest);
1354         }
1355 }
1356
1357 static void do_work(struct work_struct *ws)
1358 {
1359         struct era *era = container_of(ws, struct era, worker);
1360
1361         kick_off_digest(era);
1362         process_old_eras(era);
1363         process_deferred_bios(era);
1364         process_rpc_calls(era);
1365 }
1366
1367 static void defer_bio(struct era *era, struct bio *bio)
1368 {
1369         spin_lock(&era->deferred_lock);
1370         bio_list_add(&era->deferred_bios, bio);
1371         spin_unlock(&era->deferred_lock);
1372
1373         wake_worker(era);
1374 }
1375
1376 /*
1377  * Make an rpc call to the worker to change the metadata.
1378  */
1379 static int perform_rpc(struct era *era, struct rpc *rpc)
1380 {
1381         rpc->result = 0;
1382         init_completion(&rpc->complete);
1383
1384         spin_lock(&era->rpc_lock);
1385         list_add(&rpc->list, &era->rpc_calls);
1386         spin_unlock(&era->rpc_lock);
1387
1388         wake_worker(era);
1389         wait_for_completion(&rpc->complete);
1390
1391         return rpc->result;
1392 }
1393
1394 static int in_worker0(struct era *era, int (*fn)(struct era_metadata *md))
1395 {
1396         struct rpc rpc;
1397
1398         rpc.fn0 = fn;
1399         rpc.fn1 = NULL;
1400
1401         return perform_rpc(era, &rpc);
1402 }
1403
1404 static int in_worker1(struct era *era,
1405                       int (*fn)(struct era_metadata *md, void *ref), void *arg)
1406 {
1407         struct rpc rpc;
1408
1409         rpc.fn0 = NULL;
1410         rpc.fn1 = fn;
1411         rpc.arg = arg;
1412
1413         return perform_rpc(era, &rpc);
1414 }
1415
1416 static void start_worker(struct era *era)
1417 {
1418         atomic_set(&era->suspended, 0);
1419 }
1420
1421 static void stop_worker(struct era *era)
1422 {
1423         atomic_set(&era->suspended, 1);
1424         drain_workqueue(era->wq);
1425 }
1426
1427 /*
1428  *--------------------------------------------------------------
1429  * Target methods
1430  *--------------------------------------------------------------
1431  */
1432 static void era_destroy(struct era *era)
1433 {
1434         if (era->md)
1435                 metadata_close(era->md);
1436
1437         if (era->wq)
1438                 destroy_workqueue(era->wq);
1439
1440         if (era->origin_dev)
1441                 dm_put_device(era->ti, era->origin_dev);
1442
1443         if (era->metadata_dev)
1444                 dm_put_device(era->ti, era->metadata_dev);
1445
1446         kfree(era);
1447 }
1448
1449 static dm_block_t calc_nr_blocks(struct era *era)
1450 {
1451         return dm_sector_div_up(era->ti->len, era->sectors_per_block);
1452 }
1453
1454 static bool valid_block_size(dm_block_t block_size)
1455 {
1456         bool greater_than_zero = block_size > 0;
1457         bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0;
1458
1459         return greater_than_zero && multiple_of_min_block_size;
1460 }
1461
1462 /*
1463  * <metadata dev> <data dev> <data block size (sectors)>
1464  */
1465 static int era_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1466 {
1467         int r;
1468         char dummy;
1469         struct era *era;
1470         struct era_metadata *md;
1471
1472         if (argc != 3) {
1473                 ti->error = "Invalid argument count";
1474                 return -EINVAL;
1475         }
1476
1477         era = kzalloc(sizeof(*era), GFP_KERNEL);
1478         if (!era) {
1479                 ti->error = "Error allocating era structure";
1480                 return -ENOMEM;
1481         }
1482
1483         era->ti = ti;
1484
1485         r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
1486         if (r) {
1487                 ti->error = "Error opening metadata device";
1488                 era_destroy(era);
1489                 return -EINVAL;
1490         }
1491
1492         r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
1493         if (r) {
1494                 ti->error = "Error opening data device";
1495                 era_destroy(era);
1496                 return -EINVAL;
1497         }
1498
1499         r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy);
1500         if (r != 1) {
1501                 ti->error = "Error parsing block size";
1502                 era_destroy(era);
1503                 return -EINVAL;
1504         }
1505
1506         r = dm_set_target_max_io_len(ti, era->sectors_per_block);
1507         if (r) {
1508                 ti->error = "could not set max io len";
1509                 era_destroy(era);
1510                 return -EINVAL;
1511         }
1512
1513         if (!valid_block_size(era->sectors_per_block)) {
1514                 ti->error = "Invalid block size";
1515                 era_destroy(era);
1516                 return -EINVAL;
1517         }
1518         if (era->sectors_per_block & (era->sectors_per_block - 1))
1519                 era->sectors_per_block_shift = -1;
1520         else
1521                 era->sectors_per_block_shift = __ffs(era->sectors_per_block);
1522
1523         md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true);
1524         if (IS_ERR(md)) {
1525                 ti->error = "Error reading metadata";
1526                 era_destroy(era);
1527                 return PTR_ERR(md);
1528         }
1529         era->md = md;
1530
1531         era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1532         if (!era->wq) {
1533                 ti->error = "could not create workqueue for metadata object";
1534                 era_destroy(era);
1535                 return -ENOMEM;
1536         }
1537         INIT_WORK(&era->worker, do_work);
1538
1539         spin_lock_init(&era->deferred_lock);
1540         bio_list_init(&era->deferred_bios);
1541
1542         spin_lock_init(&era->rpc_lock);
1543         INIT_LIST_HEAD(&era->rpc_calls);
1544
1545         ti->private = era;
1546         ti->num_flush_bios = 1;
1547         ti->flush_supported = true;
1548
1549         ti->num_discard_bios = 1;
1550
1551         return 0;
1552 }
1553
1554 static void era_dtr(struct dm_target *ti)
1555 {
1556         era_destroy(ti->private);
1557 }
1558
1559 static int era_map(struct dm_target *ti, struct bio *bio)
1560 {
1561         struct era *era = ti->private;
1562         dm_block_t block = get_block(era, bio);
1563
1564         /*
1565          * All bios get remapped to the origin device.  We do this now, but
1566          * it may not get issued until later.  Depending on whether the
1567          * block is marked in this era.
1568          */
1569         remap_to_origin(era, bio);
1570
1571         /*
1572          * REQ_PREFLUSH bios carry no data, so we're not interested in them.
1573          */
1574         if (!(bio->bi_opf & REQ_PREFLUSH) &&
1575             (bio_data_dir(bio) == WRITE) &&
1576             !metadata_current_marked(era->md, block)) {
1577                 defer_bio(era, bio);
1578                 return DM_MAPIO_SUBMITTED;
1579         }
1580
1581         return DM_MAPIO_REMAPPED;
1582 }
1583
1584 static void era_postsuspend(struct dm_target *ti)
1585 {
1586         int r;
1587         struct era *era = ti->private;
1588
1589         r = in_worker0(era, metadata_era_archive);
1590         if (r) {
1591                 DMERR("%s: couldn't archive current era", __func__);
1592                 /* FIXME: fail mode */
1593         }
1594
1595         stop_worker(era);
1596
1597         r = metadata_commit(era->md);
1598         if (r) {
1599                 DMERR("%s: metadata_commit failed", __func__);
1600                 /* FIXME: fail mode */
1601         }
1602 }
1603
1604 static int era_preresume(struct dm_target *ti)
1605 {
1606         int r;
1607         struct era *era = ti->private;
1608         dm_block_t new_size = calc_nr_blocks(era);
1609
1610         if (era->nr_blocks != new_size) {
1611                 r = metadata_resize(era->md, &new_size);
1612                 if (r) {
1613                         DMERR("%s: metadata_resize failed", __func__);
1614                         return r;
1615                 }
1616
1617                 r = metadata_commit(era->md);
1618                 if (r) {
1619                         DMERR("%s: metadata_commit failed", __func__);
1620                         return r;
1621                 }
1622
1623                 era->nr_blocks = new_size;
1624         }
1625
1626         start_worker(era);
1627
1628         r = in_worker0(era, metadata_era_rollover);
1629         if (r) {
1630                 DMERR("%s: metadata_era_rollover failed", __func__);
1631                 return r;
1632         }
1633
1634         return 0;
1635 }
1636
1637 /*
1638  * Status format:
1639  *
1640  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1641  * <current era> <held metadata root | '-'>
1642  */
1643 static void era_status(struct dm_target *ti, status_type_t type,
1644                        unsigned int status_flags, char *result, unsigned int maxlen)
1645 {
1646         int r;
1647         struct era *era = ti->private;
1648         ssize_t sz = 0;
1649         struct metadata_stats stats;
1650         char buf[BDEVNAME_SIZE];
1651
1652         switch (type) {
1653         case STATUSTYPE_INFO:
1654                 r = in_worker1(era, metadata_get_stats, &stats);
1655                 if (r)
1656                         goto err;
1657
1658                 DMEMIT("%u %llu/%llu %u",
1659                        (unsigned int) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
1660                        (unsigned long long) stats.used,
1661                        (unsigned long long) stats.total,
1662                        (unsigned int) stats.era);
1663
1664                 if (stats.snap != SUPERBLOCK_LOCATION)
1665                         DMEMIT(" %llu", stats.snap);
1666                 else
1667                         DMEMIT(" -");
1668                 break;
1669
1670         case STATUSTYPE_TABLE:
1671                 format_dev_t(buf, era->metadata_dev->bdev->bd_dev);
1672                 DMEMIT("%s ", buf);
1673                 format_dev_t(buf, era->origin_dev->bdev->bd_dev);
1674                 DMEMIT("%s %u", buf, era->sectors_per_block);
1675                 break;
1676
1677         case STATUSTYPE_IMA:
1678                 *result = '\0';
1679                 break;
1680         }
1681
1682         return;
1683
1684 err:
1685         DMEMIT("Error");
1686 }
1687
1688 static int era_message(struct dm_target *ti, unsigned int argc, char **argv,
1689                        char *result, unsigned int maxlen)
1690 {
1691         struct era *era = ti->private;
1692
1693         if (argc != 1) {
1694                 DMERR("incorrect number of message arguments");
1695                 return -EINVAL;
1696         }
1697
1698         if (!strcasecmp(argv[0], "checkpoint"))
1699                 return in_worker0(era, metadata_checkpoint);
1700
1701         if (!strcasecmp(argv[0], "take_metadata_snap"))
1702                 return in_worker0(era, metadata_take_snap);
1703
1704         if (!strcasecmp(argv[0], "drop_metadata_snap"))
1705                 return in_worker0(era, metadata_drop_snap);
1706
1707         DMERR("unsupported message '%s'", argv[0]);
1708         return -EINVAL;
1709 }
1710
1711 static sector_t get_dev_size(struct dm_dev *dev)
1712 {
1713         return bdev_nr_sectors(dev->bdev);
1714 }
1715
1716 static int era_iterate_devices(struct dm_target *ti,
1717                                iterate_devices_callout_fn fn, void *data)
1718 {
1719         struct era *era = ti->private;
1720
1721         return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
1722 }
1723
1724 static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
1725 {
1726         struct era *era = ti->private;
1727         uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
1728
1729         /*
1730          * If the system-determined stacked limits are compatible with the
1731          * era device's blocksize (io_opt is a factor) do not override them.
1732          */
1733         if (io_opt_sectors < era->sectors_per_block ||
1734             do_div(io_opt_sectors, era->sectors_per_block)) {
1735                 blk_limits_io_min(limits, 0);
1736                 blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
1737         }
1738 }
1739
1740 /*----------------------------------------------------------------*/
1741
1742 static struct target_type era_target = {
1743         .name = "era",
1744         .version = {1, 0, 0},
1745         .module = THIS_MODULE,
1746         .ctr = era_ctr,
1747         .dtr = era_dtr,
1748         .map = era_map,
1749         .postsuspend = era_postsuspend,
1750         .preresume = era_preresume,
1751         .status = era_status,
1752         .message = era_message,
1753         .iterate_devices = era_iterate_devices,
1754         .io_hints = era_io_hints
1755 };
1756
1757 static int __init dm_era_init(void)
1758 {
1759         int r;
1760
1761         r = dm_register_target(&era_target);
1762         if (r) {
1763                 DMERR("era target registration failed: %d", r);
1764                 return r;
1765         }
1766
1767         return 0;
1768 }
1769
1770 static void __exit dm_era_exit(void)
1771 {
1772         dm_unregister_target(&era_target);
1773 }
1774
1775 module_init(dm_era_init);
1776 module_exit(dm_era_exit);
1777
1778 MODULE_DESCRIPTION(DM_NAME " era target");
1779 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
1780 MODULE_LICENSE("GPL");