Btrfs-progs: Add chunk recover function - using old chunk items
[platform/upstream/btrfs-progs.git] / cmds-chunk.c
1 /*
2  * Copyright (C) 2013 Fujitsu.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #define _XOPEN_SOURCE 500
19 #define _GNU_SOURCE
20
21 #include <stdio.h>
22 #include <stdio_ext.h>
23 #include <stdlib.h>
24 #include <sys/types.h>
25 #include <sys/stat.h>
26 #include <fcntl.h>
27 #include <unistd.h>
28 #include <uuid/uuid.h>
29
30 #include "kerncompat.h"
31 #include "list.h"
32 #include "radix-tree.h"
33 #include "ctree.h"
34 #include "extent-cache.h"
35 #include "disk-io.h"
36 #include "volumes.h"
37 #include "transaction.h"
38 #include "crc32c.h"
39 #include "utils.h"
40 #include "version.h"
41 #include "btrfsck.h"
42 #include "commands.h"
43
44 #define BTRFS_CHUNK_TREE_REBUILD_ABORTED        -7500
45
46 struct recover_control {
47         int verbose;
48         int yes;
49
50         u16 csum_size;
51         u32 sectorsize;
52         u32 leafsize;
53         u64 generation;
54         u64 chunk_root_generation;
55
56         struct btrfs_fs_devices *fs_devices;
57
58         struct cache_tree chunk;
59         struct block_group_tree bg;
60         struct device_extent_tree devext;
61
62         struct list_head good_chunks;
63         struct list_head bad_chunks;
64 };
65
66 static struct btrfs_chunk *create_chunk_item(struct chunk_record *record)
67 {
68         struct btrfs_chunk *ret;
69         struct btrfs_stripe *chunk_stripe;
70         int i;
71
72         if (!record || record->num_stripes == 0)
73                 return NULL;
74         ret = malloc(btrfs_chunk_item_size(record->num_stripes));
75         if (!ret)
76                 return NULL;
77         btrfs_set_stack_chunk_length(ret, record->length);
78         btrfs_set_stack_chunk_owner(ret, record->owner);
79         btrfs_set_stack_chunk_stripe_len(ret, record->stripe_len);
80         btrfs_set_stack_chunk_type(ret, record->type_flags);
81         btrfs_set_stack_chunk_io_align(ret, record->io_align);
82         btrfs_set_stack_chunk_io_width(ret, record->io_width);
83         btrfs_set_stack_chunk_sector_size(ret, record->sector_size);
84         btrfs_set_stack_chunk_num_stripes(ret, record->num_stripes);
85         btrfs_set_stack_chunk_sub_stripes(ret, record->sub_stripes);
86         for (i = 0, chunk_stripe = &ret->stripe; i < record->num_stripes;
87              i++, chunk_stripe++) {
88                 btrfs_set_stack_stripe_devid(chunk_stripe,
89                                 record->stripes[i].devid);
90                 btrfs_set_stack_stripe_offset(chunk_stripe,
91                                 record->stripes[i].offset);
92                 memcpy(chunk_stripe->dev_uuid, record->stripes[i].dev_uuid,
93                        BTRFS_UUID_SIZE);
94         }
95         return ret;
96 }
97
98 void init_recover_control(struct recover_control *rc, int verbose, int yes)
99 {
100         memset(rc, 0, sizeof(struct recover_control));
101         cache_tree_init(&rc->chunk);
102         block_group_tree_init(&rc->bg);
103         device_extent_tree_init(&rc->devext);
104
105         INIT_LIST_HEAD(&rc->good_chunks);
106         INIT_LIST_HEAD(&rc->bad_chunks);
107
108         rc->verbose = verbose;
109         rc->yes = yes;
110 }
111
112 void free_recover_control(struct recover_control *rc)
113 {
114         free_block_group_tree(&rc->bg);
115         free_chunk_cache_tree(&rc->chunk);
116         free_device_extent_tree(&rc->devext);
117 }
118
119 static int process_block_group_item(struct block_group_tree *bg_cache,
120                                     struct extent_buffer *leaf,
121                                     struct btrfs_key *key, int slot)
122 {
123         struct block_group_record *rec;
124         struct block_group_record *exist;
125         struct cache_extent *cache;
126         int ret = 0;
127
128         rec = btrfs_new_block_group_record(leaf, key, slot);
129         if (!rec->cache.size)
130                 goto free_out;
131 again:
132         cache = lookup_cache_extent(&bg_cache->tree,
133                                     rec->cache.start,
134                                     rec->cache.size);
135         if (cache) {
136                 exist = container_of(cache, struct block_group_record, cache);
137
138                 /*check the generation and replace if needed*/
139                 if (exist->generation > rec->generation)
140                         goto free_out;
141                 if (exist->generation == rec->generation) {
142                         int offset = offsetof(struct block_group_record,
143                                               generation);
144                         /*
145                          * According to the current kernel code, the following
146                          * case is impossble, or there is something wrong in
147                          * the kernel code.
148                          */
149                         if (memcmp(((void *)exist) + offset,
150                                    ((void *)rec) + offset,
151                                    sizeof(*rec) - offset))
152                                 ret = -EEXIST;
153                         goto free_out;
154                 }
155                 remove_cache_extent(&bg_cache->tree, cache);
156                 list_del_init(&exist->list);
157                 free(exist);
158                 /*
159                  * We must do seach again to avoid the following cache.
160                  * /--old bg 1--//--old bg 2--/
161                  *        /--new bg--/
162                  */
163                 goto again;
164         }
165
166         ret = insert_block_group_record(bg_cache, rec);
167         BUG_ON(ret);
168 out:
169         return ret;
170 free_out:
171         free(rec);
172         goto out;
173 }
174
175 static int process_chunk_item(struct cache_tree *chunk_cache,
176                               struct extent_buffer *leaf, struct btrfs_key *key,
177                               int slot)
178 {
179         struct chunk_record *rec;
180         struct chunk_record *exist;
181         struct cache_extent *cache;
182         int ret = 0;
183
184         rec = btrfs_new_chunk_record(leaf, key, slot);
185         if (!rec->cache.size)
186                 goto free_out;
187 again:
188         cache = lookup_cache_extent(chunk_cache, rec->offset, rec->length);
189         if (cache) {
190                 exist = container_of(cache, struct chunk_record, cache);
191
192                 if (exist->generation > rec->generation)
193                         goto free_out;
194                 if (exist->generation == rec->generation) {
195                         int num_stripes = rec->num_stripes;
196                         int rec_size = btrfs_chunk_record_size(num_stripes);
197                         int offset = offsetof(struct chunk_record, generation);
198
199                         if (exist->num_stripes != rec->num_stripes ||
200                             memcmp(((void *)exist) + offset,
201                                    ((void *)rec) + offset,
202                                    rec_size - offset))
203                                 ret = -EEXIST;
204                         goto free_out;
205                 }
206                 remove_cache_extent(chunk_cache, cache);
207                 free(exist);
208                 goto again;
209         }
210         ret = insert_cache_extent(chunk_cache, &rec->cache);
211         BUG_ON(ret);
212 out:
213         return ret;
214 free_out:
215         free(rec);
216         goto out;
217 }
218
219 static int process_device_extent_item(struct device_extent_tree *devext_cache,
220                                       struct extent_buffer *leaf,
221                                       struct btrfs_key *key, int slot)
222 {
223         struct device_extent_record *rec;
224         struct device_extent_record *exist;
225         struct cache_extent *cache;
226         int ret = 0;
227
228         rec = btrfs_new_device_extent_record(leaf, key, slot);
229         if (!rec->cache.size)
230                 goto free_out;
231 again:
232         cache = lookup_cache_extent2(&devext_cache->tree,
233                                      rec->cache.objectid,
234                                      rec->cache.start,
235                                      rec->cache.size);
236         if (cache) {
237                 exist = container_of(cache, struct device_extent_record, cache);
238                 if (exist->generation > rec->generation)
239                         goto free_out;
240                 if (exist->generation == rec->generation) {
241                         int offset = offsetof(struct device_extent_record,
242                                               generation);
243                         if (memcmp(((void *)exist) + offset,
244                                    ((void *)rec) + offset,
245                                    sizeof(*rec) - offset))
246                                 ret = -EEXIST;
247                         goto free_out;
248                 }
249                 remove_cache_extent(&devext_cache->tree, cache);
250                 list_del_init(&exist->chunk_list);
251                 list_del_init(&exist->device_list);
252                 free(exist);
253                 goto again;
254         }
255
256         ret = insert_device_extent_record(devext_cache, rec);
257         BUG_ON(ret);
258 out:
259         return ret;
260 free_out:
261         free(rec);
262         goto out;
263 }
264
265 static void print_block_group_info(struct block_group_record *rec, char *prefix)
266 {
267         if (prefix)
268                 printf("%s", prefix);
269         printf("Block Group: start = %llu, len = %llu, flag = %llx\n",
270                rec->objectid, rec->offset, rec->flags);
271 }
272
273 static void print_block_group_tree(struct block_group_tree *tree)
274 {
275         struct cache_extent *cache;
276         struct block_group_record *rec;
277
278         printf("All Block Groups:\n");
279         for (cache = first_cache_extent(&tree->tree); cache;
280              cache = next_cache_extent(cache)) {
281                 rec = container_of(cache, struct block_group_record, cache);
282                 print_block_group_info(rec, "\t");
283         }
284         printf("\n");
285 }
286
287 static void print_stripe_info(struct stripe *data, char *prefix1, char *prefix2,
288                               int index)
289 {
290         if (prefix1)
291                 printf("%s", prefix1);
292         if (prefix2)
293                 printf("%s", prefix2);
294         printf("[%2d] Stripe: devid = %llu, offset = %llu\n",
295                index, data->devid, data->offset);
296 }
297
298 static void print_chunk_self_info(struct chunk_record *rec, char *prefix)
299 {
300         int i;
301
302         if (prefix)
303                 printf("%s", prefix);
304         printf("Chunk: start = %llu, len = %llu, type = %llx, num_stripes = %u\n",
305                rec->offset, rec->length, rec->type_flags, rec->num_stripes);
306         if (prefix)
307                 printf("%s", prefix);
308         printf("    Stripes list:\n");
309         for (i = 0; i < rec->num_stripes; i++)
310                 print_stripe_info(&rec->stripes[i], prefix, "    ", i);
311 }
312
313 static void print_chunk_tree(struct cache_tree *tree)
314 {
315         struct cache_extent *n;
316         struct chunk_record *entry;
317
318         printf("All Chunks:\n");
319         for (n = first_cache_extent(tree); n;
320              n = next_cache_extent(n)) {
321                 entry = container_of(n, struct chunk_record, cache);
322                 print_chunk_self_info(entry, "\t");
323         }
324         printf("\n");
325 }
326
327 static void print_device_extent_info(struct device_extent_record *rec,
328                                      char *prefix)
329 {
330         if (prefix)
331                 printf("%s", prefix);
332         printf("Device extent: devid = %llu, start = %llu, len = %llu, chunk offset = %llu\n",
333                rec->objectid, rec->offset, rec->length, rec->chunk_offset);
334 }
335
336 static void print_device_extent_tree(struct device_extent_tree *tree)
337 {
338         struct cache_extent *n;
339         struct device_extent_record *entry;
340
341         printf("All Device Extents:\n");
342         for (n = first_cache_extent(&tree->tree); n;
343              n = next_cache_extent(n)) {
344                 entry = container_of(n, struct device_extent_record, cache);
345                 print_device_extent_info(entry, "\t");
346         }
347         printf("\n");
348 }
349
350 static void print_device_info(struct btrfs_device *device, char *prefix)
351 {
352         if (prefix)
353                 printf("%s", prefix);
354         printf("Device: id = %llu, name = %s\n",
355                device->devid, device->name);
356 }
357
358 static void print_all_devices(struct list_head *devices)
359 {
360         struct btrfs_device *dev;
361
362         printf("All Devices:\n");
363         list_for_each_entry(dev, devices, dev_list)
364                 print_device_info(dev, "\t");
365         printf("\n");
366 }
367
368 static void print_scan_result(struct recover_control *rc)
369 {
370         if (!rc->verbose)
371                 return;
372
373         printf("DEVICE SCAN RESULT:\n");
374         printf("Filesystem Information:\n");
375         printf("\tsectorsize: %d\n", rc->sectorsize);
376         printf("\tleafsize: %d\n", rc->leafsize);
377         printf("\ttree root generation: %llu\n", rc->generation);
378         printf("\tchunk root generation: %llu\n", rc->chunk_root_generation);
379         printf("\n");
380
381         print_all_devices(&rc->fs_devices->devices);
382         print_block_group_tree(&rc->bg);
383         print_chunk_tree(&rc->chunk);
384         print_device_extent_tree(&rc->devext);
385 }
386
387 static void print_chunk_info(struct chunk_record *chunk, char *prefix)
388 {
389         struct device_extent_record *devext;
390         int i;
391
392         print_chunk_self_info(chunk, prefix);
393         if (prefix)
394                 printf("%s", prefix);
395         if (chunk->bg_rec)
396                 print_block_group_info(chunk->bg_rec, "    ");
397         else
398                 printf("    No block group.\n");
399         if (prefix)
400                 printf("%s", prefix);
401         if (list_empty(&chunk->dextents)) {
402                 printf("    No device extent.\n");
403         } else {
404                 printf("    Device extent list:\n");
405                 i = 0;
406                 list_for_each_entry(devext, &chunk->dextents, chunk_list) {
407                         if (prefix)
408                                 printf("%s", prefix);
409                         printf("%s[%2d]", "        ", i);
410                         print_device_extent_info(devext, NULL);
411                         i++;
412                 }
413         }
414 }
415
416 static void print_check_result(struct recover_control *rc)
417 {
418         struct chunk_record *chunk;
419         struct block_group_record *bg;
420         struct device_extent_record *devext;
421         int total = 0;
422         int good = 0;
423         int bad = 0;
424
425         if (!rc->verbose)
426                 return;
427
428         printf("CHECK RESULT:\n");
429         printf("Healthy Chunks:\n");
430         list_for_each_entry(chunk, &rc->good_chunks, list) {
431                 print_chunk_info(chunk, "  ");
432                 good++;
433                 total++;
434         }
435         printf("Bad Chunks:\n");
436         list_for_each_entry(chunk, &rc->bad_chunks, list) {
437                 print_chunk_info(chunk, "  ");
438                 bad++;
439                 total++;
440         }
441         printf("\n");
442         printf("Total Chunks:\t%d\n", total);
443         printf("  Heathy:\t%d\n", good);
444         printf("  Bad:\t%d\n", bad);
445
446         printf("\n");
447         printf("Orphan Block Groups:\n");
448         list_for_each_entry(bg, &rc->bg.block_groups, list)
449                 print_block_group_info(bg, "  ");
450
451         printf("\n");
452         printf("Orphan Device Extents:\n");
453         list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list)
454                 print_device_extent_info(devext, "  ");
455 }
456
457 static int check_chunk_by_metadata(struct recover_control *rc,
458                                    struct btrfs_root *root,
459                                    struct chunk_record *chunk, int bg_only)
460 {
461         int ret;
462         int i;
463         int slot;
464         struct btrfs_path path;
465         struct btrfs_key key;
466         struct btrfs_root *dev_root;
467         struct stripe *stripe;
468         struct btrfs_dev_extent *dev_extent;
469         struct btrfs_block_group_item *bg_ptr;
470         struct extent_buffer *l;
471
472         btrfs_init_path(&path);
473
474         if (bg_only)
475                 goto bg_check;
476
477         dev_root = root->fs_info->dev_root;
478         for (i = 0; i < chunk->num_stripes; i++) {
479                 stripe = &chunk->stripes[i];
480
481                 key.objectid = stripe->devid;
482                 key.offset = stripe->offset;
483                 key.type = BTRFS_DEV_EXTENT_KEY;
484
485                 ret = btrfs_search_slot(NULL, dev_root, &key, &path, 0, 0);
486                 if (ret < 0) {
487                         fprintf(stderr, "Search device extent failed(%d)\n",
488                                 ret);
489                         btrfs_release_path(root, &path);
490                         return ret;
491                 } else if (ret > 0) {
492                         if (rc->verbose)
493                                 fprintf(stderr,
494                                         "No device extent[%llu, %llu]\n",
495                                         stripe->devid, stripe->offset);
496                         btrfs_release_path(root, &path);
497                         return -ENOENT;
498                 }
499                 l = path.nodes[0];
500                 slot = path.slots[0];
501                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
502                 if (chunk->offset !=
503                     btrfs_dev_extent_chunk_offset(l, dev_extent)) {
504                         if (rc->verbose)
505                                 fprintf(stderr,
506                                         "Device tree unmatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
507                                         btrfs_dev_extent_chunk_offset(l,
508                                                                 dev_extent),
509                                         btrfs_dev_extent_length(l, dev_extent),
510                                         chunk->offset, chunk->length);
511                         btrfs_release_path(root, &path);
512                         return -ENOENT;
513                 }
514                 btrfs_release_path(root, &path);
515         }
516
517 bg_check:
518         key.objectid = chunk->offset;
519         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
520         key.offset = chunk->length;
521
522         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, &path,
523                                 0, 0);
524         if (ret < 0) {
525                 fprintf(stderr, "Search block group failed(%d)\n", ret);
526                 btrfs_release_path(root, &path);
527                 return ret;
528         } else if (ret > 0) {
529                 if (rc->verbose)
530                         fprintf(stderr, "No block group[%llu, %llu]\n",
531                                 key.objectid, key.offset);
532                 btrfs_release_path(root, &path);
533                 return -ENOENT;
534         }
535
536         l = path.nodes[0];
537         slot = path.slots[0];
538         bg_ptr = btrfs_item_ptr(l, slot, struct btrfs_block_group_item);
539         if (chunk->type_flags != btrfs_disk_block_group_flags(l, bg_ptr)) {
540                 if (rc->verbose)
541                         fprintf(stderr,
542                                 "Chunk[%llu, %llu]'s type(%llu) is differemt with Block Group's type(%llu)\n",
543                                 chunk->offset, chunk->length, chunk->type_flags,
544                                 btrfs_disk_block_group_flags(l, bg_ptr));
545                 btrfs_release_path(root, &path);
546                 return -ENOENT;
547         }
548         btrfs_release_path(root, &path);
549         return 0;
550 }
551
552 static int check_all_chunks_by_metadata(struct recover_control *rc,
553                                         struct btrfs_root *root)
554 {
555         struct chunk_record *chunk;
556         LIST_HEAD(orphan_chunks);
557         int ret = 0;
558         int err;
559
560         list_for_each_entry(chunk, &rc->good_chunks, list) {
561                 err = check_chunk_by_metadata(rc, root, chunk, 0);
562                 if (err) {
563                         if (err == -ENOENT)
564                                 list_move_tail(&chunk->list, &orphan_chunks);
565                         else if (err && !ret)
566                                 ret = err;
567                 }
568         }
569
570         list_for_each_entry(chunk, &rc->bad_chunks, list) {
571                 err = check_chunk_by_metadata(rc, root, chunk, 1);
572                 if (err != -ENOENT && !ret)
573                         ret = err ? err : -EINVAL;
574         }
575         list_splice(&orphan_chunks, &rc->bad_chunks);
576         return ret;
577 }
578
579 static int extract_metadata_record(struct recover_control *rc,
580                                    struct extent_buffer *leaf)
581 {
582         struct btrfs_key key;
583         int ret = 0;
584         int i;
585         u32 nritems;
586
587         nritems = btrfs_header_nritems(leaf);
588         for (i = 0; i < nritems; i++) {
589                 btrfs_item_key_to_cpu(leaf, &key, i);
590                 switch (key.type) {
591                 case BTRFS_BLOCK_GROUP_ITEM_KEY:
592                         ret = process_block_group_item(&rc->bg, leaf, &key, i);
593                         break;
594                 case BTRFS_CHUNK_ITEM_KEY:
595                         ret = process_chunk_item(&rc->chunk, leaf, &key, i);
596                         break;
597                 case BTRFS_DEV_EXTENT_KEY:
598                         ret = process_device_extent_item(&rc->devext, leaf,
599                                                          &key, i);
600                         break;
601                 }
602                 if (ret)
603                         break;
604         }
605         return ret;
606 }
607
608 static inline int is_super_block_address(u64 offset)
609 {
610         int i;
611
612         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
613                 if (offset == btrfs_sb_offset(i))
614                         return 1;
615         }
616         return 0;
617 }
618
619 static int scan_one_device(struct recover_control *rc, int fd)
620 {
621         struct extent_buffer *buf;
622         u64 bytenr;
623         int ret = 0;
624
625         buf = malloc(sizeof(*buf) + rc->leafsize);
626         if (!buf)
627                 return -ENOMEM;
628         buf->len = rc->leafsize;
629
630         bytenr = 0;
631         while (1) {
632                 if (is_super_block_address(bytenr))
633                         bytenr += rc->sectorsize;
634
635                 if (pread64(fd, buf->data, rc->leafsize, bytenr) <
636                     rc->leafsize)
637                         break;
638
639                 if (memcmp_extent_buffer(buf, rc->fs_devices->fsid,
640                                          (unsigned long)btrfs_header_fsid(buf),
641                                          BTRFS_FSID_SIZE)) {
642                         bytenr += rc->sectorsize;
643                         continue;
644                 }
645
646                 if (verify_tree_block_csum_silent(buf, rc->csum_size)) {
647                         bytenr += rc->sectorsize;
648                         continue;
649                 }
650
651                 if (btrfs_header_level(buf) != 0)
652                         goto next_node;
653
654                 switch (btrfs_header_owner(buf)) {
655                 case BTRFS_EXTENT_TREE_OBJECTID:
656                 case BTRFS_DEV_TREE_OBJECTID:
657                         /* different tree use different generation */
658                         if (btrfs_header_generation(buf) > rc->generation)
659                                 break;
660                         ret = extract_metadata_record(rc, buf);
661                         if (ret)
662                                 goto out;
663                         break;
664                 case BTRFS_CHUNK_TREE_OBJECTID:
665                         if (btrfs_header_generation(buf) >
666                             rc->chunk_root_generation)
667                                 break;
668                         ret = extract_metadata_record(rc, buf);
669                         if (ret)
670                                 goto out;
671                         break;
672                 }
673 next_node:
674                 bytenr += rc->leafsize;
675         }
676 out:
677         free(buf);
678         return ret;
679 }
680
681 static int scan_devices(struct recover_control *rc)
682 {
683         int ret = 0;
684         int fd;
685         struct btrfs_device *dev;
686
687         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
688                 fd = open(dev->name, O_RDONLY);
689                 if (fd < 0) {
690                         fprintf(stderr, "Failed to open device %s\n",
691                                 dev->name);
692                         return -1;
693                 }
694                 ret = scan_one_device(rc, fd);
695                 close(fd);
696                 if (ret)
697                         return ret;
698         }
699         return ret;
700 }
701
702 static int build_device_map_by_chunk_record(struct btrfs_root *root,
703                                             struct chunk_record *chunk)
704 {
705         int ret = 0;
706         int i;
707         u64 devid;
708         u8 uuid[BTRFS_UUID_SIZE];
709         u16 num_stripes;
710         struct btrfs_mapping_tree *map_tree;
711         struct map_lookup *map;
712         struct stripe *stripe;
713
714         map_tree = &root->fs_info->mapping_tree;
715         num_stripes = chunk->num_stripes;
716         map = malloc(btrfs_map_lookup_size(num_stripes));
717         if (!map)
718                 return -ENOMEM;
719         map->ce.start = chunk->offset;
720         map->ce.size = chunk->length;
721         map->num_stripes = num_stripes;
722         map->io_width = chunk->io_width;
723         map->io_align = chunk->io_align;
724         map->sector_size = chunk->sector_size;
725         map->stripe_len = chunk->stripe_len;
726         map->type = chunk->type_flags;
727         map->sub_stripes = chunk->sub_stripes;
728
729         for (i = 0, stripe = chunk->stripes; i < num_stripes; i++, stripe++) {
730                 devid = stripe->devid;
731                 memcpy(uuid, stripe->dev_uuid, BTRFS_UUID_SIZE);
732                 map->stripes[i].physical = stripe->offset;
733                 map->stripes[i].dev = btrfs_find_device(root, devid,
734                                                         uuid, NULL);
735                 if (!map->stripes[i].dev) {
736                         kfree(map);
737                         return -EIO;
738                 }
739         }
740
741         ret = insert_cache_extent(&map_tree->cache_tree, &map->ce);
742         return ret;
743 }
744
745 static int build_device_maps_by_chunk_records(struct recover_control *rc,
746                                               struct btrfs_root *root)
747 {
748         int ret = 0;
749         struct chunk_record *chunk;
750
751         list_for_each_entry(chunk, &rc->good_chunks, list) {
752                 ret = build_device_map_by_chunk_record(root, chunk);
753                 if (ret)
754                         return ret;
755         }
756         return ret;
757 }
758
759 static int block_group_remove_all_extent_items(struct btrfs_trans_handle *trans,
760                                                struct btrfs_root *root,
761                                                struct block_group_record *bg)
762 {
763         struct btrfs_fs_info *fs_info = root->fs_info;
764         struct btrfs_key key;
765         struct btrfs_path path;
766         struct extent_buffer *leaf;
767         u64 start = bg->objectid;
768         u64 end = bg->objectid + bg->offset;
769         u64 old_val;
770         int nitems;
771         int ret;
772         int i;
773         int del_s, del_nr;
774
775         btrfs_init_path(&path);
776         root = root->fs_info->extent_root;
777
778         key.objectid = start;
779         key.offset = 0;
780         key.type = BTRFS_EXTENT_ITEM_KEY;
781 again:
782         ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
783         if (ret < 0)
784                 goto err;
785         else if (ret > 0)
786                 ret = 0;
787
788         leaf = path.nodes[0];
789         nitems = btrfs_header_nritems(leaf);
790         if (!nitems) {
791                 /* The tree is empty. */
792                 ret = 0;
793                 goto err;
794         }
795
796         if (path.slots[0] >= nitems) {
797                 ret = btrfs_next_leaf(root, &path);
798                 if (ret < 0)
799                         goto err;
800                 if (ret > 0) {
801                         ret = 0;
802                         goto err;
803                 }
804                 leaf = path.nodes[0];
805                 btrfs_item_key_to_cpu(leaf, &key, 0);
806                 if (key.objectid >= end)
807                         goto err;
808                 btrfs_release_path(root, &path);
809                 goto again;
810         }
811
812         del_nr = 0;
813         del_s = -1;
814         for (i = path.slots[0]; i < nitems; i++) {
815                 btrfs_item_key_to_cpu(leaf, &key, i);
816                 if (key.objectid >= end)
817                         break;
818
819                 if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
820                         if (del_nr == 0)
821                                 continue;
822                         else
823                                 break;
824                 }
825
826                 if (del_s == -1)
827                         del_s = i;
828                 del_nr++;
829                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
830                     key.type == BTRFS_METADATA_ITEM_KEY) {
831                         old_val = btrfs_super_bytes_used(fs_info->super_copy);
832                         if (key.type == BTRFS_METADATA_ITEM_KEY)
833                                 old_val += root->leafsize;
834                         else
835                                 old_val += key.offset;
836                         btrfs_set_super_bytes_used(fs_info->super_copy,
837                                                    old_val);
838                 }
839         }
840
841         if (del_nr) {
842                 ret = btrfs_del_items(trans, root, &path, del_s, del_nr);
843                 if (ret)
844                         goto err;
845         }
846
847         if (key.objectid < end) {
848                 if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
849                         key.objectid += root->sectorsize;
850                         key.type = BTRFS_EXTENT_ITEM_KEY;
851                         key.offset = 0;
852                 }
853                 btrfs_release_path(root, &path);
854                 goto again;
855         }
856 err:
857         btrfs_release_path(root, &path);
858         return ret;
859 }
860
861 static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
862                                        struct btrfs_root *root,
863                                        struct block_group_record *bg)
864 {
865         struct btrfs_block_group_cache *cache;
866         struct btrfs_fs_info *info;
867         u64 start;
868         u64 end;
869
870         info = root->fs_info;
871         cache = btrfs_lookup_block_group(info, bg->objectid);
872         if (!cache)
873                 return -ENOENT;
874
875         start = cache->key.objectid;
876         end = start + cache->key.offset - 1;
877
878         set_extent_bits(&info->block_group_cache, start, end,
879                         BLOCK_GROUP_DIRTY, GFP_NOFS);
880         set_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS);
881
882         btrfs_set_block_group_used(&cache->item, 0);
883
884         return 0;
885 }
886
887 static int remove_chunk_extent_item(struct btrfs_trans_handle *trans,
888                                     struct recover_control *rc,
889                                     struct btrfs_root *root)
890 {
891         struct chunk_record *chunk;
892         int ret = 0;
893
894         list_for_each_entry(chunk, &rc->good_chunks, list) {
895                 if (!(chunk->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
896                         continue;
897                 ret = block_group_remove_all_extent_items(trans, root,
898                                                           chunk->bg_rec);
899                 if (ret)
900                         return ret;
901
902                 ret = block_group_free_all_extent(trans, root, chunk->bg_rec);
903                 if (ret)
904                         return ret;
905         }
906         return ret;
907 }
908
909 static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
910                                 struct recover_control *rc,
911                                 struct btrfs_root *root)
912 {
913         u64 min_devid = -1;
914         struct btrfs_device *dev;
915         struct extent_buffer *cow;
916         struct btrfs_disk_key disk_key;
917         int ret = 0;
918
919         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
920                 if (min_devid > dev->devid)
921                         min_devid = dev->devid;
922         }
923         disk_key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
924         disk_key.type = BTRFS_DEV_ITEM_KEY;
925         disk_key.offset = min_devid;
926
927         cow = btrfs_alloc_free_block(trans, root, root->sectorsize,
928                                      BTRFS_CHUNK_TREE_OBJECTID,
929                                      &disk_key, 0, 0, 0);
930         btrfs_set_header_bytenr(cow, cow->start);
931         btrfs_set_header_generation(cow, trans->transid);
932         btrfs_set_header_nritems(cow, 0);
933         btrfs_set_header_level(cow, 0);
934         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
935         btrfs_set_header_owner(cow, BTRFS_CHUNK_TREE_OBJECTID);
936         write_extent_buffer(cow, root->fs_info->fsid,
937                         (unsigned long)btrfs_header_fsid(cow),
938                         BTRFS_FSID_SIZE);
939
940         write_extent_buffer(cow, root->fs_info->chunk_tree_uuid,
941                         (unsigned long)btrfs_header_chunk_tree_uuid(cow),
942                         BTRFS_UUID_SIZE);
943
944         root->node = cow;
945         btrfs_mark_buffer_dirty(cow);
946
947         return ret;
948 }
949
950 static int __rebuild_device_items(struct btrfs_trans_handle *trans,
951                                   struct recover_control *rc,
952                                   struct btrfs_root *root)
953 {
954         struct btrfs_device *dev;
955         struct btrfs_key key;
956         struct btrfs_dev_item *dev_item;
957         int ret = 0;
958
959         dev_item = malloc(sizeof(struct btrfs_dev_item));
960         if (!dev_item)
961                 return -ENOMEM;
962
963         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
964                 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
965                 key.type = BTRFS_DEV_ITEM_KEY;
966                 key.offset = dev->devid;
967
968                 btrfs_set_stack_device_generation(dev_item, 0);
969                 btrfs_set_stack_device_type(dev_item, dev->type);
970                 btrfs_set_stack_device_id(dev_item, dev->devid);
971                 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
972                 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
973                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
974                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
975                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
976                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
977                 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
978
979                 ret = btrfs_insert_item(trans, root, &key,
980                                         dev_item, sizeof(*dev_item));
981         }
982
983         free(dev_item);
984         return ret;
985 }
986
987 static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
988                                  struct recover_control *rc,
989                                  struct btrfs_root *root)
990 {
991         struct btrfs_key key;
992         struct btrfs_chunk *chunk = NULL;
993         struct btrfs_root *chunk_root;
994         struct chunk_record *chunk_rec;
995         int ret;
996
997         chunk_root = root->fs_info->chunk_root;
998
999         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1000                 chunk = create_chunk_item(chunk_rec);
1001                 if (!chunk)
1002                         return -ENOMEM;
1003
1004                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1005                 key.type = BTRFS_CHUNK_ITEM_KEY;
1006                 key.offset = chunk_rec->offset;
1007
1008                 ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
1009                                 btrfs_chunk_item_size(chunk->num_stripes));
1010                 free(chunk);
1011                 if (ret)
1012                         return ret;
1013         }
1014         return 0;
1015 }
1016
1017 static int rebuild_chunk_tree(struct btrfs_trans_handle *trans,
1018                               struct recover_control *rc,
1019                               struct btrfs_root *root)
1020 {
1021         int ret = 0;
1022
1023         root = root->fs_info->chunk_root;
1024
1025         ret = __rebuild_chunk_root(trans, rc, root);
1026         if (ret)
1027                 return ret;
1028
1029         ret = __rebuild_device_items(trans, rc, root);
1030         if (ret)
1031                 return ret;
1032
1033         ret = __rebuild_chunk_items(trans, rc, root);
1034
1035         return ret;
1036 }
1037
1038 static int rebuild_sys_array(struct recover_control *rc,
1039                              struct btrfs_root *root)
1040 {
1041         struct btrfs_chunk *chunk;
1042         struct btrfs_key key;
1043         struct chunk_record *chunk_rec;
1044         int ret = 0;
1045         u16 num_stripes;
1046
1047         btrfs_set_super_sys_array_size(root->fs_info->super_copy, 0);
1048
1049         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1050                 if (!(chunk_rec->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1051                         continue;
1052
1053                 num_stripes = chunk_rec->num_stripes;
1054                 chunk = create_chunk_item(chunk_rec);
1055                 if (!chunk) {
1056                         ret = -ENOMEM;
1057                         break;
1058                 }
1059
1060                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1061                 key.type = BTRFS_CHUNK_ITEM_KEY;
1062                 key.offset = chunk_rec->offset;
1063
1064                 ret = btrfs_add_system_chunk(NULL, root, &key, chunk,
1065                                 btrfs_chunk_item_size(num_stripes));
1066                 free(chunk);
1067                 if (ret)
1068                         break;
1069         }
1070         return ret;
1071
1072 }
1073
1074 static struct btrfs_root *
1075 open_ctree_with_broken_chunk(struct recover_control *rc)
1076 {
1077         struct btrfs_fs_info *fs_info;
1078         struct btrfs_super_block *disk_super;
1079         struct extent_buffer *eb;
1080         u32 sectorsize;
1081         u32 nodesize;
1082         u32 leafsize;
1083         u32 stripesize;
1084         int ret;
1085
1086         fs_info = btrfs_new_fs_info(1, BTRFS_SUPER_INFO_OFFSET);
1087         if (!fs_info) {
1088                 fprintf(stderr, "Failed to allocate memory for fs_info\n");
1089                 return ERR_PTR(-ENOMEM);
1090         }
1091
1092         fs_info->fs_devices = rc->fs_devices;
1093         ret = btrfs_open_devices(fs_info->fs_devices, O_RDWR);
1094         if (ret)
1095                 goto out;
1096
1097         disk_super = fs_info->super_copy;
1098         ret = btrfs_read_dev_super(fs_info->fs_devices->latest_bdev,
1099                                    disk_super, fs_info->super_bytenr);
1100         if (ret) {
1101                 fprintf(stderr, "No valid btrfs found\n");
1102                 goto out_devices;
1103         }
1104
1105         memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
1106
1107         ret = btrfs_check_fs_compatibility(disk_super, 1);
1108         if (ret)
1109                 goto out_devices;
1110
1111         nodesize = btrfs_super_nodesize(disk_super);
1112         leafsize = btrfs_super_leafsize(disk_super);
1113         sectorsize = btrfs_super_sectorsize(disk_super);
1114         stripesize = btrfs_super_stripesize(disk_super);
1115
1116         __setup_root(nodesize, leafsize, sectorsize, stripesize,
1117                      fs_info->chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1118
1119         ret = build_device_maps_by_chunk_records(rc, fs_info->chunk_root);
1120         if (ret)
1121                 goto out_cleanup;
1122
1123         ret = btrfs_setup_all_roots(fs_info, 0, 0);
1124         if (ret)
1125                 goto out_failed;
1126
1127         eb = fs_info->tree_root->node;
1128         read_extent_buffer(eb, fs_info->chunk_tree_uuid,
1129                            (unsigned long)btrfs_header_chunk_tree_uuid(eb),
1130                            BTRFS_UUID_SIZE);
1131
1132         return fs_info->fs_root;
1133 out_failed:
1134         btrfs_release_all_roots(fs_info);
1135 out_cleanup:
1136         btrfs_cleanup_all_caches(fs_info);
1137 out_devices:
1138         btrfs_close_devices(fs_info->fs_devices);
1139 out:
1140         btrfs_free_fs_info(fs_info);
1141         return ERR_PTR(ret);
1142 }
1143
1144 static int recover_prepare(struct recover_control *rc, char *path)
1145 {
1146         int ret;
1147         int fd;
1148         struct btrfs_super_block *sb;
1149         struct btrfs_fs_devices *fs_devices;
1150
1151         ret = 0;
1152         fd = open(path, O_RDONLY);
1153         if (fd < 0) {
1154                 fprintf(stderr, "open %s\n error.\n", path);
1155                 return -1;
1156         }
1157
1158         sb = malloc(sizeof(struct btrfs_super_block));
1159         if (!sb) {
1160                 fprintf(stderr, "allocating memory for sb failed.\n");
1161                 ret = -ENOMEM;
1162                 goto fail_close_fd;
1163         }
1164
1165         ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET);
1166         if (ret) {
1167                 fprintf(stderr, "read super block error\n");
1168                 goto fail_free_sb;
1169         }
1170
1171         rc->sectorsize = btrfs_super_sectorsize(sb);
1172         rc->leafsize = btrfs_super_leafsize(sb);
1173         rc->generation = btrfs_super_generation(sb);
1174         rc->chunk_root_generation = btrfs_super_chunk_root_generation(sb);
1175         rc->csum_size = btrfs_super_csum_size(sb);
1176
1177         /* if seed, the result of scanning below will be partial */
1178         if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
1179                 fprintf(stderr, "this device is seed device\n");
1180                 ret = -1;
1181                 goto fail_free_sb;
1182         }
1183
1184         ret = btrfs_scan_fs_devices(fd, path, &fs_devices);
1185         if (ret)
1186                 goto fail_free_sb;
1187
1188         rc->fs_devices = fs_devices;
1189
1190         if (rc->verbose)
1191                 print_all_devices(&rc->fs_devices->devices);
1192
1193 fail_free_sb:
1194         free(sb);
1195 fail_close_fd:
1196         close(fd);
1197         return ret;
1198 }
1199
1200 static int ask_user(char *question, int defval)
1201 {
1202         char answer[5];
1203         char *defstr;
1204         int i;
1205
1206         if (defval == 1)
1207                 defstr = "[Y/n]";
1208         else if (defval == 0)
1209                 defstr = "[y/N]";
1210         else if (defval == -1)
1211                 defstr = "[y/n]";
1212         else
1213                 BUG_ON(1);
1214 again:
1215         printf("%s%s? ", question, defstr);
1216
1217         i = 0;
1218         while (i < 4 && scanf("%c", &answer[i])) {
1219                 if (answer[i] == '\n') {
1220                         answer[i] = '\0';
1221                         break;
1222                 } else if (answer[i] == ' '){
1223                         answer[i] = '\0';
1224                         if (i == 0)
1225                                 continue;
1226                         else
1227                                 break;
1228                 } else if (answer[i] >= 'A' && answer[i] <= 'Z') {
1229                         answer[i] += 'a' - 'A';
1230                 }
1231                 i++;
1232         }
1233         answer[5] = '\0';
1234         __fpurge(stdin);
1235
1236         if (strlen(answer) == 0) {
1237                 if (defval != -1)
1238                         return defval;
1239                 else
1240                         goto again;
1241         }
1242
1243         if (!strcmp(answer, "yes") ||
1244             !strcmp(answer, "y"))
1245                 return 1;
1246
1247         if (!strcmp(answer, "no") ||
1248             !strcmp(answer, "n"))
1249                 return 0;
1250
1251         goto again;
1252 }
1253
1254 static int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
1255 {
1256         int ret = 0;
1257         struct btrfs_root *root = NULL;
1258         struct btrfs_trans_handle *trans;
1259         struct recover_control rc;
1260
1261         init_recover_control(&rc, verbose, yes);
1262
1263         ret = recover_prepare(&rc, path);
1264         if (ret) {
1265                 fprintf(stderr, "recover prepare error\n");
1266                 return ret;
1267         }
1268
1269         ret = scan_devices(&rc);
1270         if (ret) {
1271                 fprintf(stderr, "scan chunk headers error\n");
1272                 goto fail_rc;
1273         }
1274
1275         if (cache_tree_empty(&rc.chunk) &&
1276             cache_tree_empty(&rc.bg.tree) &&
1277             cache_tree_empty(&rc.devext.tree)) {
1278                 fprintf(stderr, "no recoverable chunk\n");
1279                 goto fail_rc;
1280         }
1281
1282         print_scan_result(&rc);
1283
1284         ret = check_chunks(&rc.chunk, &rc.bg, &rc.devext, &rc.good_chunks,
1285                            &rc.bad_chunks, 1);
1286         print_check_result(&rc);
1287         if (ret) {
1288                 if (!list_empty(&rc.bg.block_groups) ||
1289                     !list_empty(&rc.devext.no_chunk_orphans)) {
1290                         fprintf(stderr,
1291                                 "There are some orphan block groups and device extents, we can't repair them now.\n");
1292                         goto fail_rc;
1293                 }
1294                 /*
1295                  * If the chunk is healthy, its block group item and device
1296                  * extent item should be written on the disks. So, it is very
1297                  * likely that the bad chunk is a old one that has been
1298                  * droppped from the fs. Don't deal with them now, we will
1299                  * check it after the fs is opened.
1300                  */
1301         }
1302
1303         root = open_ctree_with_broken_chunk(&rc);
1304         if (IS_ERR(root)) {
1305                 fprintf(stderr, "open with broken chunk error\n");
1306                 ret = PTR_ERR(root);
1307                 goto fail_rc;
1308         }
1309
1310         ret = check_all_chunks_by_metadata(&rc, root);
1311         if (ret) {
1312                 fprintf(stderr, "The chunks in memory can not match the metadata of the fs. Repair failed.\n");
1313                 goto fail_close_ctree;
1314         }
1315
1316         if (!rc.yes) {
1317                 ret = ask_user("We are going to rebuild the chunk tree on disk, it might destroy the old metadata on the disk, Are you sure",
1318                                0);
1319                 if (!ret) {
1320                         ret = BTRFS_CHUNK_TREE_REBUILD_ABORTED;
1321                         goto fail_close_ctree;
1322                 }
1323         }
1324
1325         trans = btrfs_start_transaction(root, 1);
1326         ret = remove_chunk_extent_item(trans, &rc, root);
1327         BUG_ON(ret);
1328
1329         ret = rebuild_chunk_tree(trans, &rc, root);
1330         BUG_ON(ret);
1331
1332         ret = rebuild_sys_array(&rc, root);
1333         BUG_ON(ret);
1334
1335         btrfs_commit_transaction(trans, root);
1336 fail_close_ctree:
1337         close_ctree(root);
1338 fail_rc:
1339         free_recover_control(&rc);
1340         return ret;
1341 }
1342
1343 const char * const cmd_chunk_recover_usage[] = {
1344         "btrfs chunk-recover [options] <device>",
1345         "Recover the chunk tree by scaning the devices one by one.",
1346         "",
1347         "-y     Assume an answer of `yes' to all questions",
1348         "-v     Verbose mode",
1349         "-h     Help",
1350         NULL
1351 };
1352
1353 int cmd_chunk_recover(int argc, char *argv[])
1354 {
1355         int ret = 0;
1356         char *file;
1357         int yes = 0;
1358         int verbose = 0;
1359
1360         while (1) {
1361                 int c = getopt(argc, argv, "yvh");
1362                 if (c < 0)
1363                         break;
1364                 switch (c) {
1365                 case 'y':
1366                         yes = 1;
1367                         break;
1368                 case 'v':
1369                         verbose = 1;
1370                         break;
1371                 case 'h':
1372                 default:
1373                         usage(cmd_chunk_recover_usage);
1374                 }
1375         }
1376
1377         argc = argc - optind;
1378         if (argc == 0)
1379                 usage(cmd_chunk_recover_usage);
1380
1381         file = argv[optind];
1382
1383         ret = check_mounted(file);
1384         if (ret) {
1385                 fprintf(stderr, "the device is busy\n");
1386                 return ret;
1387         }
1388
1389         ret = btrfs_recover_chunk_tree(file, verbose, yes);
1390         if (!ret) {
1391                 fprintf(stdout, "Recover the chunk tree successfully.\n");
1392         } else if (ret == BTRFS_CHUNK_TREE_REBUILD_ABORTED) {
1393                 ret = 0;
1394                 fprintf(stdout, "Abort to rebuild the on-disk chunk tree.\n");
1395         } else {
1396                 fprintf(stdout, "Fail to recover the chunk tree.\n");
1397         }
1398         return ret;
1399 }