btrfs-progs: scan devices in parallel for chunk-recover
[platform/upstream/btrfs-progs.git] / chunk-recover.c
1 /*
2  * Copyright (C) 2013 Fujitsu.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #define _XOPEN_SOURCE 500
19 #define _GNU_SOURCE
20
21 #include <stdio.h>
22 #include <stdio_ext.h>
23 #include <stdlib.h>
24 #include <sys/types.h>
25 #include <sys/stat.h>
26 #include <fcntl.h>
27 #include <unistd.h>
28 #include <uuid/uuid.h>
29 #include <pthread.h>
30
31 #include "kerncompat.h"
32 #include "list.h"
33 #include "radix-tree.h"
34 #include "ctree.h"
35 #include "extent-cache.h"
36 #include "disk-io.h"
37 #include "volumes.h"
38 #include "transaction.h"
39 #include "crc32c.h"
40 #include "utils.h"
41 #include "version.h"
42 #include "btrfsck.h"
43 #include "commands.h"
44
45 #define BTRFS_STRIPE_LEN                        (64 * 1024)
46 #define BTRFS_NUM_MIRRORS                       2
47
/*
 * Global state for chunk recovery: filesystem geometry taken from the
 * superblock, the caches built while scanning all devices, and the lists
 * of chunks classified by the subsequent checks.
 */
struct recover_control {
        int verbose;
        int yes;                        /* answer "yes" to all questions */

        u16 csum_size;
        u32 sectorsize;
        u32 leafsize;
        u64 generation;                 /* tree root generation */
        u64 chunk_root_generation;      /* chunk tree root generation */

        struct btrfs_fs_devices *fs_devices;

        struct cache_tree chunk;        /* chunk items found on disk */
        struct block_group_tree bg;     /* block group items */
        struct device_extent_tree devext;       /* device extent items */
        struct cache_tree eb_cache;     /* scanned tree blocks (extent_record) */

        struct list_head good_chunks;
        struct list_head bad_chunks;
        struct list_head unrepaired_chunks;
        /* serializes cache updates from the per-device scan threads */
        pthread_mutex_t rc_lock;
};
70
/*
 * One tree block found during the device scan, keyed by bytenr/size in
 * eb_cache.  Every device/offset pair the block was seen at is recorded
 * as a mirror.
 */
struct extent_record {
        struct cache_extent cache;      /* start = bytenr, size = block len */
        u64 generation;
        u8 csum[BTRFS_CSUM_SIZE];
        struct btrfs_device *devices[BTRFS_NUM_MIRRORS];
        u64 offsets[BTRFS_NUM_MIRRORS];
        int nmirrors;                   /* used entries in devices/offsets */
};
79
/* Per-thread argument for scan_one_device(). */
struct device_scan {
        struct recover_control *rc;
        struct btrfs_device *dev;
        int fd;         /* read-only fd for dev, closed by the scan thread */
};
85
86 static struct extent_record *btrfs_new_extent_record(struct extent_buffer *eb)
87 {
88         struct extent_record *rec;
89
90         rec = malloc(sizeof(*rec));
91         if (!rec) {
92                 fprintf(stderr, "Fail to allocate memory for extent record.\n");
93                 exit(1);
94         }
95
96         memset(rec, 0, sizeof(*rec));
97         rec->cache.start = btrfs_header_bytenr(eb);
98         rec->cache.size = eb->len;
99         rec->generation = btrfs_header_generation(eb);
100         read_extent_buffer(eb, rec->csum, (unsigned long)btrfs_header_csum(eb),
101                            BTRFS_CSUM_SIZE);
102         return rec;
103 }
104
/*
 * Merge the tree block @eb, found at @offset on @device, into @eb_cache.
 *
 * If a copy of the same block range is already cached:
 *   - a newer cached generation wins and the new record is dropped;
 *   - an equal generation must match exactly (start/size/csum), in which
 *     case @device/@offset are recorded as one more mirror; a mismatch at
 *     the same generation is an irreconcilable conflict (-EEXIST);
 *   - an older cached generation is evicted and the lookup retried, since
 *     the new block may overlap several stale entries.
 *
 * Returns 0 on success, -EEXIST on conflict.
 */
static int process_extent_buffer(struct cache_tree *eb_cache,
                                 struct extent_buffer *eb,
                                 struct btrfs_device *device, u64 offset)
{
        struct extent_record *rec;
        struct extent_record *exist;
        struct cache_extent *cache;
        int ret = 0;

        rec = btrfs_new_extent_record(eb);
        if (!rec->cache.size)
                goto free_out;
again:
        cache = lookup_cache_extent(eb_cache,
                                    rec->cache.start,
                                    rec->cache.size);
        if (cache) {
                exist = container_of(cache, struct extent_record, cache);

                if (exist->generation > rec->generation)
                        goto free_out;
                if (exist->generation == rec->generation) {
                        if (exist->cache.start != rec->cache.start ||
                            exist->cache.size != rec->cache.size ||
                            memcmp(exist->csum, rec->csum, BTRFS_CSUM_SIZE)) {
                                ret = -EEXIST;
                        } else {
                                /* same block on another device: a mirror */
                                BUG_ON(exist->nmirrors >= BTRFS_NUM_MIRRORS);
                                exist->devices[exist->nmirrors] = device;
                                exist->offsets[exist->nmirrors] = offset;
                                exist->nmirrors++;
                        }
                        goto free_out;
                }
                /* cached copy is older: evict it and search again */
                remove_cache_extent(eb_cache, cache);
                free(exist);
                goto again;
        }

        rec->devices[0] = device;
        rec->offsets[0] = offset;
        rec->nmirrors++;
        ret = insert_cache_extent(eb_cache, &rec->cache);
        BUG_ON(ret);
out:
        return ret;
free_out:
        free(rec);
        goto out;
}
155
/* Destructor for one eb_cache entry: frees the containing extent_record. */
static void free_extent_record(struct cache_extent *cache)
{
        struct extent_record *er;

        er = container_of(cache, struct extent_record, cache);
        free(er);
}

/* Generates free_extent_record_tree(), used by free_recover_control(). */
FREE_EXTENT_CACHE_BASED_TREE(extent_record, free_extent_record);
166 static struct btrfs_chunk *create_chunk_item(struct chunk_record *record)
167 {
168         struct btrfs_chunk *ret;
169         struct btrfs_stripe *chunk_stripe;
170         int i;
171
172         if (!record || record->num_stripes == 0)
173                 return NULL;
174         ret = malloc(btrfs_chunk_item_size(record->num_stripes));
175         if (!ret)
176                 return NULL;
177         btrfs_set_stack_chunk_length(ret, record->length);
178         btrfs_set_stack_chunk_owner(ret, record->owner);
179         btrfs_set_stack_chunk_stripe_len(ret, record->stripe_len);
180         btrfs_set_stack_chunk_type(ret, record->type_flags);
181         btrfs_set_stack_chunk_io_align(ret, record->io_align);
182         btrfs_set_stack_chunk_io_width(ret, record->io_width);
183         btrfs_set_stack_chunk_sector_size(ret, record->sector_size);
184         btrfs_set_stack_chunk_num_stripes(ret, record->num_stripes);
185         btrfs_set_stack_chunk_sub_stripes(ret, record->sub_stripes);
186         for (i = 0, chunk_stripe = &ret->stripe; i < record->num_stripes;
187              i++, chunk_stripe++) {
188                 btrfs_set_stack_stripe_devid(chunk_stripe,
189                                 record->stripes[i].devid);
190                 btrfs_set_stack_stripe_offset(chunk_stripe,
191                                 record->stripes[i].offset);
192                 memcpy(chunk_stripe->dev_uuid, record->stripes[i].dev_uuid,
193                        BTRFS_UUID_SIZE);
194         }
195         return ret;
196 }
197
198 static void init_recover_control(struct recover_control *rc, int verbose,
199                 int yes)
200 {
201         memset(rc, 0, sizeof(struct recover_control));
202         cache_tree_init(&rc->chunk);
203         cache_tree_init(&rc->eb_cache);
204         block_group_tree_init(&rc->bg);
205         device_extent_tree_init(&rc->devext);
206
207         INIT_LIST_HEAD(&rc->good_chunks);
208         INIT_LIST_HEAD(&rc->bad_chunks);
209         INIT_LIST_HEAD(&rc->unrepaired_chunks);
210
211         rc->verbose = verbose;
212         rc->yes = yes;
213         pthread_mutex_init(&rc->rc_lock, NULL);
214 }
215
/*
 * Release every cache/tree owned by @rc and destroy its lock.
 * NOTE(review): the records in these trees are cross-linked via list
 * heads (chunks reference device extents and block groups), so the free
 * order is kept exactly as written — do not reorder without checking the
 * tree destructors.
 */
static void free_recover_control(struct recover_control *rc)
{
        free_block_group_tree(&rc->bg);
        free_chunk_cache_tree(&rc->chunk);
        free_device_extent_tree(&rc->devext);
        free_extent_record_tree(&rc->eb_cache);
        pthread_mutex_destroy(&rc->rc_lock);
}
224
/*
 * Insert the block group item at @slot of @leaf into @bg_cache,
 * resolving conflicts with already-cached overlapping records by
 * generation: newer wins, equal generations must be identical, older
 * cached records are evicted and the lookup retried.
 *
 * Returns 0 on success, -EEXIST on an irreconcilable conflict.
 */
static int process_block_group_item(struct block_group_tree *bg_cache,
                                    struct extent_buffer *leaf,
                                    struct btrfs_key *key, int slot)
{
        struct block_group_record *rec;
        struct block_group_record *exist;
        struct cache_extent *cache;
        int ret = 0;

        rec = btrfs_new_block_group_record(leaf, key, slot);
        if (!rec->cache.size)
                goto free_out;
again:
        cache = lookup_cache_extent(&bg_cache->tree,
                                    rec->cache.start,
                                    rec->cache.size);
        if (cache) {
                exist = container_of(cache, struct block_group_record, cache);

                /* check the generation and replace if needed */
                if (exist->generation > rec->generation)
                        goto free_out;
                if (exist->generation == rec->generation) {
                        int offset = offsetof(struct block_group_record,
                                              generation);
                        /*
                         * According to the current kernel code, the following
                         * case is impossible, or there is something wrong in
                         * the kernel code.
                         */
                        if (memcmp(((void *)exist) + offset,
                                   ((void *)rec) + offset,
                                   sizeof(*rec) - offset))
                                ret = -EEXIST;
                        goto free_out;
                }
                remove_cache_extent(&bg_cache->tree, cache);
                list_del_init(&exist->list);
                free(exist);
                /*
                 * We must do the search again to avoid the following cache:
                 * /--old bg 1--//--old bg 2--/
                 *        /--new bg--/
                 */
                goto again;
        }

        ret = insert_block_group_record(bg_cache, rec);
        BUG_ON(ret);
out:
        return ret;
free_out:
        free(rec);
        goto out;
}
280
/*
 * Insert the chunk item at @slot of @leaf into @chunk_cache, resolving
 * conflicts with already-cached overlapping records by generation:
 * newer wins, equal generations must be identical (including the stripe
 * array), older cached records are evicted and the lookup retried.
 *
 * Returns 0 on success, -EEXIST on an irreconcilable conflict.
 */
static int process_chunk_item(struct cache_tree *chunk_cache,
                              struct extent_buffer *leaf, struct btrfs_key *key,
                              int slot)
{
        struct chunk_record *rec;
        struct chunk_record *exist;
        struct cache_extent *cache;
        int ret = 0;

        rec = btrfs_new_chunk_record(leaf, key, slot);
        if (!rec->cache.size)
                goto free_out;
again:
        cache = lookup_cache_extent(chunk_cache, rec->offset, rec->length);
        if (cache) {
                exist = container_of(cache, struct chunk_record, cache);

                if (exist->generation > rec->generation)
                        goto free_out;
                if (exist->generation == rec->generation) {
                        /* compare everything from 'generation' onwards,
                         * including the variable-size stripe array */
                        int num_stripes = rec->num_stripes;
                        int rec_size = btrfs_chunk_record_size(num_stripes);
                        int offset = offsetof(struct chunk_record, generation);

                        if (exist->num_stripes != rec->num_stripes ||
                            memcmp(((void *)exist) + offset,
                                   ((void *)rec) + offset,
                                   rec_size - offset))
                                ret = -EEXIST;
                        goto free_out;
                }
                /* cached record is older: evict it and search again */
                remove_cache_extent(chunk_cache, cache);
                free(exist);
                goto again;
        }
        ret = insert_cache_extent(chunk_cache, &rec->cache);
        BUG_ON(ret);
out:
        return ret;
free_out:
        free(rec);
        goto out;
}
324
/*
 * Insert the device extent item at @slot of @leaf into @devext_cache,
 * resolving conflicts with an already-cached record for the same
 * (devid, start, size) by generation: newer wins, equal generations must
 * be identical, an older cached record is unlinked, freed and the lookup
 * retried.
 *
 * Returns 0 on success, -EEXIST on an irreconcilable conflict.
 */
static int process_device_extent_item(struct device_extent_tree *devext_cache,
                                      struct extent_buffer *leaf,
                                      struct btrfs_key *key, int slot)
{
        struct device_extent_record *rec;
        struct device_extent_record *exist;
        struct cache_extent *cache;
        int ret = 0;

        rec = btrfs_new_device_extent_record(leaf, key, slot);
        if (!rec->cache.size)
                goto free_out;
again:
        cache = lookup_cache_extent2(&devext_cache->tree,
                                     rec->cache.objectid,
                                     rec->cache.start,
                                     rec->cache.size);
        if (cache) {
                exist = container_of(cache, struct device_extent_record, cache);
                if (exist->generation > rec->generation)
                        goto free_out;
                if (exist->generation == rec->generation) {
                        int offset = offsetof(struct device_extent_record,
                                              generation);
                        if (memcmp(((void *)exist) + offset,
                                   ((void *)rec) + offset,
                                   sizeof(*rec) - offset))
                                ret = -EEXIST;
                        goto free_out;
                }
                /* cached record is older: unlink from both lists and retry */
                remove_cache_extent(&devext_cache->tree, cache);
                list_del_init(&exist->chunk_list);
                list_del_init(&exist->device_list);
                free(exist);
                goto again;
        }

        ret = insert_device_extent_record(devext_cache, rec);
        BUG_ON(ret);
out:
        return ret;
free_out:
        free(rec);
        goto out;
}
370
371 static void print_block_group_info(struct block_group_record *rec, char *prefix)
372 {
373         if (prefix)
374                 printf("%s", prefix);
375         printf("Block Group: start = %llu, len = %llu, flag = %llx\n",
376                rec->objectid, rec->offset, rec->flags);
377 }
378
379 static void print_block_group_tree(struct block_group_tree *tree)
380 {
381         struct cache_extent *cache;
382         struct block_group_record *rec;
383
384         printf("All Block Groups:\n");
385         for (cache = first_cache_extent(&tree->tree); cache;
386              cache = next_cache_extent(cache)) {
387                 rec = container_of(cache, struct block_group_record, cache);
388                 print_block_group_info(rec, "\t");
389         }
390         printf("\n");
391 }
392
393 static void print_stripe_info(struct stripe *data, char *prefix1, char *prefix2,
394                               int index)
395 {
396         if (prefix1)
397                 printf("%s", prefix1);
398         if (prefix2)
399                 printf("%s", prefix2);
400         printf("[%2d] Stripe: devid = %llu, offset = %llu\n",
401                index, data->devid, data->offset);
402 }
403
404 static void print_chunk_self_info(struct chunk_record *rec, char *prefix)
405 {
406         int i;
407
408         if (prefix)
409                 printf("%s", prefix);
410         printf("Chunk: start = %llu, len = %llu, type = %llx, num_stripes = %u\n",
411                rec->offset, rec->length, rec->type_flags, rec->num_stripes);
412         if (prefix)
413                 printf("%s", prefix);
414         printf("    Stripes list:\n");
415         for (i = 0; i < rec->num_stripes; i++)
416                 print_stripe_info(&rec->stripes[i], prefix, "    ", i);
417 }
418
419 static void print_chunk_tree(struct cache_tree *tree)
420 {
421         struct cache_extent *n;
422         struct chunk_record *entry;
423
424         printf("All Chunks:\n");
425         for (n = first_cache_extent(tree); n;
426              n = next_cache_extent(n)) {
427                 entry = container_of(n, struct chunk_record, cache);
428                 print_chunk_self_info(entry, "\t");
429         }
430         printf("\n");
431 }
432
433 static void print_device_extent_info(struct device_extent_record *rec,
434                                      char *prefix)
435 {
436         if (prefix)
437                 printf("%s", prefix);
438         printf("Device extent: devid = %llu, start = %llu, len = %llu, chunk offset = %llu\n",
439                rec->objectid, rec->offset, rec->length, rec->chunk_offset);
440 }
441
442 static void print_device_extent_tree(struct device_extent_tree *tree)
443 {
444         struct cache_extent *n;
445         struct device_extent_record *entry;
446
447         printf("All Device Extents:\n");
448         for (n = first_cache_extent(&tree->tree); n;
449              n = next_cache_extent(n)) {
450                 entry = container_of(n, struct device_extent_record, cache);
451                 print_device_extent_info(entry, "\t");
452         }
453         printf("\n");
454 }
455
456 static void print_device_info(struct btrfs_device *device, char *prefix)
457 {
458         if (prefix)
459                 printf("%s", prefix);
460         printf("Device: id = %llu, name = %s\n",
461                device->devid, device->name);
462 }
463
464 static void print_all_devices(struct list_head *devices)
465 {
466         struct btrfs_device *dev;
467
468         printf("All Devices:\n");
469         list_for_each_entry(dev, devices, dev_list)
470                 print_device_info(dev, "\t");
471         printf("\n");
472 }
473
474 static void print_scan_result(struct recover_control *rc)
475 {
476         if (!rc->verbose)
477                 return;
478
479         printf("DEVICE SCAN RESULT:\n");
480         printf("Filesystem Information:\n");
481         printf("\tsectorsize: %d\n", rc->sectorsize);
482         printf("\tleafsize: %d\n", rc->leafsize);
483         printf("\ttree root generation: %llu\n", rc->generation);
484         printf("\tchunk root generation: %llu\n", rc->chunk_root_generation);
485         printf("\n");
486
487         print_all_devices(&rc->fs_devices->devices);
488         print_block_group_tree(&rc->bg);
489         print_chunk_tree(&rc->chunk);
490         print_device_extent_tree(&rc->devext);
491 }
492
/*
 * Print a chunk together with its associated block group record (if any)
 * and the device extents backing it, each line led by @prefix.
 */
static void print_chunk_info(struct chunk_record *chunk, char *prefix)
{
        struct device_extent_record *devext;
        int i;

        print_chunk_self_info(chunk, prefix);
        if (prefix)
                printf("%s", prefix);
        if (chunk->bg_rec)
                print_block_group_info(chunk->bg_rec, "    ");
        else
                printf("    No block group.\n");
        if (prefix)
                printf("%s", prefix);
        if (list_empty(&chunk->dextents)) {
                printf("    No device extent.\n");
        } else {
                printf("    Device extent list:\n");
                i = 0;
                list_for_each_entry(devext, &chunk->dextents, chunk_list) {
                        if (prefix)
                                printf("%s", prefix);
                        printf("%s[%2d]", "        ", i);
                        print_device_extent_info(devext, NULL);
                        i++;
                }
        }
}
521
522 static void print_check_result(struct recover_control *rc)
523 {
524         struct chunk_record *chunk;
525         struct block_group_record *bg;
526         struct device_extent_record *devext;
527         int total = 0;
528         int good = 0;
529         int bad = 0;
530
531         if (!rc->verbose)
532                 return;
533
534         printf("CHECK RESULT:\n");
535         printf("Healthy Chunks:\n");
536         list_for_each_entry(chunk, &rc->good_chunks, list) {
537                 print_chunk_info(chunk, "  ");
538                 good++;
539                 total++;
540         }
541         printf("Bad Chunks:\n");
542         list_for_each_entry(chunk, &rc->bad_chunks, list) {
543                 print_chunk_info(chunk, "  ");
544                 bad++;
545                 total++;
546         }
547         printf("\n");
548         printf("Total Chunks:\t%d\n", total);
549         printf("  Heathy:\t%d\n", good);
550         printf("  Bad:\t%d\n", bad);
551
552         printf("\n");
553         printf("Orphan Block Groups:\n");
554         list_for_each_entry(bg, &rc->bg.block_groups, list)
555                 print_block_group_info(bg, "  ");
556
557         printf("\n");
558         printf("Orphan Device Extents:\n");
559         list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list)
560                 print_device_extent_info(devext, "  ");
561 }
562
563 static int check_chunk_by_metadata(struct recover_control *rc,
564                                    struct btrfs_root *root,
565                                    struct chunk_record *chunk, int bg_only)
566 {
567         int ret;
568         int i;
569         int slot;
570         struct btrfs_path path;
571         struct btrfs_key key;
572         struct btrfs_root *dev_root;
573         struct stripe *stripe;
574         struct btrfs_dev_extent *dev_extent;
575         struct btrfs_block_group_item *bg_ptr;
576         struct extent_buffer *l;
577
578         btrfs_init_path(&path);
579
580         if (bg_only)
581                 goto bg_check;
582
583         dev_root = root->fs_info->dev_root;
584         for (i = 0; i < chunk->num_stripes; i++) {
585                 stripe = &chunk->stripes[i];
586
587                 key.objectid = stripe->devid;
588                 key.offset = stripe->offset;
589                 key.type = BTRFS_DEV_EXTENT_KEY;
590
591                 ret = btrfs_search_slot(NULL, dev_root, &key, &path, 0, 0);
592                 if (ret < 0) {
593                         fprintf(stderr, "Search device extent failed(%d)\n",
594                                 ret);
595                         btrfs_release_path(&path);
596                         return ret;
597                 } else if (ret > 0) {
598                         if (rc->verbose)
599                                 fprintf(stderr,
600                                         "No device extent[%llu, %llu]\n",
601                                         stripe->devid, stripe->offset);
602                         btrfs_release_path(&path);
603                         return -ENOENT;
604                 }
605                 l = path.nodes[0];
606                 slot = path.slots[0];
607                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
608                 if (chunk->offset !=
609                     btrfs_dev_extent_chunk_offset(l, dev_extent)) {
610                         if (rc->verbose)
611                                 fprintf(stderr,
612                                         "Device tree unmatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
613                                         btrfs_dev_extent_chunk_offset(l,
614                                                                 dev_extent),
615                                         btrfs_dev_extent_length(l, dev_extent),
616                                         chunk->offset, chunk->length);
617                         btrfs_release_path(&path);
618                         return -ENOENT;
619                 }
620                 btrfs_release_path(&path);
621         }
622
623 bg_check:
624         key.objectid = chunk->offset;
625         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
626         key.offset = chunk->length;
627
628         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, &path,
629                                 0, 0);
630         if (ret < 0) {
631                 fprintf(stderr, "Search block group failed(%d)\n", ret);
632                 btrfs_release_path(&path);
633                 return ret;
634         } else if (ret > 0) {
635                 if (rc->verbose)
636                         fprintf(stderr, "No block group[%llu, %llu]\n",
637                                 key.objectid, key.offset);
638                 btrfs_release_path(&path);
639                 return -ENOENT;
640         }
641
642         l = path.nodes[0];
643         slot = path.slots[0];
644         bg_ptr = btrfs_item_ptr(l, slot, struct btrfs_block_group_item);
645         if (chunk->type_flags != btrfs_disk_block_group_flags(l, bg_ptr)) {
646                 if (rc->verbose)
647                         fprintf(stderr,
648                                 "Chunk[%llu, %llu]'s type(%llu) is differemt with Block Group's type(%llu)\n",
649                                 chunk->offset, chunk->length, chunk->type_flags,
650                                 btrfs_disk_block_group_flags(l, bg_ptr));
651                 btrfs_release_path(&path);
652                 return -ENOENT;
653         }
654         btrfs_release_path(&path);
655         return 0;
656 }
657
/*
 * Cross-check every recovered chunk against the on-disk metadata trees.
 *
 * Good chunks get a full check (device extents + block group); chunks we
 * could not repair and chunks already marked bad get a block-group-only
 * check.  Any chunk whose metadata is missing (-ENOENT) is moved to the
 * bad list; a bad chunk that unexpectedly *does* have a block group item
 * yields -EINVAL.  The first non-ENOENT error seen is returned.
 */
static int check_all_chunks_by_metadata(struct recover_control *rc,
                                        struct btrfs_root *root)
{
        struct chunk_record *chunk;
        struct chunk_record *next;
        LIST_HEAD(orphan_chunks);
        int ret = 0;
        int err;

        list_for_each_entry_safe(chunk, next, &rc->good_chunks, list) {
                err = check_chunk_by_metadata(rc, root, chunk, 0);
                if (err) {
                        if (err == -ENOENT)
                                list_move_tail(&chunk->list, &orphan_chunks);
                        else if (err && !ret)
                                ret = err;
                }
        }

        list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
                err = check_chunk_by_metadata(rc, root, chunk, 1);
                if (err == -ENOENT)
                        list_move_tail(&chunk->list, &orphan_chunks);
                else if (err && !ret)
                        ret = err;
        }

        list_for_each_entry(chunk, &rc->bad_chunks, list) {
                err = check_chunk_by_metadata(rc, root, chunk, 1);
                /* err == 0 here means a bad chunk has metadata: invalid */
                if (err != -ENOENT && !ret)
                        ret = err ? err : -EINVAL;
        }
        list_splice(&orphan_chunks, &rc->bad_chunks);
        return ret;
}
693
/*
 * Pull the recovery-relevant items (block group, chunk, device extent)
 * out of the tree leaf @leaf into the caches in @rc.
 *
 * Called concurrently from the per-device scan threads, hence the
 * rc_lock held around each cache update.  Stops at and returns the
 * first non-zero error from an item processor.
 */
static int extract_metadata_record(struct recover_control *rc,
                                   struct extent_buffer *leaf)
{
        struct btrfs_key key;
        int ret = 0;
        int i;
        u32 nritems;

        nritems = btrfs_header_nritems(leaf);
        for (i = 0; i < nritems; i++) {
                btrfs_item_key_to_cpu(leaf, &key, i);
                switch (key.type) {
                case BTRFS_BLOCK_GROUP_ITEM_KEY:
                        pthread_mutex_lock(&rc->rc_lock);
                        ret = process_block_group_item(&rc->bg, leaf, &key, i);
                        pthread_mutex_unlock(&rc->rc_lock);
                        break;
                case BTRFS_CHUNK_ITEM_KEY:
                        pthread_mutex_lock(&rc->rc_lock);
                        ret = process_chunk_item(&rc->chunk, leaf, &key, i);
                        pthread_mutex_unlock(&rc->rc_lock);
                        break;
                case BTRFS_DEV_EXTENT_KEY:
                        pthread_mutex_lock(&rc->rc_lock);
                        ret = process_device_extent_item(&rc->devext, leaf,
                                                         &key, i);
                        pthread_mutex_unlock(&rc->rc_lock);
                        break;
                }
                if (ret)
                        break;
        }
        return ret;
}
728
729 static inline int is_super_block_address(u64 offset)
730 {
731         int i;
732
733         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
734                 if (offset == btrfs_sb_offset(i))
735                         return 1;
736         }
737         return 0;
738 }
739
740 static int scan_one_device(void *dev_scan_struct)
741 {
742         struct extent_buffer *buf;
743         u64 bytenr;
744         int ret = 0;
745         struct device_scan *dev_scan = (struct device_scan *)dev_scan_struct;
746         struct recover_control *rc = dev_scan->rc;
747         struct btrfs_device *device = dev_scan->dev;
748         int fd = dev_scan->fd;
749
750         ret = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
751         if (ret)
752                 return 1;
753
754         buf = malloc(sizeof(*buf) + rc->leafsize);
755         if (!buf)
756                 return -ENOMEM;
757         buf->len = rc->leafsize;
758
759         bytenr = 0;
760         while (1) {
761                 if (is_super_block_address(bytenr))
762                         bytenr += rc->sectorsize;
763
764                 if (pread64(fd, buf->data, rc->leafsize, bytenr) <
765                     rc->leafsize)
766                         break;
767
768                 if (memcmp_extent_buffer(buf, rc->fs_devices->fsid,
769                                          btrfs_header_fsid(),
770                                          BTRFS_FSID_SIZE)) {
771                         bytenr += rc->sectorsize;
772                         continue;
773                 }
774
775                 if (verify_tree_block_csum_silent(buf, rc->csum_size)) {
776                         bytenr += rc->sectorsize;
777                         continue;
778                 }
779
780                 pthread_mutex_lock(&rc->rc_lock);
781                 ret = process_extent_buffer(&rc->eb_cache, buf, device, bytenr);
782                 pthread_mutex_unlock(&rc->rc_lock);
783                 if (ret)
784                         goto out;
785
786                 if (btrfs_header_level(buf) != 0)
787                         goto next_node;
788
789                 switch (btrfs_header_owner(buf)) {
790                 case BTRFS_EXTENT_TREE_OBJECTID:
791                 case BTRFS_DEV_TREE_OBJECTID:
792                         /* different tree use different generation */
793                         if (btrfs_header_generation(buf) > rc->generation)
794                                 break;
795                         ret = extract_metadata_record(rc, buf);
796                         if (ret)
797                                 goto out;
798                         break;
799                 case BTRFS_CHUNK_TREE_OBJECTID:
800                         if (btrfs_header_generation(buf) >
801                             rc->chunk_root_generation)
802                                 break;
803                         ret = extract_metadata_record(rc, buf);
804                         if (ret)
805                                 goto out;
806                         break;
807                 }
808 next_node:
809                 bytenr += rc->leafsize;
810         }
811 out:
812         close(fd);
813         free(buf);
814         return ret;
815 }
816
817 static int scan_devices(struct recover_control *rc)
818 {
819         int ret = 0;
820         int fd;
821         struct btrfs_device *dev;
822         struct device_scan *dev_scans;
823         pthread_t *t_scans;
824         int *t_rets;
825         int devnr = 0;
826         int devidx = 0;
827         int cancel_from = 0;
828         int cancel_to = 0;
829         int i;
830
831         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list)
832                 devnr++;
833         dev_scans = (struct device_scan *)malloc(sizeof(struct device_scan)
834                                                  * devnr);
835         if (!dev_scans)
836                 return -ENOMEM;
837         t_scans = (pthread_t *)malloc(sizeof(pthread_t) * devnr);
838         if (!t_scans)
839                 return -ENOMEM;
840         t_rets = (int *)malloc(sizeof(int) * devnr);
841         if (!t_rets)
842                 return -ENOMEM;
843
844         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
845                 fd = open(dev->name, O_RDONLY);
846                 if (fd < 0) {
847                         fprintf(stderr, "Failed to open device %s\n",
848                                 dev->name);
849                         return -1;
850                 }
851                 dev_scans[devidx].rc = rc;
852                 dev_scans[devidx].dev = dev;
853                 dev_scans[devidx].fd = fd;
854                 ret = pthread_create(&t_scans[devidx], NULL,
855                                      (void *)scan_one_device,
856                                      (void *)&dev_scans[devidx]);
857                 if (ret) {
858                         cancel_from = 0;
859                         cancel_to = devidx - 1;
860                         goto out;
861                 }
862                 devidx++;
863         }
864
865         i = 0;
866         while (i < devidx) {
867                 ret = pthread_join(t_scans[i], (void **)&t_rets[i]);
868                 if (ret || t_rets[i]) {
869                         ret = 1;
870                         cancel_from = i + 1;
871                         cancel_to = devnr - 1;
872                         break;
873                 }
874                 i++;
875         }
876 out:
877         while (cancel_from <= cancel_to) {
878                 pthread_cancel(t_scans[cancel_from]);
879                 cancel_from++;
880         }
881         free(dev_scans);
882         free(t_scans);
883         free(t_rets);
884         return !!ret;
885 }
886
887 static int build_device_map_by_chunk_record(struct btrfs_root *root,
888                                             struct chunk_record *chunk)
889 {
890         int ret = 0;
891         int i;
892         u64 devid;
893         u8 uuid[BTRFS_UUID_SIZE];
894         u16 num_stripes;
895         struct btrfs_mapping_tree *map_tree;
896         struct map_lookup *map;
897         struct stripe *stripe;
898
899         map_tree = &root->fs_info->mapping_tree;
900         num_stripes = chunk->num_stripes;
901         map = malloc(btrfs_map_lookup_size(num_stripes));
902         if (!map)
903                 return -ENOMEM;
904         map->ce.start = chunk->offset;
905         map->ce.size = chunk->length;
906         map->num_stripes = num_stripes;
907         map->io_width = chunk->io_width;
908         map->io_align = chunk->io_align;
909         map->sector_size = chunk->sector_size;
910         map->stripe_len = chunk->stripe_len;
911         map->type = chunk->type_flags;
912         map->sub_stripes = chunk->sub_stripes;
913
914         for (i = 0, stripe = chunk->stripes; i < num_stripes; i++, stripe++) {
915                 devid = stripe->devid;
916                 memcpy(uuid, stripe->dev_uuid, BTRFS_UUID_SIZE);
917                 map->stripes[i].physical = stripe->offset;
918                 map->stripes[i].dev = btrfs_find_device(root, devid,
919                                                         uuid, NULL);
920                 if (!map->stripes[i].dev) {
921                         kfree(map);
922                         return -EIO;
923                 }
924         }
925
926         ret = insert_cache_extent(&map_tree->cache_tree, &map->ce);
927         return ret;
928 }
929
930 static int build_device_maps_by_chunk_records(struct recover_control *rc,
931                                               struct btrfs_root *root)
932 {
933         int ret = 0;
934         struct chunk_record *chunk;
935
936         list_for_each_entry(chunk, &rc->good_chunks, list) {
937                 ret = build_device_map_by_chunk_record(root, chunk);
938                 if (ret)
939                         return ret;
940         }
941         return ret;
942 }
943
/*
 * Delete every extent/metadata item that falls inside the block group
 * range [bg->objectid, bg->objectid + bg->offset) from the extent tree,
 * walking it leaf by leaf.  The superblock's bytes_used counter is
 * bumped by the size of each removed item (leafsize for metadata
 * items, key.offset for extent items).
 *
 * Returns 0 on success or a negative errno from the tree operations.
 */
static int block_group_remove_all_extent_items(struct btrfs_trans_handle *trans,
					       struct btrfs_root *root,
					       struct block_group_record *bg)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_path path;
	struct extent_buffer *leaf;
	u64 start = bg->objectid;
	u64 end = bg->objectid + bg->offset;
	u64 old_val;
	int nitems;
	int ret;
	int i;
	int del_s, del_nr;	/* first slot and length of the delete batch */

	btrfs_init_path(&path);
	/* All work happens in the extent tree, not the caller's root. */
	root = root->fs_info->extent_root;

	key.objectid = start;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
	if (ret < 0)
		goto err;
	else if (ret > 0)
		ret = 0;

	leaf = path.nodes[0];
	nitems = btrfs_header_nritems(leaf);
	if (!nitems) {
		/* The tree is empty. */
		ret = 0;
		goto err;
	}

	if (path.slots[0] >= nitems) {
		/* Landed past the last item; move on to the next leaf. */
		ret = btrfs_next_leaf(root, &path);
		if (ret < 0)
			goto err;
		if (ret > 0) {
			ret = 0;
			goto err;
		}
		leaf = path.nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, 0);
		if (key.objectid >= end)
			goto err;
		btrfs_release_path(&path);
		goto again;
	}

	/*
	 * Collect one contiguous run of deletable items in this leaf
	 * (del_s = first slot, del_nr = number of items).
	 */
	del_nr = 0;
	del_s = -1;
	for (i = path.slots[0]; i < nitems; i++) {
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.objectid >= end)
			break;

		if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			/*
			 * The block group item itself is kept; it only
			 * terminates a batch that has already started.
			 */
			if (del_nr == 0)
				continue;
			else
				break;
		}

		if (del_s == -1)
			del_s = i;
		del_nr++;
		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			/* Account the removed item in the superblock. */
			old_val = btrfs_super_bytes_used(fs_info->super_copy);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				old_val += root->leafsize;
			else
				old_val += key.offset;
			btrfs_set_super_bytes_used(fs_info->super_copy,
						   old_val);
		}
	}

	if (del_nr) {
		ret = btrfs_del_items(trans, root, &path, del_s, del_nr);
		if (ret)
			goto err;
	}

	if (key.objectid < end) {
		/*
		 * More items remain in range; if the batch stopped at the
		 * block group item, restart the search just past its key.
		 */
		if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			key.objectid += root->sectorsize;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = 0;
		}
		btrfs_release_path(&path);
		goto again;
	}
err:
	btrfs_release_path(&path);
	return ret;
}
1045
1046 static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
1047                                        struct btrfs_root *root,
1048                                        struct block_group_record *bg)
1049 {
1050         struct btrfs_block_group_cache *cache;
1051         struct btrfs_fs_info *info;
1052         u64 start;
1053         u64 end;
1054
1055         info = root->fs_info;
1056         cache = btrfs_lookup_block_group(info, bg->objectid);
1057         if (!cache)
1058                 return -ENOENT;
1059
1060         start = cache->key.objectid;
1061         end = start + cache->key.offset - 1;
1062
1063         set_extent_bits(&info->block_group_cache, start, end,
1064                         BLOCK_GROUP_DIRTY, GFP_NOFS);
1065         set_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS);
1066
1067         btrfs_set_block_group_used(&cache->item, 0);
1068
1069         return 0;
1070 }
1071
1072 static int remove_chunk_extent_item(struct btrfs_trans_handle *trans,
1073                                     struct recover_control *rc,
1074                                     struct btrfs_root *root)
1075 {
1076         struct chunk_record *chunk;
1077         int ret = 0;
1078
1079         list_for_each_entry(chunk, &rc->good_chunks, list) {
1080                 if (!(chunk->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1081                         continue;
1082                 ret = block_group_remove_all_extent_items(trans, root,
1083                                                           chunk->bg_rec);
1084                 if (ret)
1085                         return ret;
1086
1087                 ret = block_group_free_all_extent(trans, root, chunk->bg_rec);
1088                 if (ret)
1089                         return ret;
1090         }
1091         return ret;
1092 }
1093
1094 static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
1095                                 struct recover_control *rc,
1096                                 struct btrfs_root *root)
1097 {
1098         u64 min_devid = -1;
1099         struct btrfs_device *dev;
1100         struct extent_buffer *cow;
1101         struct btrfs_disk_key disk_key;
1102         int ret = 0;
1103
1104         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
1105                 if (min_devid > dev->devid)
1106                         min_devid = dev->devid;
1107         }
1108         disk_key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1109         disk_key.type = BTRFS_DEV_ITEM_KEY;
1110         disk_key.offset = min_devid;
1111
1112         cow = btrfs_alloc_free_block(trans, root, root->nodesize,
1113                                      BTRFS_CHUNK_TREE_OBJECTID,
1114                                      &disk_key, 0, 0, 0);
1115         btrfs_set_header_bytenr(cow, cow->start);
1116         btrfs_set_header_generation(cow, trans->transid);
1117         btrfs_set_header_nritems(cow, 0);
1118         btrfs_set_header_level(cow, 0);
1119         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1120         btrfs_set_header_owner(cow, BTRFS_CHUNK_TREE_OBJECTID);
1121         write_extent_buffer(cow, root->fs_info->fsid,
1122                         btrfs_header_fsid(), BTRFS_FSID_SIZE);
1123
1124         write_extent_buffer(cow, root->fs_info->chunk_tree_uuid,
1125                         btrfs_header_chunk_tree_uuid(cow),
1126                         BTRFS_UUID_SIZE);
1127
1128         root->node = cow;
1129         btrfs_mark_buffer_dirty(cow);
1130
1131         return ret;
1132 }
1133
1134 static int __rebuild_device_items(struct btrfs_trans_handle *trans,
1135                                   struct recover_control *rc,
1136                                   struct btrfs_root *root)
1137 {
1138         struct btrfs_device *dev;
1139         struct btrfs_key key;
1140         struct btrfs_dev_item *dev_item;
1141         int ret = 0;
1142
1143         dev_item = malloc(sizeof(struct btrfs_dev_item));
1144         if (!dev_item)
1145                 return -ENOMEM;
1146
1147         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
1148                 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1149                 key.type = BTRFS_DEV_ITEM_KEY;
1150                 key.offset = dev->devid;
1151
1152                 btrfs_set_stack_device_generation(dev_item, 0);
1153                 btrfs_set_stack_device_type(dev_item, dev->type);
1154                 btrfs_set_stack_device_id(dev_item, dev->devid);
1155                 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
1156                 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
1157                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
1158                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
1159                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
1160                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
1161                 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
1162
1163                 ret = btrfs_insert_item(trans, root, &key,
1164                                         dev_item, sizeof(*dev_item));
1165         }
1166
1167         free(dev_item);
1168         return ret;
1169 }
1170
1171 static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
1172                                  struct recover_control *rc,
1173                                  struct btrfs_root *root)
1174 {
1175         struct btrfs_key key;
1176         struct btrfs_chunk *chunk = NULL;
1177         struct btrfs_root *chunk_root;
1178         struct chunk_record *chunk_rec;
1179         int ret;
1180
1181         chunk_root = root->fs_info->chunk_root;
1182
1183         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1184                 chunk = create_chunk_item(chunk_rec);
1185                 if (!chunk)
1186                         return -ENOMEM;
1187
1188                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1189                 key.type = BTRFS_CHUNK_ITEM_KEY;
1190                 key.offset = chunk_rec->offset;
1191
1192                 ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
1193                                 btrfs_chunk_item_size(chunk->num_stripes));
1194                 free(chunk);
1195                 if (ret)
1196                         return ret;
1197         }
1198         return 0;
1199 }
1200
1201 static int rebuild_chunk_tree(struct btrfs_trans_handle *trans,
1202                               struct recover_control *rc,
1203                               struct btrfs_root *root)
1204 {
1205         int ret = 0;
1206
1207         root = root->fs_info->chunk_root;
1208
1209         ret = __rebuild_chunk_root(trans, rc, root);
1210         if (ret)
1211                 return ret;
1212
1213         ret = __rebuild_device_items(trans, rc, root);
1214         if (ret)
1215                 return ret;
1216
1217         ret = __rebuild_chunk_items(trans, rc, root);
1218
1219         return ret;
1220 }
1221
1222 static int rebuild_sys_array(struct recover_control *rc,
1223                              struct btrfs_root *root)
1224 {
1225         struct btrfs_chunk *chunk;
1226         struct btrfs_key key;
1227         struct chunk_record *chunk_rec;
1228         int ret = 0;
1229         u16 num_stripes;
1230
1231         btrfs_set_super_sys_array_size(root->fs_info->super_copy, 0);
1232
1233         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1234                 if (!(chunk_rec->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1235                         continue;
1236
1237                 num_stripes = chunk_rec->num_stripes;
1238                 chunk = create_chunk_item(chunk_rec);
1239                 if (!chunk) {
1240                         ret = -ENOMEM;
1241                         break;
1242                 }
1243
1244                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1245                 key.type = BTRFS_CHUNK_ITEM_KEY;
1246                 key.offset = chunk_rec->offset;
1247
1248                 ret = btrfs_add_system_chunk(NULL, root, &key, chunk,
1249                                 btrfs_chunk_item_size(num_stripes));
1250                 free(chunk);
1251                 if (ret)
1252                         break;
1253         }
1254         return ret;
1255
1256 }
1257
/*
 * Open the filesystem without relying on the on-disk chunk tree: the
 * device maps are built from the chunk records recovered by the device
 * scan instead of from the chunk tree items.
 *
 * Returns the fs_root on success or an ERR_PTR() on failure.
 */
static struct btrfs_root *
open_ctree_with_broken_chunk(struct recover_control *rc)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_super_block *disk_super;
	struct extent_buffer *eb;
	u32 sectorsize;
	u32 nodesize;
	u32 leafsize;
	u32 stripesize;
	int ret;

	fs_info = btrfs_new_fs_info(1, BTRFS_SUPER_INFO_OFFSET);
	if (!fs_info) {
		fprintf(stderr, "Failed to allocate memory for fs_info\n");
		return ERR_PTR(-ENOMEM);
	}
	fs_info->is_chunk_recover = 1;

	fs_info->fs_devices = rc->fs_devices;
	ret = btrfs_open_devices(fs_info->fs_devices, O_RDWR);
	if (ret)
		goto out;

	disk_super = fs_info->super_copy;
	ret = btrfs_read_dev_super(fs_info->fs_devices->latest_bdev,
				   disk_super, fs_info->super_bytenr);
	if (ret) {
		fprintf(stderr, "No valid btrfs found\n");
		goto out_devices;
	}

	memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);

	ret = btrfs_check_fs_compatibility(disk_super, 1);
	if (ret)
		goto out_devices;

	nodesize = btrfs_super_nodesize(disk_super);
	leafsize = btrfs_super_leafsize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
	stripesize = btrfs_super_stripesize(disk_super);

	__setup_root(nodesize, leafsize, sectorsize, stripesize,
		     fs_info->chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);

	/* Use the recovered chunk records in place of the chunk tree. */
	ret = build_device_maps_by_chunk_records(rc, fs_info->chunk_root);
	if (ret)
		goto out_cleanup;

	ret = btrfs_setup_all_roots(fs_info, 0, 0);
	if (ret)
		goto out_failed;

	eb = fs_info->tree_root->node;
	read_extent_buffer(eb, fs_info->chunk_tree_uuid,
			   btrfs_header_chunk_tree_uuid(eb),
			   BTRFS_UUID_SIZE);

	return fs_info->fs_root;
out_failed:
	btrfs_release_all_roots(fs_info);
out_cleanup:
	btrfs_cleanup_all_caches(fs_info);
out_devices:
	btrfs_close_devices(fs_info->fs_devices);
out:
	btrfs_free_fs_info(fs_info);
	return ERR_PTR(ret);
}
1328
1329 static int recover_prepare(struct recover_control *rc, char *path)
1330 {
1331         int ret;
1332         int fd;
1333         struct btrfs_super_block *sb;
1334         struct btrfs_fs_devices *fs_devices;
1335
1336         ret = 0;
1337         fd = open(path, O_RDONLY);
1338         if (fd < 0) {
1339                 fprintf(stderr, "open %s\n error.\n", path);
1340                 return -1;
1341         }
1342
1343         sb = malloc(sizeof(struct btrfs_super_block));
1344         if (!sb) {
1345                 fprintf(stderr, "allocating memory for sb failed.\n");
1346                 ret = -ENOMEM;
1347                 goto fail_close_fd;
1348         }
1349
1350         ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET);
1351         if (ret) {
1352                 fprintf(stderr, "read super block error\n");
1353                 goto fail_free_sb;
1354         }
1355
1356         rc->sectorsize = btrfs_super_sectorsize(sb);
1357         rc->leafsize = btrfs_super_leafsize(sb);
1358         rc->generation = btrfs_super_generation(sb);
1359         rc->chunk_root_generation = btrfs_super_chunk_root_generation(sb);
1360         rc->csum_size = btrfs_super_csum_size(sb);
1361
1362         /* if seed, the result of scanning below will be partial */
1363         if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
1364                 fprintf(stderr, "this device is seed device\n");
1365                 ret = -1;
1366                 goto fail_free_sb;
1367         }
1368
1369         ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0, 1);
1370         if (ret)
1371                 goto fail_free_sb;
1372
1373         rc->fs_devices = fs_devices;
1374
1375         if (rc->verbose)
1376                 print_all_devices(&rc->fs_devices->devices);
1377
1378 fail_free_sb:
1379         free(sb);
1380 fail_close_fd:
1381         close(fd);
1382         return ret;
1383 }
1384
1385 static int btrfs_get_device_extents(u64 chunk_object,
1386                                     struct list_head *orphan_devexts,
1387                                     struct list_head *ret_list)
1388 {
1389         struct device_extent_record *devext;
1390         struct device_extent_record *next;
1391         int count = 0;
1392
1393         list_for_each_entry_safe(devext, next, orphan_devexts, chunk_list) {
1394                 if (devext->chunk_offset == chunk_object) {
1395                         list_move_tail(&devext->chunk_list, ret_list);
1396                         count++;
1397                 }
1398         }
1399         return count;
1400 }
1401
1402 static int calc_num_stripes(u64 type)
1403 {
1404         if (type & (BTRFS_BLOCK_GROUP_RAID0 |
1405                     BTRFS_BLOCK_GROUP_RAID10 |
1406                     BTRFS_BLOCK_GROUP_RAID5 |
1407                     BTRFS_BLOCK_GROUP_RAID6))
1408                 return 0;
1409         else if (type & (BTRFS_BLOCK_GROUP_RAID1 |
1410                          BTRFS_BLOCK_GROUP_DUP))
1411                 return 2;
1412         else
1413                 return 1;
1414 }
1415
1416 static inline int calc_sub_nstripes(u64 type)
1417 {
1418         if (type & BTRFS_BLOCK_GROUP_RAID10)
1419                 return 2;
1420         else
1421                 return 1;
1422 }
1423
1424 static int btrfs_verify_device_extents(struct block_group_record *bg,
1425                                        struct list_head *devexts, int ndevexts)
1426 {
1427         struct device_extent_record *devext;
1428         u64 strpie_length;
1429         int expected_num_stripes;
1430
1431         expected_num_stripes = calc_num_stripes(bg->flags);
1432         if (expected_num_stripes && expected_num_stripes != ndevexts)
1433                 return 1;
1434
1435         strpie_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
1436         list_for_each_entry(devext, devexts, chunk_list) {
1437                 if (devext->length != strpie_length)
1438                         return 1;
1439         }
1440         return 0;
1441 }
1442
/*
 * Fill in the chunk's stripes from its device extents in list order,
 * for profiles where the stripe order does not matter.
 *
 * Assumes chunk->dextents holds at least num_stripes entries — the
 * callers are expected to guarantee this; TODO confirm.
 *
 * Returns 0 on success or -ENOENT if a referenced device is missing.
 */
static int btrfs_rebuild_unordered_chunk_stripes(struct recover_control *rc,
						 struct chunk_record *chunk)
{
	struct device_extent_record *devext;
	struct btrfs_device *device;
	int i;

	devext = list_first_entry(&chunk->dextents, struct device_extent_record,
				  chunk_list);
	for (i = 0; i < chunk->num_stripes; i++) {
		chunk->stripes[i].devid = devext->objectid;
		chunk->stripes[i].offset = devext->offset;
		device = btrfs_find_device_by_devid(rc->fs_devices,
						    devext->objectid,
						    0);
		if (!device)
			return -ENOENT;
		/* A second device with the same devid means corruption. */
		BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
						  devext->objectid,
						  1));
		memcpy(chunk->stripes[i].dev_uuid, device->uuid,
		       BTRFS_UUID_SIZE);
		devext = list_next_entry(devext, chunk_list);
	}
	return 0;
}
1469
1470 static int btrfs_calc_stripe_index(struct chunk_record *chunk, u64 logical)
1471 {
1472         u64 offset = logical - chunk->offset;
1473         int stripe_nr;
1474         int nr_data_stripes;
1475         int index;
1476
1477         stripe_nr = offset / chunk->stripe_len;
1478         if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID0) {
1479                 index = stripe_nr % chunk->num_stripes;
1480         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID10) {
1481                 index = stripe_nr % (chunk->num_stripes / chunk->sub_stripes);
1482                 index *= chunk->sub_stripes;
1483         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5) {
1484                 nr_data_stripes = chunk->num_stripes - 1;
1485                 index = stripe_nr % nr_data_stripes;
1486                 stripe_nr /= nr_data_stripes;
1487                 index = (index + stripe_nr) % chunk->num_stripes;
1488         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6) {
1489                 nr_data_stripes = chunk->num_stripes - 2;
1490                 index = stripe_nr % nr_data_stripes;
1491                 stripe_nr /= nr_data_stripes;
1492                 index = (index + stripe_nr) % chunk->num_stripes;
1493         } else {
1494                 BUG_ON(1);
1495         }
1496         return index;
1497 }
1498
1499 /* calc the logical offset which is the start of the next stripe. */
1500 static inline u64 btrfs_next_stripe_logical_offset(struct chunk_record *chunk,
1501                                                    u64 logical)
1502 {
1503         u64 offset = logical - chunk->offset;
1504
1505         offset /= chunk->stripe_len;
1506         offset *= chunk->stripe_len;
1507         offset += chunk->stripe_len;
1508
1509         return offset + chunk->offset;
1510 }
1511
1512 static int is_extent_record_in_device_extent(struct extent_record *er,
1513                                              struct device_extent_record *dext,
1514                                              int *mirror)
1515 {
1516         int i;
1517
1518         for (i = 0; i < er->nmirrors; i++) {
1519                 if (er->devices[i]->devid == dext->objectid &&
1520                     er->offsets[i] >= dext->offset &&
1521                     er->offsets[i] < dext->offset + dext->length) {
1522                         *mirror = i;
1523                         return 1;
1524                 }
1525         }
1526         return 0;
1527 }
1528
/*
 * Rebuild the stripe layout of an order-sensitive metadata chunk by
 * matching the extent buffers found during the device scan against the
 * chunk's device extents: each referenced extent pins the device extent
 * it lives on to a specific stripe index.  Stripes that no scanned
 * extent touches are filled with the leftover device extents in
 * arbitrary order (not possible for RAID5/6, where parity fixes the
 * order).
 *
 * Returns 0 on success, -EINVAL when the order cannot be recovered.
 */
static int
btrfs_rebuild_ordered_meta_chunk_stripes(struct recover_control *rc,
					 struct chunk_record *chunk)
{
	u64 start = chunk->offset;
	u64 end = chunk->offset + chunk->length;
	struct cache_extent *cache;
	struct extent_record *er;
	struct device_extent_record *devext;
	struct device_extent_record *next;
	struct btrfs_device *device;
	LIST_HEAD(devexts);
	int index;
	int mirror;
	int ret;

	cache = lookup_cache_extent(&rc->eb_cache,
				    start, chunk->length);
	if (!cache) {
		/* No used space, we can reorder the stripes freely. */
		ret = btrfs_rebuild_unordered_chunk_stripes(rc, chunk);
		return ret;
	}

	/* Take all device extents off the chunk; matched ones go back. */
	list_splice_init(&chunk->dextents, &devexts);
again:
	er = container_of(cache, struct extent_record, cache);
	index = btrfs_calc_stripe_index(chunk, er->cache.start);
	/* A non-zero devid means this stripe was already resolved. */
	if (chunk->stripes[index].devid)
		goto next;
	list_for_each_entry_safe(devext, next, &devexts, chunk_list) {
		if (is_extent_record_in_device_extent(er, devext, &mirror)) {
			/* Pin this device extent to the computed stripe. */
			chunk->stripes[index].devid = devext->objectid;
			chunk->stripes[index].offset = devext->offset;
			memcpy(chunk->stripes[index].dev_uuid,
			       er->devices[mirror]->uuid,
			       BTRFS_UUID_SIZE);
			index++;
			list_move(&devext->chunk_list, &chunk->dextents);
		}
	}
next:
	/* Continue with the first extent buffer of the next stripe. */
	start = btrfs_next_stripe_logical_offset(chunk, er->cache.start);
	if (start >= end)
		goto no_extent_record;

	cache = lookup_cache_extent(&rc->eb_cache, start, end - start);
	if (cache)
		goto again;
no_extent_record:
	if (list_empty(&devexts))
		return 0;

	if (chunk->type_flags & (BTRFS_BLOCK_GROUP_RAID5 |
				 BTRFS_BLOCK_GROUP_RAID6)) {
		/* Fixme: try to recover the order by the parity block. */
		list_splice_tail(&devexts, &chunk->dextents);
		return -EINVAL;
	}

	/* There is no data on the lost stripes, we can reorder them freely. */
	for (index = 0; index < chunk->num_stripes; index++) {
		if (chunk->stripes[index].devid)
			continue;

		devext = list_first_entry(&devexts,
					  struct device_extent_record,
					   chunk_list);
		list_move(&devext->chunk_list, &chunk->dextents);

		chunk->stripes[index].devid = devext->objectid;
		chunk->stripes[index].offset = devext->offset;
		device = btrfs_find_device_by_devid(rc->fs_devices,
						    devext->objectid,
						    0);
		if (!device) {
			list_splice_tail(&devexts, &chunk->dextents);
			return -EINVAL;
		}
		/* A second device with the same devid means corruption. */
		BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
						  devext->objectid,
						  1));
		memcpy(chunk->stripes[index].dev_uuid, device->uuid,
		       BTRFS_UUID_SIZE);
	}
	return 0;
}
1616
1617 #define BTRFS_ORDERED_RAID      (BTRFS_BLOCK_GROUP_RAID0 |      \
1618                                  BTRFS_BLOCK_GROUP_RAID10 |     \
1619                                  BTRFS_BLOCK_GROUP_RAID5 |      \
1620                                  BTRFS_BLOCK_GROUP_RAID6)
1621
1622 static int btrfs_rebuild_chunk_stripes(struct recover_control *rc,
1623                                        struct chunk_record *chunk)
1624 {
1625         int ret;
1626
1627         /*
1628          * All the data in the system metadata chunk will be dropped,
1629          * so we need not guarantee that the data is right or not, that
1630          * is we can reorder the stripes in the system metadata chunk.
1631          */
1632         if ((chunk->type_flags & BTRFS_BLOCK_GROUP_METADATA) &&
1633             (chunk->type_flags & BTRFS_ORDERED_RAID))
1634                 ret =btrfs_rebuild_ordered_meta_chunk_stripes(rc, chunk);
1635         else if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA) &&
1636                  (chunk->type_flags & BTRFS_ORDERED_RAID))
1637                 ret = 1;        /* Be handled after the fs is opened. */
1638         else
1639                 ret = btrfs_rebuild_unordered_chunk_stripes(rc, chunk);
1640
1641         return ret;
1642 }
1643
/*
 * Fetch the checksum of the next data block and advance the cursor.
 *
 * The cursor is the triple (*leaf, *slot, *csum_offset): the csum item
 * currently pointed at and the index of the checksum inside that item.
 * When the current item is exhausted the walk steps to the next item
 * (possibly in the next leaf) and *key is refreshed from it.
 *
 * Return values:
 *   0  - a checksum was copied into *tree_csum
 *   1  - the csum tree has no more items
 *   2  - the covered logical address reached @end
 *  -1  - walking the tree failed
 */
static int next_csum(struct btrfs_root *root,
                     struct extent_buffer **leaf,
                     struct btrfs_path *path,
                     int *slot,
                     u64 *csum_offset,
                     u32 *tree_csum,
                     u64 end,
                     struct btrfs_key *key)
{
        int ret = 0;
        struct btrfs_root *csum_root = root->fs_info->csum_root;
        struct btrfs_csum_item *csum_item;
        u32 blocksize = root->sectorsize;
        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
        /* Each csum item packs one csum_size checksum per data block. */
        int csums_in_item = btrfs_item_size_nr(*leaf, *slot) / csum_size;

        if (*csum_offset >= csums_in_item) {
                /* Current item exhausted, move on to the next csum item. */
                ++(*slot);
                *csum_offset = 0;
                if (*slot >= btrfs_header_nritems(*leaf)) {
                        ret = btrfs_next_leaf(csum_root, path);
                        if (ret < 0)
                                return -1;
                        else if (ret > 0)
                                return 1;
                        *leaf = path->nodes[0];
                        *slot = path->slots[0];
                }
                btrfs_item_key_to_cpu(*leaf, key, *slot);
        }

        /* Stop once the block covered by this csum passes @end. */
        if (key->offset + (*csum_offset) * blocksize >= end)
                return 2;
        csum_item = btrfs_item_ptr(*leaf, *slot, struct btrfs_csum_item);
        csum_item = (struct btrfs_csum_item *)((unsigned char *)csum_item
                                             + (*csum_offset) * csum_size);
        read_extent_buffer(*leaf, tree_csum,
                          (unsigned long)csum_item, csum_size);
        return ret;
}
1684
1685 static u64 calc_data_offset(struct btrfs_key *key,
1686                             struct chunk_record *chunk,
1687                             u64 dev_offset,
1688                             u64 csum_offset,
1689                             u32 blocksize)
1690 {
1691         u64 data_offset;
1692         int logical_stripe_nr;
1693         int dev_stripe_nr;
1694         int nr_data_stripes;
1695
1696         data_offset = key->offset + csum_offset * blocksize - chunk->offset;
1697         nr_data_stripes = chunk->num_stripes;
1698
1699         if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5)
1700                 nr_data_stripes -= 1;
1701         else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6)
1702                 nr_data_stripes -= 2;
1703
1704         logical_stripe_nr = data_offset / chunk->stripe_len;
1705         dev_stripe_nr = logical_stripe_nr / nr_data_stripes;
1706
1707         data_offset -= logical_stripe_nr * chunk->stripe_len;
1708         data_offset += dev_stripe_nr * chunk->stripe_len;
1709
1710         return dev_offset + data_offset;
1711 }
1712
1713 static int check_one_csum(int fd, u64 start, u32 len, u32 tree_csum)
1714 {
1715         char *data;
1716         int ret = 0;
1717         u32 csum_result = ~(u32)0;
1718
1719         data = malloc(len);
1720         if (!data)
1721                 return -1;
1722         ret = pread64(fd, data, len, start);
1723         if (ret < 0 || ret != len) {
1724                 ret = -1;
1725                 goto out;
1726         }
1727         ret = 0;
1728         csum_result = btrfs_csum_data(NULL, data, csum_result, len);
1729         btrfs_csum_final(csum_result, (char *)&csum_result);
1730         if (csum_result != tree_csum)
1731                 ret = 1;
1732 out:
1733         free(data);
1734         return ret;
1735 }
1736
1737 static u64 item_end_offset(struct btrfs_root *root, struct btrfs_key *key,
1738                            struct extent_buffer *leaf, int slot) {
1739         u32 blocksize = root->sectorsize;
1740         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1741
1742         u64 offset = btrfs_item_size_nr(leaf, slot);
1743         offset /= csum_size;
1744         offset *= blocksize;
1745         offset += key->offset;
1746
1747         return offset;
1748 }
1749
1750 static int insert_stripe(struct list_head *devexts,
1751                          struct recover_control *rc,
1752                          struct chunk_record *chunk,
1753                          int index) {
1754         struct device_extent_record *devext;
1755         struct btrfs_device *dev;
1756
1757         devext = list_entry(devexts->next, struct device_extent_record,
1758                             chunk_list);
1759         dev = btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
1760                                         0);
1761         if (!dev)
1762                 return 1;
1763         BUG_ON(btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
1764                                         1));
1765
1766         chunk->stripes[index].devid = devext->objectid;
1767         chunk->stripes[index].offset = devext->offset;
1768         memcpy(chunk->stripes[index].dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
1769
1770         list_move(&devext->chunk_list, &chunk->dextents);
1771
1772         return 0;
1773 }
1774
1775 #define EQUAL_STRIPE (1 << 0)
1776
/*
 * Recover the stripe order of an ordered-RAID data chunk by checking
 * data on each candidate device extent against the csum tree.
 *
 * For each stripe-sized logical range the data of every remaining
 * candidate extent is read at the offset the chunk geometry implies and
 * verified against the stored checksum; mismatching extents move to the
 * 'unordered' list.  When exactly one candidate survives for a range it
 * must own that stripe and is inserted at the computed stripe index.
 *
 * *@flags gets EQUAL_STRIPE set when the csum tree runs out while
 * several candidates still match (identical stripe contents).
 *
 * Returns 0 when the order was fully recovered, non-zero otherwise.
 */
static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
                                           struct btrfs_root *root,
                                           struct chunk_record *chunk,
                                           u8 *flags)
{
        int i;
        int ret = 0;
        int slot;
        struct btrfs_path path;
        struct btrfs_key prev_key;
        struct btrfs_key key;
        struct btrfs_root *csum_root;
        struct extent_buffer *leaf;
        struct device_extent_record *devext;
        struct device_extent_record *next;
        struct btrfs_device *dev;
        u64 start = chunk->offset;
        u64 end = start + chunk->stripe_len;
        u64 chunk_end = chunk->offset + chunk->length;
        u64 csum_offset = 0;
        u64 data_offset;
        u32 blocksize = root->sectorsize;
        u32 tree_csum;
        int index = 0;
        int num_unordered = 0;
        LIST_HEAD(unordered);
        LIST_HEAD(candidates);

        csum_root = root->fs_info->csum_root;
        btrfs_init_path(&path);
        /* Start with every device extent of the chunk as a candidate. */
        list_splice_init(&chunk->dextents, &candidates);
again:
        /* A single remaining candidate needs no csum checking. */
        if (list_is_last(candidates.next, &candidates))
                goto out;

        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        key.type = BTRFS_EXTENT_CSUM_KEY;
        key.offset = start;

        ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
        if (ret < 0) {
                fprintf(stderr, "Search csum failed(%d)\n", ret);
                goto fail_out;
        }
        leaf = path.nodes[0];
        slot = path.slots[0];
        if (ret > 0) {
                /* No exact match: locate the csum item covering @start. */
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(csum_root, &path);
                        if (ret < 0) {
                                fprintf(stderr,
                                        "Walk tree failed(%d)\n", ret);
                                goto fail_out;
                        } else if (ret > 0) {
                                /*
                                 * NOTE(review): 'leaf'/'slot' here still refer
                                 * to the leaf from before btrfs_next_leaf();
                                 * confirm probing its last item is intended.
                                 */
                                slot = btrfs_header_nritems(leaf) - 1;
                                btrfs_item_key_to_cpu(leaf, &key, slot);
                                if (item_end_offset(root, &key, leaf, slot)
                                                                > start) {
                                        csum_offset = start - key.offset;
                                        csum_offset /= blocksize;
                                        goto next_csum;
                                }
                                goto next_stripe;
                        }
                        leaf = path.nodes[0];
                        slot = path.slots[0];
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);
                /* The previous item may still cover @start. */
                ret = btrfs_previous_item(csum_root, &path, 0,
                                          BTRFS_EXTENT_CSUM_KEY);
                if (ret < 0)
                        goto fail_out;
                else if (ret > 0) {
                        if (key.offset >= end)
                                goto next_stripe;
                        else
                                goto next_csum;
                }
                leaf = path.nodes[0];
                slot = path.slots[0];

                btrfs_item_key_to_cpu(leaf, &prev_key, slot);
                if (item_end_offset(root, &prev_key, leaf, slot) > start) {
                        csum_offset = start - prev_key.offset;
                        csum_offset /= blocksize;
                        btrfs_item_key_to_cpu(leaf, &key, slot);
                } else {
                        if (key.offset >= end)
                                goto next_stripe;
                }

                if (key.offset + csum_offset * blocksize > chunk_end)
                        goto out;
        }
next_csum:
        ret = next_csum(root, &leaf, &path, &slot, &csum_offset, &tree_csum,
                        end, &key);
        if (ret < 0) {
                fprintf(stderr, "Fetch csum failed\n");
                goto fail_out;
        } else if (ret == 1) {
                /* Csum tree exhausted with several candidates left. */
                list_for_each_entry(devext, &unordered, chunk_list)
                        num_unordered++;
                if (!(*flags & EQUAL_STRIPE))
                        *flags |= EQUAL_STRIPE;
                goto out;
        } else if (ret == 2)
                goto next_stripe;

        /* Check every candidate's data against this checksum. */
        list_for_each_entry_safe(devext, next, &candidates, chunk_list) {
                data_offset = calc_data_offset(&key, chunk, devext->offset,
                                               csum_offset, blocksize);
                dev = btrfs_find_device_by_devid(rc->fs_devices,
                                                 devext->objectid, 0);
                if (!dev) {
                        ret = 1;
                        goto fail_out;
                }
                BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
                                                  devext->objectid, 1));

                ret = check_one_csum(dev->fd, data_offset, blocksize,
                                     tree_csum);
                if (ret < 0)
                        goto fail_out;
                else if (ret > 0)
                        /* Mismatch: this extent is not the stripe here. */
                        list_move(&devext->chunk_list, &unordered);
        }

        if (list_empty(&candidates)) {
                list_for_each_entry(devext, &unordered, chunk_list)
                        num_unordered++;
                /* Two leftovers on RAID6 can be the P/Q parity stripes. */
                if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6
                                        && num_unordered == 2) {
                        list_splice_init(&unordered, &chunk->dextents);
                        btrfs_release_path(&path);
                        return 0;
                } else
                        ret = 1;

                goto fail_out;
        }

        if (list_is_last(candidates.next, &candidates)) {
                /* Exactly one match: it owns this stripe slot. */
                index = btrfs_calc_stripe_index(chunk,
                        key.offset + csum_offset * blocksize);
                if (chunk->stripes[index].devid)
                        goto next_stripe;
                ret = insert_stripe(&candidates, rc, chunk, index);
                if (ret)
                        goto fail_out;
        } else {
                /* Still ambiguous, try the next checksum. */
                csum_offset++;
                goto next_csum;
        }
next_stripe:
        start = btrfs_next_stripe_logical_offset(chunk, start);
        end = min(start + chunk->stripe_len, chunk_end);
        /* Rejected extents become candidates for the next stripe. */
        list_splice_init(&unordered, &candidates);
        btrfs_release_path(&path);
        csum_offset = 0;
        if (end < chunk_end)
                goto again;
out:
        ret = 0;
        list_splice_init(&candidates, &unordered);
        /*
         * NOTE(review): num_unordered may already be non-zero from the
         * ret == 1 branch above, so entries can be counted twice here —
         * verify against the intended semantics.
         */
        list_for_each_entry(devext, &unordered, chunk_list)
                num_unordered++;
        if (num_unordered == 1) {
                /* One leftover extent fills the single empty slot. */
                for (i = 0; i < chunk->num_stripes; i++) {
                        if (!chunk->stripes[i].devid) {
                                index = i;
                                break;
                        }
                }
                ret = insert_stripe(&unordered, rc, chunk, index);
                if (ret)
                        goto fail_out;
        } else {
                /* RAID5/6 leftovers matching the parity count fill the
                 * remaining empty slots in order. */
                if ((num_unordered == 2 && chunk->type_flags
                        & BTRFS_BLOCK_GROUP_RAID5)
                 || (num_unordered == 3 && chunk->type_flags
                        & BTRFS_BLOCK_GROUP_RAID6)) {
                        for (i = 0; i < chunk->num_stripes; i++) {
                                if (!chunk->stripes[i].devid) {
                                        ret = insert_stripe(&unordered, rc,
                                                        chunk, i);
                                        if (ret)
                                                break;
                                }
                        }
                }
        }
fail_out:
        /* Any extent left unplaced means the rebuild failed. */
        ret = !!ret || (list_empty(&unordered) ? 0 : 1);
        list_splice_init(&candidates, &chunk->dextents);
        list_splice_init(&unordered, &chunk->dextents);
        btrfs_release_path(&path);

        return ret;
}
1978
1979 static int btrfs_rebuild_ordered_data_chunk_stripes(struct recover_control *rc,
1980                                            struct btrfs_root *root)
1981 {
1982         struct chunk_record *chunk;
1983         struct chunk_record *next;
1984         int ret = 0;
1985         int err;
1986         u8 flags;
1987
1988         list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
1989                 if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA)
1990                  && (chunk->type_flags & BTRFS_ORDERED_RAID)) {
1991                         flags = 0;
1992                         err = rebuild_raid_data_chunk_stripes(rc, root, chunk,
1993                                                               &flags);
1994                         if (err) {
1995                                 list_move(&chunk->list, &rc->bad_chunks);
1996                                 if (flags & EQUAL_STRIPE)
1997                                         fprintf(stderr,
1998                         "Failure: too many equal stripes in chunk[%llu %llu]\n",
1999                                                 chunk->offset, chunk->length);
2000                                 if (!ret)
2001                                         ret = err;
2002                         } else
2003                                 list_move(&chunk->list, &rc->good_chunks);
2004                 }
2005         }
2006         return ret;
2007 }
2008
2009 static int btrfs_recover_chunks(struct recover_control *rc)
2010 {
2011         struct chunk_record *chunk;
2012         struct block_group_record *bg;
2013         struct block_group_record *next;
2014         LIST_HEAD(new_chunks);
2015         LIST_HEAD(devexts);
2016         int nstripes;
2017         int ret;
2018
2019         /* create the chunk by block group */
2020         list_for_each_entry_safe(bg, next, &rc->bg.block_groups, list) {
2021                 nstripes = btrfs_get_device_extents(bg->objectid,
2022                                                     &rc->devext.no_chunk_orphans,
2023                                                     &devexts);
2024                 chunk = malloc(btrfs_chunk_record_size(nstripes));
2025                 if (!chunk)
2026                         return -ENOMEM;
2027                 memset(chunk, 0, btrfs_chunk_record_size(nstripes));
2028                 INIT_LIST_HEAD(&chunk->dextents);
2029                 chunk->bg_rec = bg;
2030                 chunk->cache.start = bg->objectid;
2031                 chunk->cache.size = bg->offset;
2032                 chunk->objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2033                 chunk->type = BTRFS_CHUNK_ITEM_KEY;
2034                 chunk->offset = bg->objectid;
2035                 chunk->generation = bg->generation;
2036                 chunk->length = bg->offset;
2037                 chunk->owner = BTRFS_CHUNK_TREE_OBJECTID;
2038                 chunk->stripe_len = BTRFS_STRIPE_LEN;
2039                 chunk->type_flags = bg->flags;
2040                 chunk->io_width = BTRFS_STRIPE_LEN;
2041                 chunk->io_align = BTRFS_STRIPE_LEN;
2042                 chunk->sector_size = rc->sectorsize;
2043                 chunk->sub_stripes = calc_sub_nstripes(bg->flags);
2044
2045                 ret = insert_cache_extent(&rc->chunk, &chunk->cache);
2046                 BUG_ON(ret);
2047
2048                 if (!nstripes) {
2049                         list_add_tail(&chunk->list, &rc->bad_chunks);
2050                         continue;
2051                 }
2052
2053                 list_splice_init(&devexts, &chunk->dextents);
2054
2055                 ret = btrfs_verify_device_extents(bg, &devexts, nstripes);
2056                 if (ret) {
2057                         list_add_tail(&chunk->list, &rc->bad_chunks);
2058                         continue;
2059                 }
2060
2061                 chunk->num_stripes = nstripes;
2062                 ret = btrfs_rebuild_chunk_stripes(rc, chunk);
2063                 if (ret > 0)
2064                         list_add_tail(&chunk->list, &rc->unrepaired_chunks);
2065                 else if (ret < 0)
2066                         list_add_tail(&chunk->list, &rc->bad_chunks);
2067                 else
2068                         list_add_tail(&chunk->list, &rc->good_chunks);
2069         }
2070         /*
2071          * Don't worry about the lost orphan device extents, they don't
2072          * have its chunk and block group, they must be the old ones that
2073          * we have dropped.
2074          */
2075         return 0;
2076 }
2077
/*
 * Return 0 when successful, < 0 on error and > 0 if aborted by user
 */
int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
{
        int ret = 0;
        struct btrfs_root *root = NULL;
        struct btrfs_trans_handle *trans;
        struct recover_control rc;

        init_recover_control(&rc, verbose, yes);

        /* Phase 1: open the target device(s) read-only for scanning. */
        ret = recover_prepare(&rc, path);
        if (ret) {
                fprintf(stderr, "recover prepare error\n");
                return ret;
        }

        /* Phase 2: scan all devices for chunk/block-group/dev-extent items. */
        ret = scan_devices(&rc);
        if (ret) {
                fprintf(stderr, "scan chunk headers error\n");
                goto fail_rc;
        }

        if (cache_tree_empty(&rc.chunk) &&
            cache_tree_empty(&rc.bg.tree) &&
            cache_tree_empty(&rc.devext.tree)) {
                /*
                 * NOTE(review): ret is still 0 here, so the "no recoverable
                 * chunk" path returns success — confirm this is intended.
                 */
                fprintf(stderr, "no recoverable chunk\n");
                goto fail_rc;
        }

        print_scan_result(&rc);

        /* Phase 3: cross-check chunks against block groups/device extents. */
        ret = check_chunks(&rc.chunk, &rc.bg, &rc.devext, &rc.good_chunks,
                           &rc.bad_chunks, 1);
        print_check_result(&rc);
        if (ret) {
                /* Rebuild chunks for orphan block groups/device extents. */
                if (!list_empty(&rc.bg.block_groups) ||
                    !list_empty(&rc.devext.no_chunk_orphans)) {
                        ret = btrfs_recover_chunks(&rc);
                        if (ret)
                                goto fail_rc;
                }
                /*
                 * If the chunk is healthy, its block group item and device
                 * extent item should be written on the disks. So, it is very
                 * likely that the bad chunk is a old one that has been
                 * droppped from the fs. Don't deal with them now, we will
                 * check it after the fs is opened.
                 */
        } else {
                fprintf(stderr, "Check chunks successfully with no orphans\n");
                goto fail_rc;
        }

        /* Phase 4: mount the fs using the recovered in-memory chunk map. */
        root = open_ctree_with_broken_chunk(&rc);
        if (IS_ERR(root)) {
                fprintf(stderr, "open with broken chunk error\n");
                ret = PTR_ERR(root);
                goto fail_rc;
        }

        ret = check_all_chunks_by_metadata(&rc, root);
        if (ret) {
                fprintf(stderr, "The chunks in memory can not match the metadata of the fs. Repair failed.\n");
                goto fail_close_ctree;
        }

        /* Ordered data RAID chunks could only be fixed with the fs open. */
        ret = btrfs_rebuild_ordered_data_chunk_stripes(&rc, root);
        if (ret) {
                fprintf(stderr, "Failed to rebuild ordered chunk stripes.\n");
                goto fail_close_ctree;
        }

        if (!rc.yes) {
                /* Last chance to abort before the disk is modified. */
                ret = ask_user("We are going to rebuild the chunk tree on disk, it might destroy the old metadata on the disk, Are you sure?");
                if (!ret) {
                        ret = 1;
                        goto fail_close_ctree;
                }
        }

        /* Phase 5: commit the rebuilt chunk tree to disk. */
        trans = btrfs_start_transaction(root, 1);
        ret = remove_chunk_extent_item(trans, &rc, root);
        BUG_ON(ret);

        ret = rebuild_chunk_tree(trans, &rc, root);
        BUG_ON(ret);

        ret = rebuild_sys_array(&rc, root);
        BUG_ON(ret);

        btrfs_commit_transaction(trans, root);
fail_close_ctree:
        close_ctree(root);
fail_rc:
        free_recover_control(&rc);
        return ret;
}