btrfs-progs: Unify the messy error message formats
[platform/upstream/btrfs-progs.git] / chunk-recover.c
1 /*
2  * Copyright (C) 2013 FUJITSU LIMITED.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #define _XOPEN_SOURCE 500
19 #define _GNU_SOURCE
20
21 #include <stdio.h>
22 #include <stdio_ext.h>
23 #include <stdlib.h>
24 #include <sys/types.h>
25 #include <sys/stat.h>
26 #include <fcntl.h>
27 #include <unistd.h>
28 #include <uuid/uuid.h>
29 #include <pthread.h>
30
31 #include "kerncompat.h"
32 #include "list.h"
33 #include "radix-tree.h"
34 #include "ctree.h"
35 #include "extent-cache.h"
36 #include "disk-io.h"
37 #include "volumes.h"
38 #include "transaction.h"
39 #include "crc32c.h"
40 #include "utils.h"
41 #include "version.h"
42 #include "btrfsck.h"
43 #include "commands.h"
44
45 struct recover_control {
46         int verbose;
47         int yes;
48
49         u16 csum_size;
50         u32 sectorsize;
51         u32 leafsize;
52         u64 generation;
53         u64 chunk_root_generation;
54
55         struct btrfs_fs_devices *fs_devices;
56
57         struct cache_tree chunk;
58         struct block_group_tree bg;
59         struct device_extent_tree devext;
60         struct cache_tree eb_cache;
61
62         struct list_head good_chunks;
63         struct list_head bad_chunks;
64         struct list_head unrepaired_chunks;
65         pthread_mutex_t rc_lock;
66 };
67
68 struct extent_record {
69         struct cache_extent cache;
70         u64 generation;
71         u8 csum[BTRFS_CSUM_SIZE];
72         struct btrfs_device *devices[BTRFS_MAX_MIRRORS];
73         u64 offsets[BTRFS_MAX_MIRRORS];
74         int nmirrors;
75 };
76
77 struct device_scan {
78         struct recover_control *rc;
79         struct btrfs_device *dev;
80         int fd;
81 };
82
83 static struct extent_record *btrfs_new_extent_record(struct extent_buffer *eb)
84 {
85         struct extent_record *rec;
86
87         rec = malloc(sizeof(*rec));
88         if (!rec) {
89                 fprintf(stderr, "Fail to allocate memory for extent record.\n");
90                 exit(1);
91         }
92
93         memset(rec, 0, sizeof(*rec));
94         rec->cache.start = btrfs_header_bytenr(eb);
95         rec->cache.size = eb->len;
96         rec->generation = btrfs_header_generation(eb);
97         read_extent_buffer(eb, rec->csum, (unsigned long)btrfs_header_csum(eb),
98                            BTRFS_CSUM_SIZE);
99         return rec;
100 }
101
102 static int process_extent_buffer(struct cache_tree *eb_cache,
103                                  struct extent_buffer *eb,
104                                  struct btrfs_device *device, u64 offset)
105 {
106         struct extent_record *rec;
107         struct extent_record *exist;
108         struct cache_extent *cache;
109         int ret = 0;
110
111         rec = btrfs_new_extent_record(eb);
112         if (!rec->cache.size)
113                 goto free_out;
114 again:
115         cache = lookup_cache_extent(eb_cache,
116                                     rec->cache.start,
117                                     rec->cache.size);
118         if (cache) {
119                 exist = container_of(cache, struct extent_record, cache);
120
121                 if (exist->generation > rec->generation)
122                         goto free_out;
123                 if (exist->generation == rec->generation) {
124                         if (exist->cache.start != rec->cache.start ||
125                             exist->cache.size != rec->cache.size ||
126                             memcmp(exist->csum, rec->csum, BTRFS_CSUM_SIZE)) {
127                                 ret = -EEXIST;
128                         } else {
129                                 BUG_ON(exist->nmirrors >= BTRFS_MAX_MIRRORS);
130                                 exist->devices[exist->nmirrors] = device;
131                                 exist->offsets[exist->nmirrors] = offset;
132                                 exist->nmirrors++;
133                         }
134                         goto free_out;
135                 }
136                 remove_cache_extent(eb_cache, cache);
137                 free(exist);
138                 goto again;
139         }
140
141         rec->devices[0] = device;
142         rec->offsets[0] = offset;
143         rec->nmirrors++;
144         ret = insert_cache_extent(eb_cache, &rec->cache);
145         BUG_ON(ret);
146 out:
147         return ret;
148 free_out:
149         free(rec);
150         goto out;
151 }
152
153 static void free_extent_record(struct cache_extent *cache)
154 {
155         struct extent_record *er;
156
157         er = container_of(cache, struct extent_record, cache);
158         free(er);
159 }
160
161 FREE_EXTENT_CACHE_BASED_TREE(extent_record, free_extent_record);
162
163 static struct btrfs_chunk *create_chunk_item(struct chunk_record *record)
164 {
165         struct btrfs_chunk *ret;
166         struct btrfs_stripe *chunk_stripe;
167         int i;
168
169         if (!record || record->num_stripes == 0)
170                 return NULL;
171         ret = malloc(btrfs_chunk_item_size(record->num_stripes));
172         if (!ret)
173                 return NULL;
174         btrfs_set_stack_chunk_length(ret, record->length);
175         btrfs_set_stack_chunk_owner(ret, record->owner);
176         btrfs_set_stack_chunk_stripe_len(ret, record->stripe_len);
177         btrfs_set_stack_chunk_type(ret, record->type_flags);
178         btrfs_set_stack_chunk_io_align(ret, record->io_align);
179         btrfs_set_stack_chunk_io_width(ret, record->io_width);
180         btrfs_set_stack_chunk_sector_size(ret, record->sector_size);
181         btrfs_set_stack_chunk_num_stripes(ret, record->num_stripes);
182         btrfs_set_stack_chunk_sub_stripes(ret, record->sub_stripes);
183         for (i = 0, chunk_stripe = &ret->stripe; i < record->num_stripes;
184              i++, chunk_stripe++) {
185                 btrfs_set_stack_stripe_devid(chunk_stripe,
186                                 record->stripes[i].devid);
187                 btrfs_set_stack_stripe_offset(chunk_stripe,
188                                 record->stripes[i].offset);
189                 memcpy(chunk_stripe->dev_uuid, record->stripes[i].dev_uuid,
190                        BTRFS_UUID_SIZE);
191         }
192         return ret;
193 }
194
195 static void init_recover_control(struct recover_control *rc, int verbose,
196                 int yes)
197 {
198         memset(rc, 0, sizeof(struct recover_control));
199         cache_tree_init(&rc->chunk);
200         cache_tree_init(&rc->eb_cache);
201         block_group_tree_init(&rc->bg);
202         device_extent_tree_init(&rc->devext);
203
204         INIT_LIST_HEAD(&rc->good_chunks);
205         INIT_LIST_HEAD(&rc->bad_chunks);
206         INIT_LIST_HEAD(&rc->unrepaired_chunks);
207
208         rc->verbose = verbose;
209         rc->yes = yes;
210         pthread_mutex_init(&rc->rc_lock, NULL);
211 }
212
213 static void free_recover_control(struct recover_control *rc)
214 {
215         free_block_group_tree(&rc->bg);
216         free_chunk_cache_tree(&rc->chunk);
217         free_device_extent_tree(&rc->devext);
218         free_extent_record_tree(&rc->eb_cache);
219         pthread_mutex_destroy(&rc->rc_lock);
220 }
221
222 static int process_block_group_item(struct block_group_tree *bg_cache,
223                                     struct extent_buffer *leaf,
224                                     struct btrfs_key *key, int slot)
225 {
226         struct block_group_record *rec;
227         struct block_group_record *exist;
228         struct cache_extent *cache;
229         int ret = 0;
230
231         rec = btrfs_new_block_group_record(leaf, key, slot);
232         if (!rec->cache.size)
233                 goto free_out;
234 again:
235         cache = lookup_cache_extent(&bg_cache->tree,
236                                     rec->cache.start,
237                                     rec->cache.size);
238         if (cache) {
239                 exist = container_of(cache, struct block_group_record, cache);
240
241                 /*check the generation and replace if needed*/
242                 if (exist->generation > rec->generation)
243                         goto free_out;
244                 if (exist->generation == rec->generation) {
245                         int offset = offsetof(struct block_group_record,
246                                               generation);
247                         /*
248                          * According to the current kernel code, the following
249                          * case is impossble, or there is something wrong in
250                          * the kernel code.
251                          */
252                         if (memcmp(((void *)exist) + offset,
253                                    ((void *)rec) + offset,
254                                    sizeof(*rec) - offset))
255                                 ret = -EEXIST;
256                         goto free_out;
257                 }
258                 remove_cache_extent(&bg_cache->tree, cache);
259                 list_del_init(&exist->list);
260                 free(exist);
261                 /*
262                  * We must do seach again to avoid the following cache.
263                  * /--old bg 1--//--old bg 2--/
264                  *        /--new bg--/
265                  */
266                 goto again;
267         }
268
269         ret = insert_block_group_record(bg_cache, rec);
270         BUG_ON(ret);
271 out:
272         return ret;
273 free_out:
274         free(rec);
275         goto out;
276 }
277
278 static int process_chunk_item(struct cache_tree *chunk_cache,
279                               struct extent_buffer *leaf, struct btrfs_key *key,
280                               int slot)
281 {
282         struct chunk_record *rec;
283         struct chunk_record *exist;
284         struct cache_extent *cache;
285         int ret = 0;
286
287         rec = btrfs_new_chunk_record(leaf, key, slot);
288         if (!rec->cache.size)
289                 goto free_out;
290 again:
291         cache = lookup_cache_extent(chunk_cache, rec->offset, rec->length);
292         if (cache) {
293                 exist = container_of(cache, struct chunk_record, cache);
294
295                 if (exist->generation > rec->generation)
296                         goto free_out;
297                 if (exist->generation == rec->generation) {
298                         int num_stripes = rec->num_stripes;
299                         int rec_size = btrfs_chunk_record_size(num_stripes);
300                         int offset = offsetof(struct chunk_record, generation);
301
302                         if (exist->num_stripes != rec->num_stripes ||
303                             memcmp(((void *)exist) + offset,
304                                    ((void *)rec) + offset,
305                                    rec_size - offset))
306                                 ret = -EEXIST;
307                         goto free_out;
308                 }
309                 remove_cache_extent(chunk_cache, cache);
310                 free(exist);
311                 goto again;
312         }
313         ret = insert_cache_extent(chunk_cache, &rec->cache);
314         BUG_ON(ret);
315 out:
316         return ret;
317 free_out:
318         free(rec);
319         goto out;
320 }
321
322 static int process_device_extent_item(struct device_extent_tree *devext_cache,
323                                       struct extent_buffer *leaf,
324                                       struct btrfs_key *key, int slot)
325 {
326         struct device_extent_record *rec;
327         struct device_extent_record *exist;
328         struct cache_extent *cache;
329         int ret = 0;
330
331         rec = btrfs_new_device_extent_record(leaf, key, slot);
332         if (!rec->cache.size)
333                 goto free_out;
334 again:
335         cache = lookup_cache_extent2(&devext_cache->tree,
336                                      rec->cache.objectid,
337                                      rec->cache.start,
338                                      rec->cache.size);
339         if (cache) {
340                 exist = container_of(cache, struct device_extent_record, cache);
341                 if (exist->generation > rec->generation)
342                         goto free_out;
343                 if (exist->generation == rec->generation) {
344                         int offset = offsetof(struct device_extent_record,
345                                               generation);
346                         if (memcmp(((void *)exist) + offset,
347                                    ((void *)rec) + offset,
348                                    sizeof(*rec) - offset))
349                                 ret = -EEXIST;
350                         goto free_out;
351                 }
352                 remove_cache_extent(&devext_cache->tree, cache);
353                 list_del_init(&exist->chunk_list);
354                 list_del_init(&exist->device_list);
355                 free(exist);
356                 goto again;
357         }
358
359         ret = insert_device_extent_record(devext_cache, rec);
360         BUG_ON(ret);
361 out:
362         return ret;
363 free_out:
364         free(rec);
365         goto out;
366 }
367
368 static void print_block_group_info(struct block_group_record *rec, char *prefix)
369 {
370         if (prefix)
371                 printf("%s", prefix);
372         printf("Block Group: start = %llu, len = %llu, flag = %llx\n",
373                rec->objectid, rec->offset, rec->flags);
374 }
375
376 static void print_block_group_tree(struct block_group_tree *tree)
377 {
378         struct cache_extent *cache;
379         struct block_group_record *rec;
380
381         printf("All Block Groups:\n");
382         for (cache = first_cache_extent(&tree->tree); cache;
383              cache = next_cache_extent(cache)) {
384                 rec = container_of(cache, struct block_group_record, cache);
385                 print_block_group_info(rec, "\t");
386         }
387         printf("\n");
388 }
389
390 static void print_stripe_info(struct stripe *data, char *prefix1, char *prefix2,
391                               int index)
392 {
393         if (prefix1)
394                 printf("%s", prefix1);
395         if (prefix2)
396                 printf("%s", prefix2);
397         printf("[%2d] Stripe: devid = %llu, offset = %llu\n",
398                index, data->devid, data->offset);
399 }
400
401 static void print_chunk_self_info(struct chunk_record *rec, char *prefix)
402 {
403         int i;
404
405         if (prefix)
406                 printf("%s", prefix);
407         printf("Chunk: start = %llu, len = %llu, type = %llx, num_stripes = %u\n",
408                rec->offset, rec->length, rec->type_flags, rec->num_stripes);
409         if (prefix)
410                 printf("%s", prefix);
411         printf("    Stripes list:\n");
412         for (i = 0; i < rec->num_stripes; i++)
413                 print_stripe_info(&rec->stripes[i], prefix, "    ", i);
414 }
415
416 static void print_chunk_tree(struct cache_tree *tree)
417 {
418         struct cache_extent *n;
419         struct chunk_record *entry;
420
421         printf("All Chunks:\n");
422         for (n = first_cache_extent(tree); n;
423              n = next_cache_extent(n)) {
424                 entry = container_of(n, struct chunk_record, cache);
425                 print_chunk_self_info(entry, "\t");
426         }
427         printf("\n");
428 }
429
430 static void print_device_extent_info(struct device_extent_record *rec,
431                                      char *prefix)
432 {
433         if (prefix)
434                 printf("%s", prefix);
435         printf("Device extent: devid = %llu, start = %llu, len = %llu, chunk offset = %llu\n",
436                rec->objectid, rec->offset, rec->length, rec->chunk_offset);
437 }
438
439 static void print_device_extent_tree(struct device_extent_tree *tree)
440 {
441         struct cache_extent *n;
442         struct device_extent_record *entry;
443
444         printf("All Device Extents:\n");
445         for (n = first_cache_extent(&tree->tree); n;
446              n = next_cache_extent(n)) {
447                 entry = container_of(n, struct device_extent_record, cache);
448                 print_device_extent_info(entry, "\t");
449         }
450         printf("\n");
451 }
452
453 static void print_device_info(struct btrfs_device *device, char *prefix)
454 {
455         if (prefix)
456                 printf("%s", prefix);
457         printf("Device: id = %llu, name = %s\n",
458                device->devid, device->name);
459 }
460
461 static void print_all_devices(struct list_head *devices)
462 {
463         struct btrfs_device *dev;
464
465         printf("All Devices:\n");
466         list_for_each_entry(dev, devices, dev_list)
467                 print_device_info(dev, "\t");
468         printf("\n");
469 }
470
471 static void print_scan_result(struct recover_control *rc)
472 {
473         if (!rc->verbose)
474                 return;
475
476         printf("DEVICE SCAN RESULT:\n");
477         printf("Filesystem Information:\n");
478         printf("\tsectorsize: %d\n", rc->sectorsize);
479         printf("\tleafsize: %d\n", rc->leafsize);
480         printf("\ttree root generation: %llu\n", rc->generation);
481         printf("\tchunk root generation: %llu\n", rc->chunk_root_generation);
482         printf("\n");
483
484         print_all_devices(&rc->fs_devices->devices);
485         print_block_group_tree(&rc->bg);
486         print_chunk_tree(&rc->chunk);
487         print_device_extent_tree(&rc->devext);
488 }
489
490 static void print_chunk_info(struct chunk_record *chunk, char *prefix)
491 {
492         struct device_extent_record *devext;
493         int i;
494
495         print_chunk_self_info(chunk, prefix);
496         if (prefix)
497                 printf("%s", prefix);
498         if (chunk->bg_rec)
499                 print_block_group_info(chunk->bg_rec, "    ");
500         else
501                 printf("    No block group.\n");
502         if (prefix)
503                 printf("%s", prefix);
504         if (list_empty(&chunk->dextents)) {
505                 printf("    No device extent.\n");
506         } else {
507                 printf("    Device extent list:\n");
508                 i = 0;
509                 list_for_each_entry(devext, &chunk->dextents, chunk_list) {
510                         if (prefix)
511                                 printf("%s", prefix);
512                         printf("%s[%2d]", "        ", i);
513                         print_device_extent_info(devext, NULL);
514                         i++;
515                 }
516         }
517 }
518
519 static void print_check_result(struct recover_control *rc)
520 {
521         struct chunk_record *chunk;
522         struct block_group_record *bg;
523         struct device_extent_record *devext;
524         int total = 0;
525         int good = 0;
526         int bad = 0;
527
528         if (!rc->verbose)
529                 return;
530
531         printf("CHECK RESULT:\n");
532         printf("Healthy Chunks:\n");
533         list_for_each_entry(chunk, &rc->good_chunks, list) {
534                 print_chunk_info(chunk, "  ");
535                 good++;
536                 total++;
537         }
538         printf("Bad Chunks:\n");
539         list_for_each_entry(chunk, &rc->bad_chunks, list) {
540                 print_chunk_info(chunk, "  ");
541                 bad++;
542                 total++;
543         }
544         printf("\n");
545         printf("Total Chunks:\t%d\n", total);
546         printf("  Heathy:\t%d\n", good);
547         printf("  Bad:\t%d\n", bad);
548
549         printf("\n");
550         printf("Orphan Block Groups:\n");
551         list_for_each_entry(bg, &rc->bg.block_groups, list)
552                 print_block_group_info(bg, "  ");
553
554         printf("\n");
555         printf("Orphan Device Extents:\n");
556         list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list)
557                 print_device_extent_info(devext, "  ");
558 }
559
560 static int check_chunk_by_metadata(struct recover_control *rc,
561                                    struct btrfs_root *root,
562                                    struct chunk_record *chunk, int bg_only)
563 {
564         int ret;
565         int i;
566         int slot;
567         struct btrfs_path path;
568         struct btrfs_key key;
569         struct btrfs_root *dev_root;
570         struct stripe *stripe;
571         struct btrfs_dev_extent *dev_extent;
572         struct btrfs_block_group_item *bg_ptr;
573         struct extent_buffer *l;
574
575         btrfs_init_path(&path);
576
577         if (bg_only)
578                 goto bg_check;
579
580         dev_root = root->fs_info->dev_root;
581         for (i = 0; i < chunk->num_stripes; i++) {
582                 stripe = &chunk->stripes[i];
583
584                 key.objectid = stripe->devid;
585                 key.offset = stripe->offset;
586                 key.type = BTRFS_DEV_EXTENT_KEY;
587
588                 ret = btrfs_search_slot(NULL, dev_root, &key, &path, 0, 0);
589                 if (ret < 0) {
590                         fprintf(stderr, "Search device extent failed(%d)\n",
591                                 ret);
592                         btrfs_release_path(&path);
593                         return ret;
594                 } else if (ret > 0) {
595                         if (rc->verbose)
596                                 fprintf(stderr,
597                                         "No device extent[%llu, %llu]\n",
598                                         stripe->devid, stripe->offset);
599                         btrfs_release_path(&path);
600                         return -ENOENT;
601                 }
602                 l = path.nodes[0];
603                 slot = path.slots[0];
604                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
605                 if (chunk->offset !=
606                     btrfs_dev_extent_chunk_offset(l, dev_extent)) {
607                         if (rc->verbose)
608                                 fprintf(stderr,
609                                         "Device tree unmatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
610                                         btrfs_dev_extent_chunk_offset(l,
611                                                                 dev_extent),
612                                         btrfs_dev_extent_length(l, dev_extent),
613                                         chunk->offset, chunk->length);
614                         btrfs_release_path(&path);
615                         return -ENOENT;
616                 }
617                 btrfs_release_path(&path);
618         }
619
620 bg_check:
621         key.objectid = chunk->offset;
622         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
623         key.offset = chunk->length;
624
625         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, &path,
626                                 0, 0);
627         if (ret < 0) {
628                 fprintf(stderr, "Search block group failed(%d)\n", ret);
629                 btrfs_release_path(&path);
630                 return ret;
631         } else if (ret > 0) {
632                 if (rc->verbose)
633                         fprintf(stderr, "No block group[%llu, %llu]\n",
634                                 key.objectid, key.offset);
635                 btrfs_release_path(&path);
636                 return -ENOENT;
637         }
638
639         l = path.nodes[0];
640         slot = path.slots[0];
641         bg_ptr = btrfs_item_ptr(l, slot, struct btrfs_block_group_item);
642         if (chunk->type_flags != btrfs_disk_block_group_flags(l, bg_ptr)) {
643                 if (rc->verbose)
644                         fprintf(stderr,
645                                 "Chunk[%llu, %llu]'s type(%llu) is differemt with Block Group's type(%llu)\n",
646                                 chunk->offset, chunk->length, chunk->type_flags,
647                                 btrfs_disk_block_group_flags(l, bg_ptr));
648                 btrfs_release_path(&path);
649                 return -ENOENT;
650         }
651         btrfs_release_path(&path);
652         return 0;
653 }
654
655 static int check_all_chunks_by_metadata(struct recover_control *rc,
656                                         struct btrfs_root *root)
657 {
658         struct chunk_record *chunk;
659         struct chunk_record *next;
660         LIST_HEAD(orphan_chunks);
661         int ret = 0;
662         int err;
663
664         list_for_each_entry_safe(chunk, next, &rc->good_chunks, list) {
665                 err = check_chunk_by_metadata(rc, root, chunk, 0);
666                 if (err) {
667                         if (err == -ENOENT)
668                                 list_move_tail(&chunk->list, &orphan_chunks);
669                         else if (err && !ret)
670                                 ret = err;
671                 }
672         }
673
674         list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
675                 err = check_chunk_by_metadata(rc, root, chunk, 1);
676                 if (err == -ENOENT)
677                         list_move_tail(&chunk->list, &orphan_chunks);
678                 else if (err && !ret)
679                         ret = err;
680         }
681
682         list_for_each_entry(chunk, &rc->bad_chunks, list) {
683                 err = check_chunk_by_metadata(rc, root, chunk, 1);
684                 if (err != -ENOENT && !ret)
685                         ret = err ? err : -EINVAL;
686         }
687         list_splice(&orphan_chunks, &rc->bad_chunks);
688         return ret;
689 }
690
691 static int extract_metadata_record(struct recover_control *rc,
692                                    struct extent_buffer *leaf)
693 {
694         struct btrfs_key key;
695         int ret = 0;
696         int i;
697         u32 nritems;
698
699         nritems = btrfs_header_nritems(leaf);
700         for (i = 0; i < nritems; i++) {
701                 btrfs_item_key_to_cpu(leaf, &key, i);
702                 switch (key.type) {
703                 case BTRFS_BLOCK_GROUP_ITEM_KEY:
704                         pthread_mutex_lock(&rc->rc_lock);
705                         ret = process_block_group_item(&rc->bg, leaf, &key, i);
706                         pthread_mutex_unlock(&rc->rc_lock);
707                         break;
708                 case BTRFS_CHUNK_ITEM_KEY:
709                         pthread_mutex_lock(&rc->rc_lock);
710                         ret = process_chunk_item(&rc->chunk, leaf, &key, i);
711                         pthread_mutex_unlock(&rc->rc_lock);
712                         break;
713                 case BTRFS_DEV_EXTENT_KEY:
714                         pthread_mutex_lock(&rc->rc_lock);
715                         ret = process_device_extent_item(&rc->devext, leaf,
716                                                          &key, i);
717                         pthread_mutex_unlock(&rc->rc_lock);
718                         break;
719                 }
720                 if (ret)
721                         break;
722         }
723         return ret;
724 }
725
726 static inline int is_super_block_address(u64 offset)
727 {
728         int i;
729
730         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
731                 if (offset == btrfs_sb_offset(i))
732                         return 1;
733         }
734         return 0;
735 }
736
737 static int scan_one_device(void *dev_scan_struct)
738 {
739         struct extent_buffer *buf;
740         u64 bytenr;
741         int ret = 0;
742         struct device_scan *dev_scan = (struct device_scan *)dev_scan_struct;
743         struct recover_control *rc = dev_scan->rc;
744         struct btrfs_device *device = dev_scan->dev;
745         int fd = dev_scan->fd;
746         int oldtype;
747
748         ret = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &oldtype);
749         if (ret)
750                 return 1;
751
752         buf = malloc(sizeof(*buf) + rc->leafsize);
753         if (!buf)
754                 return -ENOMEM;
755         buf->len = rc->leafsize;
756
757         bytenr = 0;
758         while (1) {
759                 if (is_super_block_address(bytenr))
760                         bytenr += rc->sectorsize;
761
762                 if (pread64(fd, buf->data, rc->leafsize, bytenr) <
763                     rc->leafsize)
764                         break;
765
766                 if (memcmp_extent_buffer(buf, rc->fs_devices->fsid,
767                                          btrfs_header_fsid(),
768                                          BTRFS_FSID_SIZE)) {
769                         bytenr += rc->sectorsize;
770                         continue;
771                 }
772
773                 if (verify_tree_block_csum_silent(buf, rc->csum_size)) {
774                         bytenr += rc->sectorsize;
775                         continue;
776                 }
777
778                 pthread_mutex_lock(&rc->rc_lock);
779                 ret = process_extent_buffer(&rc->eb_cache, buf, device, bytenr);
780                 pthread_mutex_unlock(&rc->rc_lock);
781                 if (ret)
782                         goto out;
783
784                 if (btrfs_header_level(buf) != 0)
785                         goto next_node;
786
787                 switch (btrfs_header_owner(buf)) {
788                 case BTRFS_EXTENT_TREE_OBJECTID:
789                 case BTRFS_DEV_TREE_OBJECTID:
790                         /* different tree use different generation */
791                         if (btrfs_header_generation(buf) > rc->generation)
792                                 break;
793                         ret = extract_metadata_record(rc, buf);
794                         if (ret)
795                                 goto out;
796                         break;
797                 case BTRFS_CHUNK_TREE_OBJECTID:
798                         if (btrfs_header_generation(buf) >
799                             rc->chunk_root_generation)
800                                 break;
801                         ret = extract_metadata_record(rc, buf);
802                         if (ret)
803                                 goto out;
804                         break;
805                 }
806 next_node:
807                 bytenr += rc->leafsize;
808         }
809 out:
810         close(fd);
811         free(buf);
812         return ret;
813 }
814
815 static int scan_devices(struct recover_control *rc)
816 {
817         int ret = 0;
818         int fd;
819         struct btrfs_device *dev;
820         struct device_scan *dev_scans;
821         pthread_t *t_scans;
822         int *t_rets;
823         int devnr = 0;
824         int devidx = 0;
825         int cancel_from = 0;
826         int cancel_to = 0;
827         int i;
828
829         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list)
830                 devnr++;
831         dev_scans = (struct device_scan *)malloc(sizeof(struct device_scan)
832                                                  * devnr);
833         if (!dev_scans)
834                 return -ENOMEM;
835         t_scans = (pthread_t *)malloc(sizeof(pthread_t) * devnr);
836         if (!t_scans)
837                 return -ENOMEM;
838         t_rets = (int *)malloc(sizeof(int) * devnr);
839         if (!t_rets)
840                 return -ENOMEM;
841
842         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
843                 fd = open(dev->name, O_RDONLY);
844                 if (fd < 0) {
845                         fprintf(stderr, "Failed to open device %s\n",
846                                 dev->name);
847                         ret = 1;
848                         goto out2;
849                 }
850                 dev_scans[devidx].rc = rc;
851                 dev_scans[devidx].dev = dev;
852                 dev_scans[devidx].fd = fd;
853                 ret = pthread_create(&t_scans[devidx], NULL,
854                                      (void *)scan_one_device,
855                                      (void *)&dev_scans[devidx]);
856                 if (ret) {
857                         cancel_from = 0;
858                         cancel_to = devidx - 1;
859                         goto out1;
860                 }
861                 devidx++;
862         }
863
864         i = 0;
865         while (i < devidx) {
866                 ret = pthread_join(t_scans[i], (void **)&t_rets[i]);
867                 if (ret || t_rets[i]) {
868                         ret = 1;
869                         cancel_from = i + 1;
870                         cancel_to = devnr - 1;
871                         goto out1;
872                 }
873                 i++;
874         }
875 out1:
876         while (ret && (cancel_from <= cancel_to)) {
877                 pthread_cancel(t_scans[cancel_from]);
878                 cancel_from++;
879         }
880 out2:
881         free(dev_scans);
882         free(t_scans);
883         free(t_rets);
884         return !!ret;
885 }
886
887 static int build_device_map_by_chunk_record(struct btrfs_root *root,
888                                             struct chunk_record *chunk)
889 {
890         int ret = 0;
891         int i;
892         u64 devid;
893         u8 uuid[BTRFS_UUID_SIZE];
894         u16 num_stripes;
895         struct btrfs_mapping_tree *map_tree;
896         struct map_lookup *map;
897         struct stripe *stripe;
898
899         map_tree = &root->fs_info->mapping_tree;
900         num_stripes = chunk->num_stripes;
901         map = malloc(btrfs_map_lookup_size(num_stripes));
902         if (!map)
903                 return -ENOMEM;
904         map->ce.start = chunk->offset;
905         map->ce.size = chunk->length;
906         map->num_stripes = num_stripes;
907         map->io_width = chunk->io_width;
908         map->io_align = chunk->io_align;
909         map->sector_size = chunk->sector_size;
910         map->stripe_len = chunk->stripe_len;
911         map->type = chunk->type_flags;
912         map->sub_stripes = chunk->sub_stripes;
913
914         for (i = 0, stripe = chunk->stripes; i < num_stripes; i++, stripe++) {
915                 devid = stripe->devid;
916                 memcpy(uuid, stripe->dev_uuid, BTRFS_UUID_SIZE);
917                 map->stripes[i].physical = stripe->offset;
918                 map->stripes[i].dev = btrfs_find_device(root, devid,
919                                                         uuid, NULL);
920                 if (!map->stripes[i].dev) {
921                         kfree(map);
922                         return -EIO;
923                 }
924         }
925
926         ret = insert_cache_extent(&map_tree->cache_tree, &map->ce);
927         return ret;
928 }
929
930 static int build_device_maps_by_chunk_records(struct recover_control *rc,
931                                               struct btrfs_root *root)
932 {
933         int ret = 0;
934         struct chunk_record *chunk;
935
936         list_for_each_entry(chunk, &rc->good_chunks, list) {
937                 ret = build_device_map_by_chunk_record(root, chunk);
938                 if (ret)
939                         return ret;
940         }
941         return ret;
942 }
943
944 static int block_group_remove_all_extent_items(struct btrfs_trans_handle *trans,
945                                                struct btrfs_root *root,
946                                                struct block_group_record *bg)
947 {
948         struct btrfs_fs_info *fs_info = root->fs_info;
949         struct btrfs_key key;
950         struct btrfs_path path;
951         struct extent_buffer *leaf;
952         u64 start = bg->objectid;
953         u64 end = bg->objectid + bg->offset;
954         u64 old_val;
955         int nitems;
956         int ret;
957         int i;
958         int del_s, del_nr;
959
960         btrfs_init_path(&path);
961         root = root->fs_info->extent_root;
962
963         key.objectid = start;
964         key.offset = 0;
965         key.type = BTRFS_EXTENT_ITEM_KEY;
966 again:
967         ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
968         if (ret < 0)
969                 goto err;
970         else if (ret > 0)
971                 ret = 0;
972
973         leaf = path.nodes[0];
974         nitems = btrfs_header_nritems(leaf);
975         if (!nitems) {
976                 /* The tree is empty. */
977                 ret = 0;
978                 goto err;
979         }
980
981         if (path.slots[0] >= nitems) {
982                 ret = btrfs_next_leaf(root, &path);
983                 if (ret < 0)
984                         goto err;
985                 if (ret > 0) {
986                         ret = 0;
987                         goto err;
988                 }
989                 leaf = path.nodes[0];
990                 btrfs_item_key_to_cpu(leaf, &key, 0);
991                 if (key.objectid >= end)
992                         goto err;
993                 btrfs_release_path(&path);
994                 goto again;
995         }
996
997         del_nr = 0;
998         del_s = -1;
999         for (i = path.slots[0]; i < nitems; i++) {
1000                 btrfs_item_key_to_cpu(leaf, &key, i);
1001                 if (key.objectid >= end)
1002                         break;
1003
1004                 if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1005                         if (del_nr == 0)
1006                                 continue;
1007                         else
1008                                 break;
1009                 }
1010
1011                 if (del_s == -1)
1012                         del_s = i;
1013                 del_nr++;
1014                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
1015                     key.type == BTRFS_METADATA_ITEM_KEY) {
1016                         old_val = btrfs_super_bytes_used(fs_info->super_copy);
1017                         if (key.type == BTRFS_METADATA_ITEM_KEY)
1018                                 old_val += root->leafsize;
1019                         else
1020                                 old_val += key.offset;
1021                         btrfs_set_super_bytes_used(fs_info->super_copy,
1022                                                    old_val);
1023                 }
1024         }
1025
1026         if (del_nr) {
1027                 ret = btrfs_del_items(trans, root, &path, del_s, del_nr);
1028                 if (ret)
1029                         goto err;
1030         }
1031
1032         if (key.objectid < end) {
1033                 if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
1034                         key.objectid += root->sectorsize;
1035                         key.type = BTRFS_EXTENT_ITEM_KEY;
1036                         key.offset = 0;
1037                 }
1038                 btrfs_release_path(&path);
1039                 goto again;
1040         }
1041 err:
1042         btrfs_release_path(&path);
1043         return ret;
1044 }
1045
1046 static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
1047                                        struct btrfs_root *root,
1048                                        struct block_group_record *bg)
1049 {
1050         struct btrfs_block_group_cache *cache;
1051         struct btrfs_fs_info *info;
1052         u64 start;
1053         u64 end;
1054
1055         info = root->fs_info;
1056         cache = btrfs_lookup_block_group(info, bg->objectid);
1057         if (!cache)
1058                 return -ENOENT;
1059
1060         start = cache->key.objectid;
1061         end = start + cache->key.offset - 1;
1062
1063         set_extent_bits(&info->block_group_cache, start, end,
1064                         BLOCK_GROUP_DIRTY, GFP_NOFS);
1065         set_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS);
1066
1067         btrfs_set_block_group_used(&cache->item, 0);
1068
1069         return 0;
1070 }
1071
1072 static int remove_chunk_extent_item(struct btrfs_trans_handle *trans,
1073                                     struct recover_control *rc,
1074                                     struct btrfs_root *root)
1075 {
1076         struct chunk_record *chunk;
1077         int ret = 0;
1078
1079         list_for_each_entry(chunk, &rc->good_chunks, list) {
1080                 if (!(chunk->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1081                         continue;
1082                 ret = block_group_remove_all_extent_items(trans, root,
1083                                                           chunk->bg_rec);
1084                 if (ret)
1085                         return ret;
1086
1087                 ret = block_group_free_all_extent(trans, root, chunk->bg_rec);
1088                 if (ret)
1089                         return ret;
1090         }
1091         return ret;
1092 }
1093
1094 static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
1095                                 struct recover_control *rc,
1096                                 struct btrfs_root *root)
1097 {
1098         u64 min_devid = -1;
1099         struct btrfs_device *dev;
1100         struct extent_buffer *cow;
1101         struct btrfs_disk_key disk_key;
1102         int ret = 0;
1103
1104         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
1105                 if (min_devid > dev->devid)
1106                         min_devid = dev->devid;
1107         }
1108         disk_key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1109         disk_key.type = BTRFS_DEV_ITEM_KEY;
1110         disk_key.offset = min_devid;
1111
1112         cow = btrfs_alloc_free_block(trans, root, root->nodesize,
1113                                      BTRFS_CHUNK_TREE_OBJECTID,
1114                                      &disk_key, 0, 0, 0);
1115         btrfs_set_header_bytenr(cow, cow->start);
1116         btrfs_set_header_generation(cow, trans->transid);
1117         btrfs_set_header_nritems(cow, 0);
1118         btrfs_set_header_level(cow, 0);
1119         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1120         btrfs_set_header_owner(cow, BTRFS_CHUNK_TREE_OBJECTID);
1121         write_extent_buffer(cow, root->fs_info->fsid,
1122                         btrfs_header_fsid(), BTRFS_FSID_SIZE);
1123
1124         write_extent_buffer(cow, root->fs_info->chunk_tree_uuid,
1125                         btrfs_header_chunk_tree_uuid(cow),
1126                         BTRFS_UUID_SIZE);
1127
1128         root->node = cow;
1129         btrfs_mark_buffer_dirty(cow);
1130
1131         return ret;
1132 }
1133
1134 static int __rebuild_device_items(struct btrfs_trans_handle *trans,
1135                                   struct recover_control *rc,
1136                                   struct btrfs_root *root)
1137 {
1138         struct btrfs_device *dev;
1139         struct btrfs_key key;
1140         struct btrfs_dev_item *dev_item;
1141         int ret = 0;
1142
1143         dev_item = malloc(sizeof(struct btrfs_dev_item));
1144         if (!dev_item)
1145                 return -ENOMEM;
1146
1147         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
1148                 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1149                 key.type = BTRFS_DEV_ITEM_KEY;
1150                 key.offset = dev->devid;
1151
1152                 btrfs_set_stack_device_generation(dev_item, 0);
1153                 btrfs_set_stack_device_type(dev_item, dev->type);
1154                 btrfs_set_stack_device_id(dev_item, dev->devid);
1155                 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
1156                 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
1157                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
1158                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
1159                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
1160                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
1161                 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
1162
1163                 ret = btrfs_insert_item(trans, root, &key,
1164                                         dev_item, sizeof(*dev_item));
1165         }
1166
1167         free(dev_item);
1168         return ret;
1169 }
1170
1171 static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
1172                                  struct recover_control *rc,
1173                                  struct btrfs_root *root)
1174 {
1175         struct btrfs_key key;
1176         struct btrfs_chunk *chunk = NULL;
1177         struct btrfs_root *chunk_root;
1178         struct chunk_record *chunk_rec;
1179         int ret;
1180
1181         chunk_root = root->fs_info->chunk_root;
1182
1183         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1184                 chunk = create_chunk_item(chunk_rec);
1185                 if (!chunk)
1186                         return -ENOMEM;
1187
1188                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1189                 key.type = BTRFS_CHUNK_ITEM_KEY;
1190                 key.offset = chunk_rec->offset;
1191
1192                 ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
1193                                 btrfs_chunk_item_size(chunk->num_stripes));
1194                 free(chunk);
1195                 if (ret)
1196                         return ret;
1197         }
1198         return 0;
1199 }
1200
1201 static int rebuild_chunk_tree(struct btrfs_trans_handle *trans,
1202                               struct recover_control *rc,
1203                               struct btrfs_root *root)
1204 {
1205         int ret = 0;
1206
1207         root = root->fs_info->chunk_root;
1208
1209         ret = __rebuild_chunk_root(trans, rc, root);
1210         if (ret)
1211                 return ret;
1212
1213         ret = __rebuild_device_items(trans, rc, root);
1214         if (ret)
1215                 return ret;
1216
1217         ret = __rebuild_chunk_items(trans, rc, root);
1218
1219         return ret;
1220 }
1221
1222 static int rebuild_sys_array(struct recover_control *rc,
1223                              struct btrfs_root *root)
1224 {
1225         struct btrfs_chunk *chunk;
1226         struct btrfs_key key;
1227         struct chunk_record *chunk_rec;
1228         int ret = 0;
1229         u16 num_stripes;
1230
1231         btrfs_set_super_sys_array_size(root->fs_info->super_copy, 0);
1232
1233         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1234                 if (!(chunk_rec->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1235                         continue;
1236
1237                 num_stripes = chunk_rec->num_stripes;
1238                 chunk = create_chunk_item(chunk_rec);
1239                 if (!chunk) {
1240                         ret = -ENOMEM;
1241                         break;
1242                 }
1243
1244                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1245                 key.type = BTRFS_CHUNK_ITEM_KEY;
1246                 key.offset = chunk_rec->offset;
1247
1248                 ret = btrfs_add_system_chunk(NULL, root, &key, chunk,
1249                                 btrfs_chunk_item_size(num_stripes));
1250                 free(chunk);
1251                 if (ret)
1252                         break;
1253         }
1254         return ret;
1255
1256 }
1257
1258 static struct btrfs_root *
1259 open_ctree_with_broken_chunk(struct recover_control *rc)
1260 {
1261         struct btrfs_fs_info *fs_info;
1262         struct btrfs_super_block *disk_super;
1263         struct extent_buffer *eb;
1264         u32 sectorsize;
1265         u32 nodesize;
1266         u32 leafsize;
1267         u32 stripesize;
1268         int ret;
1269
1270         fs_info = btrfs_new_fs_info(1, BTRFS_SUPER_INFO_OFFSET);
1271         if (!fs_info) {
1272                 fprintf(stderr, "Failed to allocate memory for fs_info\n");
1273                 return ERR_PTR(-ENOMEM);
1274         }
1275         fs_info->is_chunk_recover = 1;
1276
1277         fs_info->fs_devices = rc->fs_devices;
1278         ret = btrfs_open_devices(fs_info->fs_devices, O_RDWR);
1279         if (ret)
1280                 goto out;
1281
1282         disk_super = fs_info->super_copy;
1283         ret = btrfs_read_dev_super(fs_info->fs_devices->latest_bdev,
1284                                    disk_super, fs_info->super_bytenr, 1);
1285         if (ret) {
1286                 fprintf(stderr, "No valid btrfs found\n");
1287                 goto out_devices;
1288         }
1289
1290         memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);
1291
1292         ret = btrfs_check_fs_compatibility(disk_super, 1);
1293         if (ret)
1294                 goto out_devices;
1295
1296         nodesize = btrfs_super_nodesize(disk_super);
1297         leafsize = btrfs_super_leafsize(disk_super);
1298         sectorsize = btrfs_super_sectorsize(disk_super);
1299         stripesize = btrfs_super_stripesize(disk_super);
1300
1301         __setup_root(nodesize, leafsize, sectorsize, stripesize,
1302                      fs_info->chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1303
1304         ret = build_device_maps_by_chunk_records(rc, fs_info->chunk_root);
1305         if (ret)
1306                 goto out_cleanup;
1307
1308         ret = btrfs_setup_all_roots(fs_info, 0, 0);
1309         if (ret)
1310                 goto out_failed;
1311
1312         eb = fs_info->tree_root->node;
1313         read_extent_buffer(eb, fs_info->chunk_tree_uuid,
1314                            btrfs_header_chunk_tree_uuid(eb),
1315                            BTRFS_UUID_SIZE);
1316
1317         return fs_info->fs_root;
1318 out_failed:
1319         btrfs_release_all_roots(fs_info);
1320 out_cleanup:
1321         btrfs_cleanup_all_caches(fs_info);
1322 out_devices:
1323         btrfs_close_devices(fs_info->fs_devices);
1324 out:
1325         btrfs_free_fs_info(fs_info);
1326         return ERR_PTR(ret);
1327 }
1328
1329 static int recover_prepare(struct recover_control *rc, char *path)
1330 {
1331         int ret;
1332         int fd;
1333         struct btrfs_super_block *sb;
1334         struct btrfs_fs_devices *fs_devices;
1335
1336         ret = 0;
1337         fd = open(path, O_RDONLY);
1338         if (fd < 0) {
1339                 fprintf(stderr, "open %s\n error.\n", path);
1340                 return -1;
1341         }
1342
1343         sb = malloc(sizeof(struct btrfs_super_block));
1344         if (!sb) {
1345                 fprintf(stderr, "allocating memory for sb failed.\n");
1346                 ret = -ENOMEM;
1347                 goto fail_close_fd;
1348         }
1349
1350         ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET, 1);
1351         if (ret) {
1352                 fprintf(stderr, "read super block error\n");
1353                 goto fail_free_sb;
1354         }
1355
1356         rc->sectorsize = btrfs_super_sectorsize(sb);
1357         rc->leafsize = btrfs_super_leafsize(sb);
1358         rc->generation = btrfs_super_generation(sb);
1359         rc->chunk_root_generation = btrfs_super_chunk_root_generation(sb);
1360         rc->csum_size = btrfs_super_csum_size(sb);
1361
1362         /* if seed, the result of scanning below will be partial */
1363         if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
1364                 fprintf(stderr, "this device is seed device\n");
1365                 ret = -1;
1366                 goto fail_free_sb;
1367         }
1368
1369         ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0, 1, 1);
1370         if (ret)
1371                 goto fail_free_sb;
1372
1373         rc->fs_devices = fs_devices;
1374
1375         if (rc->verbose)
1376                 print_all_devices(&rc->fs_devices->devices);
1377
1378 fail_free_sb:
1379         free(sb);
1380 fail_close_fd:
1381         close(fd);
1382         return ret;
1383 }
1384
1385 static int btrfs_get_device_extents(u64 chunk_object,
1386                                     struct list_head *orphan_devexts,
1387                                     struct list_head *ret_list)
1388 {
1389         struct device_extent_record *devext;
1390         struct device_extent_record *next;
1391         int count = 0;
1392
1393         list_for_each_entry_safe(devext, next, orphan_devexts, chunk_list) {
1394                 if (devext->chunk_offset == chunk_object) {
1395                         list_move_tail(&devext->chunk_list, ret_list);
1396                         count++;
1397                 }
1398         }
1399         return count;
1400 }
1401
1402 static int calc_num_stripes(u64 type)
1403 {
1404         if (type & (BTRFS_BLOCK_GROUP_RAID0 |
1405                     BTRFS_BLOCK_GROUP_RAID10 |
1406                     BTRFS_BLOCK_GROUP_RAID5 |
1407                     BTRFS_BLOCK_GROUP_RAID6))
1408                 return 0;
1409         else if (type & (BTRFS_BLOCK_GROUP_RAID1 |
1410                          BTRFS_BLOCK_GROUP_DUP))
1411                 return 2;
1412         else
1413                 return 1;
1414 }
1415
1416 static inline int calc_sub_nstripes(u64 type)
1417 {
1418         if (type & BTRFS_BLOCK_GROUP_RAID10)
1419                 return 2;
1420         else
1421                 return 1;
1422 }
1423
1424 static int btrfs_verify_device_extents(struct block_group_record *bg,
1425                                        struct list_head *devexts, int ndevexts)
1426 {
1427         struct device_extent_record *devext;
1428         u64 strpie_length;
1429         int expected_num_stripes;
1430
1431         expected_num_stripes = calc_num_stripes(bg->flags);
1432         if (expected_num_stripes && expected_num_stripes != ndevexts)
1433                 return 1;
1434
1435         strpie_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
1436         list_for_each_entry(devext, devexts, chunk_list) {
1437                 if (devext->length != strpie_length)
1438                         return 1;
1439         }
1440         return 0;
1441 }
1442
1443 static int btrfs_rebuild_unordered_chunk_stripes(struct recover_control *rc,
1444                                                  struct chunk_record *chunk)
1445 {
1446         struct device_extent_record *devext;
1447         struct btrfs_device *device;
1448         int i;
1449
1450         devext = list_first_entry(&chunk->dextents, struct device_extent_record,
1451                                   chunk_list);
1452         for (i = 0; i < chunk->num_stripes; i++) {
1453                 chunk->stripes[i].devid = devext->objectid;
1454                 chunk->stripes[i].offset = devext->offset;
1455                 device = btrfs_find_device_by_devid(rc->fs_devices,
1456                                                     devext->objectid,
1457                                                     0);
1458                 if (!device)
1459                         return -ENOENT;
1460                 BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
1461                                                   devext->objectid,
1462                                                   1));
1463                 memcpy(chunk->stripes[i].dev_uuid, device->uuid,
1464                        BTRFS_UUID_SIZE);
1465                 devext = list_next_entry(devext, chunk_list);
1466         }
1467         return 0;
1468 }
1469
1470 static int btrfs_calc_stripe_index(struct chunk_record *chunk, u64 logical)
1471 {
1472         u64 offset = logical - chunk->offset;
1473         int stripe_nr;
1474         int nr_data_stripes;
1475         int index;
1476
1477         stripe_nr = offset / chunk->stripe_len;
1478         if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID0) {
1479                 index = stripe_nr % chunk->num_stripes;
1480         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID10) {
1481                 index = stripe_nr % (chunk->num_stripes / chunk->sub_stripes);
1482                 index *= chunk->sub_stripes;
1483         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5) {
1484                 nr_data_stripes = chunk->num_stripes - 1;
1485                 index = stripe_nr % nr_data_stripes;
1486                 stripe_nr /= nr_data_stripes;
1487                 index = (index + stripe_nr) % chunk->num_stripes;
1488         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6) {
1489                 nr_data_stripes = chunk->num_stripes - 2;
1490                 index = stripe_nr % nr_data_stripes;
1491                 stripe_nr /= nr_data_stripes;
1492                 index = (index + stripe_nr) % chunk->num_stripes;
1493         } else {
1494                 BUG_ON(1);
1495         }
1496         return index;
1497 }
1498
1499 /* calc the logical offset which is the start of the next stripe. */
1500 static inline u64 btrfs_next_stripe_logical_offset(struct chunk_record *chunk,
1501                                                    u64 logical)
1502 {
1503         u64 offset = logical - chunk->offset;
1504
1505         offset /= chunk->stripe_len;
1506         offset *= chunk->stripe_len;
1507         offset += chunk->stripe_len;
1508
1509         return offset + chunk->offset;
1510 }
1511
1512 static int is_extent_record_in_device_extent(struct extent_record *er,
1513                                              struct device_extent_record *dext,
1514                                              int *mirror)
1515 {
1516         int i;
1517
1518         for (i = 0; i < er->nmirrors; i++) {
1519                 if (er->devices[i]->devid == dext->objectid &&
1520                     er->offsets[i] >= dext->offset &&
1521                     er->offsets[i] < dext->offset + dext->length) {
1522                         *mirror = i;
1523                         return 1;
1524                 }
1525         }
1526         return 0;
1527 }
1528
1529 static int
1530 btrfs_rebuild_ordered_meta_chunk_stripes(struct recover_control *rc,
1531                                          struct chunk_record *chunk)
1532 {
1533         u64 start = chunk->offset;
1534         u64 end = chunk->offset + chunk->length;
1535         struct cache_extent *cache;
1536         struct extent_record *er;
1537         struct device_extent_record *devext;
1538         struct device_extent_record *next;
1539         struct btrfs_device *device;
1540         LIST_HEAD(devexts);
1541         int index;
1542         int mirror;
1543         int ret;
1544
1545         cache = lookup_cache_extent(&rc->eb_cache,
1546                                     start, chunk->length);
1547         if (!cache) {
1548                 /* No used space, we can reorder the stripes freely. */
1549                 ret = btrfs_rebuild_unordered_chunk_stripes(rc, chunk);
1550                 return ret;
1551         }
1552
1553         list_splice_init(&chunk->dextents, &devexts);
1554 again:
1555         er = container_of(cache, struct extent_record, cache);
1556         index = btrfs_calc_stripe_index(chunk, er->cache.start);
1557         if (chunk->stripes[index].devid)
1558                 goto next;
1559         list_for_each_entry_safe(devext, next, &devexts, chunk_list) {
1560                 if (is_extent_record_in_device_extent(er, devext, &mirror)) {
1561                         chunk->stripes[index].devid = devext->objectid;
1562                         chunk->stripes[index].offset = devext->offset;
1563                         memcpy(chunk->stripes[index].dev_uuid,
1564                                er->devices[mirror]->uuid,
1565                                BTRFS_UUID_SIZE);
1566                         index++;
1567                         list_move(&devext->chunk_list, &chunk->dextents);
1568                 }
1569         }
1570 next:
1571         start = btrfs_next_stripe_logical_offset(chunk, er->cache.start);
1572         if (start >= end)
1573                 goto no_extent_record;
1574
1575         cache = lookup_cache_extent(&rc->eb_cache, start, end - start);
1576         if (cache)
1577                 goto again;
1578 no_extent_record:
1579         if (list_empty(&devexts))
1580                 return 0;
1581
1582         if (chunk->type_flags & (BTRFS_BLOCK_GROUP_RAID5 |
1583                                  BTRFS_BLOCK_GROUP_RAID6)) {
1584                 /* Fixme: try to recover the order by the parity block. */
1585                 list_splice_tail(&devexts, &chunk->dextents);
1586                 return -EINVAL;
1587         }
1588
1589         /* There is no data on the lost stripes, we can reorder them freely. */
1590         for (index = 0; index < chunk->num_stripes; index++) {
1591                 if (chunk->stripes[index].devid)
1592                         continue;
1593
1594                 devext = list_first_entry(&devexts,
1595                                           struct device_extent_record,
1596                                            chunk_list);
1597                 list_move(&devext->chunk_list, &chunk->dextents);
1598
1599                 chunk->stripes[index].devid = devext->objectid;
1600                 chunk->stripes[index].offset = devext->offset;
1601                 device = btrfs_find_device_by_devid(rc->fs_devices,
1602                                                     devext->objectid,
1603                                                     0);
1604                 if (!device) {
1605                         list_splice_tail(&devexts, &chunk->dextents);
1606                         return -EINVAL;
1607                 }
1608                 BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
1609                                                   devext->objectid,
1610                                                   1));
1611                 memcpy(chunk->stripes[index].dev_uuid, device->uuid,
1612                        BTRFS_UUID_SIZE);
1613         }
1614         return 0;
1615 }
1616
1617 #define BTRFS_ORDERED_RAID      (BTRFS_BLOCK_GROUP_RAID0 |      \
1618                                  BTRFS_BLOCK_GROUP_RAID10 |     \
1619                                  BTRFS_BLOCK_GROUP_RAID5 |      \
1620                                  BTRFS_BLOCK_GROUP_RAID6)
1621
1622 static int btrfs_rebuild_chunk_stripes(struct recover_control *rc,
1623                                        struct chunk_record *chunk)
1624 {
1625         int ret;
1626
1627         /*
1628          * All the data in the system metadata chunk will be dropped,
1629          * so we need not guarantee that the data is right or not, that
1630          * is we can reorder the stripes in the system metadata chunk.
1631          */
1632         if ((chunk->type_flags & BTRFS_BLOCK_GROUP_METADATA) &&
1633             (chunk->type_flags & BTRFS_ORDERED_RAID))
1634                 ret =btrfs_rebuild_ordered_meta_chunk_stripes(rc, chunk);
1635         else if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA) &&
1636                  (chunk->type_flags & BTRFS_ORDERED_RAID))
1637                 ret = 1;        /* Be handled after the fs is opened. */
1638         else
1639                 ret = btrfs_rebuild_unordered_chunk_stripes(rc, chunk);
1640
1641         return ret;
1642 }
1643
1644 static int next_csum(struct btrfs_root *root,
1645                      struct extent_buffer **leaf,
1646                      struct btrfs_path *path,
1647                      int *slot,
1648                      u64 *csum_offset,
1649                      u32 *tree_csum,
1650                      u64 end,
1651                      struct btrfs_key *key)
1652 {
1653         int ret = 0;
1654         struct btrfs_root *csum_root = root->fs_info->csum_root;
1655         struct btrfs_csum_item *csum_item;
1656         u32 blocksize = root->sectorsize;
1657         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1658         int csums_in_item = btrfs_item_size_nr(*leaf, *slot) / csum_size;
1659
1660         if (*csum_offset >= csums_in_item) {
1661                 ++(*slot);
1662                 *csum_offset = 0;
1663                 if (*slot >= btrfs_header_nritems(*leaf)) {
1664                         ret = btrfs_next_leaf(csum_root, path);
1665                         if (ret < 0)
1666                                 return -1;
1667                         else if (ret > 0)
1668                                 return 1;
1669                         *leaf = path->nodes[0];
1670                         *slot = path->slots[0];
1671                 }
1672                 btrfs_item_key_to_cpu(*leaf, key, *slot);
1673         }
1674
1675         if (key->offset + (*csum_offset) * blocksize >= end)
1676                 return 2;
1677         csum_item = btrfs_item_ptr(*leaf, *slot, struct btrfs_csum_item);
1678         csum_item = (struct btrfs_csum_item *)((unsigned char *)csum_item
1679                                              + (*csum_offset) * csum_size);
1680         read_extent_buffer(*leaf, tree_csum,
1681                           (unsigned long)csum_item, csum_size);
1682         return ret;
1683 }
1684
1685 static u64 calc_data_offset(struct btrfs_key *key,
1686                             struct chunk_record *chunk,
1687                             u64 dev_offset,
1688                             u64 csum_offset,
1689                             u32 blocksize)
1690 {
1691         u64 data_offset;
1692         int logical_stripe_nr;
1693         int dev_stripe_nr;
1694         int nr_data_stripes;
1695
1696         data_offset = key->offset + csum_offset * blocksize - chunk->offset;
1697         nr_data_stripes = chunk->num_stripes;
1698
1699         if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5)
1700                 nr_data_stripes -= 1;
1701         else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6)
1702                 nr_data_stripes -= 2;
1703
1704         logical_stripe_nr = data_offset / chunk->stripe_len;
1705         dev_stripe_nr = logical_stripe_nr / nr_data_stripes;
1706
1707         data_offset -= logical_stripe_nr * chunk->stripe_len;
1708         data_offset += dev_stripe_nr * chunk->stripe_len;
1709
1710         return dev_offset + data_offset;
1711 }
1712
1713 static int check_one_csum(int fd, u64 start, u32 len, u32 tree_csum)
1714 {
1715         char *data;
1716         int ret = 0;
1717         u32 csum_result = ~(u32)0;
1718
1719         data = malloc(len);
1720         if (!data)
1721                 return -1;
1722         ret = pread64(fd, data, len, start);
1723         if (ret < 0 || ret != len) {
1724                 ret = -1;
1725                 goto out;
1726         }
1727         ret = 0;
1728         csum_result = btrfs_csum_data(NULL, data, csum_result, len);
1729         btrfs_csum_final(csum_result, (char *)&csum_result);
1730         if (csum_result != tree_csum)
1731                 ret = 1;
1732 out:
1733         free(data);
1734         return ret;
1735 }
1736
1737 static u64 item_end_offset(struct btrfs_root *root, struct btrfs_key *key,
1738                            struct extent_buffer *leaf, int slot) {
1739         u32 blocksize = root->sectorsize;
1740         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1741
1742         u64 offset = btrfs_item_size_nr(leaf, slot);
1743         offset /= csum_size;
1744         offset *= blocksize;
1745         offset += key->offset;
1746
1747         return offset;
1748 }
1749
1750 static int insert_stripe(struct list_head *devexts,
1751                          struct recover_control *rc,
1752                          struct chunk_record *chunk,
1753                          int index) {
1754         struct device_extent_record *devext;
1755         struct btrfs_device *dev;
1756
1757         devext = list_entry(devexts->next, struct device_extent_record,
1758                             chunk_list);
1759         dev = btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
1760                                         0);
1761         if (!dev)
1762                 return 1;
1763         BUG_ON(btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
1764                                         1));
1765
1766         chunk->stripes[index].devid = devext->objectid;
1767         chunk->stripes[index].offset = devext->offset;
1768         memcpy(chunk->stripes[index].dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
1769
1770         list_move(&devext->chunk_list, &chunk->dextents);
1771
1772         return 0;
1773 }
1774
1775 static inline int count_devext_records(struct list_head *record_list)
1776 {
1777         int num_of_records = 0;
1778         struct device_extent_record *devext;
1779
1780         list_for_each_entry(devext, record_list, chunk_list)
1781                 num_of_records++;
1782
1783         return num_of_records;
1784 }
1785
1786 static int fill_chunk_up(struct chunk_record *chunk, struct list_head *devexts,
1787                          struct recover_control *rc)
1788 {
1789         int ret = 0;
1790         int i;
1791
1792         for (i = 0; i < chunk->num_stripes; i++) {
1793                 if (!chunk->stripes[i].devid) {
1794                         ret = insert_stripe(devexts, rc, chunk, i);
1795                         if (ret)
1796                                 break;
1797                 }
1798         }
1799
1800         return ret;
1801 }
1802
1803 #define EQUAL_STRIPE (1 << 0)
1804
1805 static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
1806                                            struct btrfs_root *root,
1807                                            struct chunk_record *chunk,
1808                                            u8 *flags)
1809 {
1810         int i;
1811         int ret = 0;
1812         int slot;
1813         struct btrfs_path path;
1814         struct btrfs_key prev_key;
1815         struct btrfs_key key;
1816         struct btrfs_root *csum_root;
1817         struct extent_buffer *leaf;
1818         struct device_extent_record *devext;
1819         struct device_extent_record *next;
1820         struct btrfs_device *dev;
1821         u64 start = chunk->offset;
1822         u64 end = start + chunk->stripe_len;
1823         u64 chunk_end = chunk->offset + chunk->length;
1824         u64 csum_offset = 0;
1825         u64 data_offset;
1826         u32 blocksize = root->sectorsize;
1827         u32 tree_csum;
1828         int index = 0;
1829         int num_unordered = 0;
1830         LIST_HEAD(unordered);
1831         LIST_HEAD(candidates);
1832
1833         csum_root = root->fs_info->csum_root;
1834         btrfs_init_path(&path);
1835         list_splice_init(&chunk->dextents, &candidates);
1836 again:
1837         if (list_is_last(candidates.next, &candidates))
1838                 goto out;
1839
1840         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1841         key.type = BTRFS_EXTENT_CSUM_KEY;
1842         key.offset = start;
1843
1844         ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
1845         if (ret < 0) {
1846                 fprintf(stderr, "Search csum failed(%d)\n", ret);
1847                 goto fail_out;
1848         }
1849         leaf = path.nodes[0];
1850         slot = path.slots[0];
1851         if (ret > 0) {
1852                 if (slot >= btrfs_header_nritems(leaf)) {
1853                         ret = btrfs_next_leaf(csum_root, &path);
1854                         if (ret < 0) {
1855                                 fprintf(stderr,
1856                                         "Walk tree failed(%d)\n", ret);
1857                                 goto fail_out;
1858                         } else if (ret > 0) {
1859                                 slot = btrfs_header_nritems(leaf) - 1;
1860                                 btrfs_item_key_to_cpu(leaf, &key, slot);
1861                                 if (item_end_offset(root, &key, leaf, slot)
1862                                                                 > start) {
1863                                         csum_offset = start - key.offset;
1864                                         csum_offset /= blocksize;
1865                                         goto next_csum;
1866                                 }
1867                                 goto next_stripe;
1868                         }
1869                         leaf = path.nodes[0];
1870                         slot = path.slots[0];
1871                 }
1872                 btrfs_item_key_to_cpu(leaf, &key, slot);
1873                 ret = btrfs_previous_item(csum_root, &path, 0,
1874                                           BTRFS_EXTENT_CSUM_KEY);
1875                 if (ret < 0)
1876                         goto fail_out;
1877                 else if (ret > 0) {
1878                         if (key.offset >= end)
1879                                 goto next_stripe;
1880                         else
1881                                 goto next_csum;
1882                 }
1883                 leaf = path.nodes[0];
1884                 slot = path.slots[0];
1885
1886                 btrfs_item_key_to_cpu(leaf, &prev_key, slot);
1887                 if (item_end_offset(root, &prev_key, leaf, slot) > start) {
1888                         csum_offset = start - prev_key.offset;
1889                         csum_offset /= blocksize;
1890                         btrfs_item_key_to_cpu(leaf, &key, slot);
1891                 } else {
1892                         if (key.offset >= end)
1893                                 goto next_stripe;
1894                 }
1895
1896                 if (key.offset + csum_offset * blocksize > chunk_end)
1897                         goto out;
1898         }
1899 next_csum:
1900         ret = next_csum(root, &leaf, &path, &slot, &csum_offset, &tree_csum,
1901                         end, &key);
1902         if (ret < 0) {
1903                 fprintf(stderr, "Fetch csum failed\n");
1904                 goto fail_out;
1905         } else if (ret == 1) {
1906                 if (!(*flags & EQUAL_STRIPE))
1907                         *flags |= EQUAL_STRIPE;
1908                 goto out;
1909         } else if (ret == 2)
1910                 goto next_stripe;
1911
1912         list_for_each_entry_safe(devext, next, &candidates, chunk_list) {
1913                 data_offset = calc_data_offset(&key, chunk, devext->offset,
1914                                                csum_offset, blocksize);
1915                 dev = btrfs_find_device_by_devid(rc->fs_devices,
1916                                                  devext->objectid, 0);
1917                 if (!dev) {
1918                         ret = 1;
1919                         goto fail_out;
1920                 }
1921                 BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
1922                                                   devext->objectid, 1));
1923
1924                 ret = check_one_csum(dev->fd, data_offset, blocksize,
1925                                      tree_csum);
1926                 if (ret < 0)
1927                         goto fail_out;
1928                 else if (ret > 0)
1929                         list_move(&devext->chunk_list, &unordered);
1930         }
1931
1932         if (list_empty(&candidates)) {
1933                 num_unordered = count_devext_records(&unordered);
1934                 if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6
1935                                         && num_unordered == 2) {
1936                         btrfs_release_path(&path);
1937                         ret = fill_chunk_up(chunk, &unordered, rc);
1938                         return ret;
1939                 }
1940
1941                 goto next_stripe;
1942         }
1943
1944         if (list_is_last(candidates.next, &candidates)) {
1945                 index = btrfs_calc_stripe_index(chunk,
1946                         key.offset + csum_offset * blocksize);
1947                 if (chunk->stripes[index].devid)
1948                         goto next_stripe;
1949                 ret = insert_stripe(&candidates, rc, chunk, index);
1950                 if (ret)
1951                         goto fail_out;
1952         } else {
1953                 csum_offset++;
1954                 goto next_csum;
1955         }
1956 next_stripe:
1957         start = btrfs_next_stripe_logical_offset(chunk, start);
1958         end = min(start + chunk->stripe_len, chunk_end);
1959         list_splice_init(&unordered, &candidates);
1960         btrfs_release_path(&path);
1961         csum_offset = 0;
1962         if (end < chunk_end)
1963                 goto again;
1964 out:
1965         ret = 0;
1966         list_splice_init(&candidates, &unordered);
1967         num_unordered = count_devext_records(&unordered);
1968         if (num_unordered == 1) {
1969                 for (i = 0; i < chunk->num_stripes; i++) {
1970                         if (!chunk->stripes[i].devid) {
1971                                 index = i;
1972                                 break;
1973                         }
1974                 }
1975                 ret = insert_stripe(&unordered, rc, chunk, index);
1976                 if (ret)
1977                         goto fail_out;
1978         } else {
1979                 if ((num_unordered == 2 && chunk->type_flags
1980                         & BTRFS_BLOCK_GROUP_RAID5)
1981                  || (num_unordered == 3 && chunk->type_flags
1982                         & BTRFS_BLOCK_GROUP_RAID6)) {
1983                         ret = fill_chunk_up(chunk, &unordered, rc);
1984                 }
1985         }
1986 fail_out:
1987         ret = !!ret || (list_empty(&unordered) ? 0 : 1);
1988         list_splice_init(&candidates, &chunk->dextents);
1989         list_splice_init(&unordered, &chunk->dextents);
1990         btrfs_release_path(&path);
1991
1992         return ret;
1993 }
1994
1995 static int btrfs_rebuild_ordered_data_chunk_stripes(struct recover_control *rc,
1996                                            struct btrfs_root *root)
1997 {
1998         struct chunk_record *chunk;
1999         struct chunk_record *next;
2000         int ret = 0;
2001         int err;
2002         u8 flags;
2003
2004         list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
2005                 if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA)
2006                  && (chunk->type_flags & BTRFS_ORDERED_RAID)) {
2007                         flags = 0;
2008                         err = rebuild_raid_data_chunk_stripes(rc, root, chunk,
2009                                                               &flags);
2010                         if (err) {
2011                                 list_move(&chunk->list, &rc->bad_chunks);
2012                                 if (flags & EQUAL_STRIPE)
2013                                         fprintf(stderr,
2014                         "Failure: too many equal stripes in chunk[%llu %llu]\n",
2015                                                 chunk->offset, chunk->length);
2016                                 if (!ret)
2017                                         ret = err;
2018                         } else
2019                                 list_move(&chunk->list, &rc->good_chunks);
2020                 }
2021         }
2022         return ret;
2023 }
2024
2025 static int btrfs_recover_chunks(struct recover_control *rc)
2026 {
2027         struct chunk_record *chunk;
2028         struct block_group_record *bg;
2029         struct block_group_record *next;
2030         LIST_HEAD(new_chunks);
2031         LIST_HEAD(devexts);
2032         int nstripes;
2033         int ret;
2034
2035         /* create the chunk by block group */
2036         list_for_each_entry_safe(bg, next, &rc->bg.block_groups, list) {
2037                 nstripes = btrfs_get_device_extents(bg->objectid,
2038                                                     &rc->devext.no_chunk_orphans,
2039                                                     &devexts);
2040                 chunk = malloc(btrfs_chunk_record_size(nstripes));
2041                 if (!chunk)
2042                         return -ENOMEM;
2043                 memset(chunk, 0, btrfs_chunk_record_size(nstripes));
2044                 INIT_LIST_HEAD(&chunk->dextents);
2045                 chunk->bg_rec = bg;
2046                 chunk->cache.start = bg->objectid;
2047                 chunk->cache.size = bg->offset;
2048                 chunk->objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2049                 chunk->type = BTRFS_CHUNK_ITEM_KEY;
2050                 chunk->offset = bg->objectid;
2051                 chunk->generation = bg->generation;
2052                 chunk->length = bg->offset;
2053                 chunk->owner = BTRFS_CHUNK_TREE_OBJECTID;
2054                 chunk->stripe_len = BTRFS_STRIPE_LEN;
2055                 chunk->type_flags = bg->flags;
2056                 chunk->io_width = BTRFS_STRIPE_LEN;
2057                 chunk->io_align = BTRFS_STRIPE_LEN;
2058                 chunk->sector_size = rc->sectorsize;
2059                 chunk->sub_stripes = calc_sub_nstripes(bg->flags);
2060
2061                 ret = insert_cache_extent(&rc->chunk, &chunk->cache);
2062                 BUG_ON(ret);
2063
2064                 if (!nstripes) {
2065                         list_add_tail(&chunk->list, &rc->bad_chunks);
2066                         continue;
2067                 }
2068
2069                 list_splice_init(&devexts, &chunk->dextents);
2070
2071                 ret = btrfs_verify_device_extents(bg, &devexts, nstripes);
2072                 if (ret) {
2073                         list_add_tail(&chunk->list, &rc->bad_chunks);
2074                         continue;
2075                 }
2076
2077                 chunk->num_stripes = nstripes;
2078                 ret = btrfs_rebuild_chunk_stripes(rc, chunk);
2079                 if (ret > 0)
2080                         list_add_tail(&chunk->list, &rc->unrepaired_chunks);
2081                 else if (ret < 0)
2082                         list_add_tail(&chunk->list, &rc->bad_chunks);
2083                 else
2084                         list_add_tail(&chunk->list, &rc->good_chunks);
2085         }
2086         /*
2087          * Don't worry about the lost orphan device extents, they don't
2088          * have its chunk and block group, they must be the old ones that
2089          * we have dropped.
2090          */
2091         return 0;
2092 }
2093
2094 /*
2095  * Return 0 when succesful, < 0 on error and > 0 if aborted by user
2096  */
2097 int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
2098 {
2099         int ret = 0;
2100         struct btrfs_root *root = NULL;
2101         struct btrfs_trans_handle *trans;
2102         struct recover_control rc;
2103
2104         init_recover_control(&rc, verbose, yes);
2105
2106         ret = recover_prepare(&rc, path);
2107         if (ret) {
2108                 fprintf(stderr, "recover prepare error\n");
2109                 return ret;
2110         }
2111
2112         ret = scan_devices(&rc);
2113         if (ret) {
2114                 fprintf(stderr, "scan chunk headers error\n");
2115                 goto fail_rc;
2116         }
2117
2118         if (cache_tree_empty(&rc.chunk) &&
2119             cache_tree_empty(&rc.bg.tree) &&
2120             cache_tree_empty(&rc.devext.tree)) {
2121                 fprintf(stderr, "no recoverable chunk\n");
2122                 goto fail_rc;
2123         }
2124
2125         print_scan_result(&rc);
2126
2127         ret = check_chunks(&rc.chunk, &rc.bg, &rc.devext, &rc.good_chunks,
2128                            &rc.bad_chunks, 1);
2129         print_check_result(&rc);
2130         if (ret) {
2131                 if (!list_empty(&rc.bg.block_groups) ||
2132                     !list_empty(&rc.devext.no_chunk_orphans)) {
2133                         ret = btrfs_recover_chunks(&rc);
2134                         if (ret)
2135                                 goto fail_rc;
2136                 }
2137                 /*
2138                  * If the chunk is healthy, its block group item and device
2139                  * extent item should be written on the disks. So, it is very
2140                  * likely that the bad chunk is a old one that has been
2141                  * droppped from the fs. Don't deal with them now, we will
2142                  * check it after the fs is opened.
2143                  */
2144         } else {
2145                 fprintf(stderr, "Check chunks successfully with no orphans\n");
2146                 goto fail_rc;
2147         }
2148
2149         root = open_ctree_with_broken_chunk(&rc);
2150         if (IS_ERR(root)) {
2151                 fprintf(stderr, "open with broken chunk error\n");
2152                 ret = PTR_ERR(root);
2153                 goto fail_rc;
2154         }
2155
2156         ret = check_all_chunks_by_metadata(&rc, root);
2157         if (ret) {
2158                 fprintf(stderr, "The chunks in memory can not match the metadata of the fs. Repair failed.\n");
2159                 goto fail_close_ctree;
2160         }
2161
2162         ret = btrfs_rebuild_ordered_data_chunk_stripes(&rc, root);
2163         if (ret) {
2164                 fprintf(stderr, "Failed to rebuild ordered chunk stripes.\n");
2165                 goto fail_close_ctree;
2166         }
2167
2168         if (!rc.yes) {
2169                 ret = ask_user("We are going to rebuild the chunk tree on disk, it might destroy the old metadata on the disk, Are you sure?");
2170                 if (!ret) {
2171                         ret = 1;
2172                         goto fail_close_ctree;
2173                 }
2174         }
2175
2176         trans = btrfs_start_transaction(root, 1);
2177         ret = remove_chunk_extent_item(trans, &rc, root);
2178         BUG_ON(ret);
2179
2180         ret = rebuild_chunk_tree(trans, &rc, root);
2181         BUG_ON(ret);
2182
2183         ret = rebuild_sys_array(&rc, root);
2184         BUG_ON(ret);
2185
2186         btrfs_commit_transaction(trans, root);
2187 fail_close_ctree:
2188         close_ctree(root);
2189 fail_rc:
2190         free_recover_control(&rc);
2191         return ret;
2192 }