btrfs-progs: introduce a proper structure on which cli will call register-device...
[platform/upstream/btrfs-progs.git] / chunk-recover.c
1 /*
2  * Copyright (C) 2013 FUJITSU LIMITED.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #define _XOPEN_SOURCE 500
19 #define _GNU_SOURCE
20
21 #include <stdio.h>
22 #include <stdio_ext.h>
23 #include <stdlib.h>
24 #include <sys/types.h>
25 #include <sys/stat.h>
26 #include <fcntl.h>
27 #include <unistd.h>
28 #include <uuid/uuid.h>
29 #include <pthread.h>
30
31 #include "kerncompat.h"
32 #include "list.h"
33 #include "radix-tree.h"
34 #include "ctree.h"
35 #include "extent-cache.h"
36 #include "disk-io.h"
37 #include "volumes.h"
38 #include "transaction.h"
39 #include "crc32c.h"
40 #include "utils.h"
41 #include "version.h"
42 #include "btrfsck.h"
43 #include "commands.h"
44
/*
 * Global state for the chunk-recovery pass: filesystem geometry taken from
 * the superblock, caches of metadata records rebuilt by scanning the raw
 * devices, and result lists of chunks classified by repair status.
 */
struct recover_control {
	int verbose;
	int yes;			/* assume "yes" to interactive prompts */

	u16 csum_size;			/* checksum size in bytes */
	u32 sectorsize;
	u32 leafsize;
	u64 generation;			/* tree root generation */
	u64 chunk_root_generation;	/* chunk tree root generation */

	struct btrfs_fs_devices *fs_devices;

	struct cache_tree chunk;	/* chunk_record cache, keyed by logical offset */
	struct block_group_tree bg;	/* block group records found on disk */
	struct device_extent_tree devext;	/* device extent records found on disk */
	struct cache_tree eb_cache;	/* extent_record cache of scanned tree blocks */

	struct list_head good_chunks;
	struct list_head bad_chunks;
	struct list_head unrepaired_chunks;
	/* serializes cache updates between the per-device scan threads */
	pthread_mutex_t rc_lock;
};
67
/*
 * One tree block discovered by the raw device scan.  When an identical
 * copy (same bytenr, size, generation and checksum) is found on another
 * device, that copy is recorded as an extra mirror of this record.
 */
struct extent_record {
	struct cache_extent cache;	/* keyed by header bytenr; size = eb->len */
	u64 generation;
	u8 csum[BTRFS_CSUM_SIZE];
	struct btrfs_device *devices[BTRFS_MAX_MIRRORS];	/* device holding each mirror */
	u64 offsets[BTRFS_MAX_MIRRORS];	/* physical offset of each mirror */
	int nmirrors;			/* number of valid entries above */
};
76
/* Per-thread argument passed to scan_one_device(). */
struct device_scan {
	struct recover_control *rc;
	struct btrfs_device *dev;
	int fd;		/* read-only fd for dev; the scan thread closes it */
};
82
83 static struct extent_record *btrfs_new_extent_record(struct extent_buffer *eb)
84 {
85         struct extent_record *rec;
86
87         rec = malloc(sizeof(*rec));
88         if (!rec) {
89                 fprintf(stderr, "Fail to allocate memory for extent record.\n");
90                 exit(1);
91         }
92
93         memset(rec, 0, sizeof(*rec));
94         rec->cache.start = btrfs_header_bytenr(eb);
95         rec->cache.size = eb->len;
96         rec->generation = btrfs_header_generation(eb);
97         read_extent_buffer(eb, rec->csum, (unsigned long)btrfs_header_csum(eb),
98                            BTRFS_CSUM_SIZE);
99         return rec;
100 }
101
/*
 * Record the scanned tree block @eb, found on @device at physical @offset,
 * in @eb_cache.  If a copy covering the same range is already cached:
 *   - cached copy has a newer generation: drop the new record;
 *   - same generation: the blocks must be identical, in which case
 *     @device/@offset are added as another mirror; a mismatch at the same
 *     generation returns -EEXIST;
 *   - cached copy is older: remove it and retry, because the new block
 *     may overlap more than one stale cached block.
 * Records with a zero size are discarded silently.
 */
static int process_extent_buffer(struct cache_tree *eb_cache,
				 struct extent_buffer *eb,
				 struct btrfs_device *device, u64 offset)
{
	struct extent_record *rec;
	struct extent_record *exist;
	struct cache_extent *cache;
	int ret = 0;

	rec = btrfs_new_extent_record(eb);
	if (!rec->cache.size)
		goto free_out;
again:
	cache = lookup_cache_extent(eb_cache,
				    rec->cache.start,
				    rec->cache.size);
	if (cache) {
		exist = container_of(cache, struct extent_record, cache);

		if (exist->generation > rec->generation)
			goto free_out;
		if (exist->generation == rec->generation) {
			/* equal generation must mean an identical block */
			if (exist->cache.start != rec->cache.start ||
			    exist->cache.size != rec->cache.size ||
			    memcmp(exist->csum, rec->csum, BTRFS_CSUM_SIZE)) {
				ret = -EEXIST;
			} else {
				BUG_ON(exist->nmirrors >= BTRFS_MAX_MIRRORS);
				exist->devices[exist->nmirrors] = device;
				exist->offsets[exist->nmirrors] = offset;
				exist->nmirrors++;
			}
			goto free_out;
		}
		/* cached copy is older: drop it and search again */
		remove_cache_extent(eb_cache, cache);
		free(exist);
		goto again;
	}

	rec->devices[0] = device;
	rec->offsets[0] = offset;
	rec->nmirrors++;
	ret = insert_cache_extent(eb_cache, &rec->cache);
	BUG_ON(ret);
out:
	return ret;
free_out:
	free(rec);
	goto out;
}
152
153 static void free_extent_record(struct cache_extent *cache)
154 {
155         struct extent_record *er;
156
157         er = container_of(cache, struct extent_record, cache);
158         free(er);
159 }
160
161 FREE_EXTENT_CACHE_BASED_TREE(extent_record, free_extent_record);
162
/*
 * Build an on-disk btrfs_chunk item (including its trailing stripe array)
 * from the in-memory @record.
 *
 * Returns a malloc'ed chunk item the caller must free, or NULL when
 * @record is NULL/empty or the allocation fails.
 */
static struct btrfs_chunk *create_chunk_item(struct chunk_record *record)
{
	struct btrfs_chunk *ret;
	struct btrfs_stripe *chunk_stripe;
	int i;

	if (!record || record->num_stripes == 0)
		return NULL;
	/* item size includes room for num_stripes stripe entries */
	ret = malloc(btrfs_chunk_item_size(record->num_stripes));
	if (!ret)
		return NULL;
	btrfs_set_stack_chunk_length(ret, record->length);
	btrfs_set_stack_chunk_owner(ret, record->owner);
	btrfs_set_stack_chunk_stripe_len(ret, record->stripe_len);
	btrfs_set_stack_chunk_type(ret, record->type_flags);
	btrfs_set_stack_chunk_io_align(ret, record->io_align);
	btrfs_set_stack_chunk_io_width(ret, record->io_width);
	btrfs_set_stack_chunk_sector_size(ret, record->sector_size);
	btrfs_set_stack_chunk_num_stripes(ret, record->num_stripes);
	btrfs_set_stack_chunk_sub_stripes(ret, record->sub_stripes);
	/* copy each stripe's devid, offset and device uuid in order */
	for (i = 0, chunk_stripe = &ret->stripe; i < record->num_stripes;
	     i++, chunk_stripe++) {
		btrfs_set_stack_stripe_devid(chunk_stripe,
				record->stripes[i].devid);
		btrfs_set_stack_stripe_offset(chunk_stripe,
				record->stripes[i].offset);
		memcpy(chunk_stripe->dev_uuid, record->stripes[i].dev_uuid,
		       BTRFS_UUID_SIZE);
	}
	return ret;
}
194
195 static void init_recover_control(struct recover_control *rc, int verbose,
196                 int yes)
197 {
198         memset(rc, 0, sizeof(struct recover_control));
199         cache_tree_init(&rc->chunk);
200         cache_tree_init(&rc->eb_cache);
201         block_group_tree_init(&rc->bg);
202         device_extent_tree_init(&rc->devext);
203
204         INIT_LIST_HEAD(&rc->good_chunks);
205         INIT_LIST_HEAD(&rc->bad_chunks);
206         INIT_LIST_HEAD(&rc->unrepaired_chunks);
207
208         rc->verbose = verbose;
209         rc->yes = yes;
210         pthread_mutex_init(&rc->rc_lock, NULL);
211 }
212
213 static void free_recover_control(struct recover_control *rc)
214 {
215         free_block_group_tree(&rc->bg);
216         free_chunk_cache_tree(&rc->chunk);
217         free_device_extent_tree(&rc->devext);
218         free_extent_record_tree(&rc->eb_cache);
219         pthread_mutex_destroy(&rc->rc_lock);
220 }
221
/*
 * Insert the block group item at @slot of @leaf into @bg_cache,
 * resolving collisions by generation: older cached records are replaced,
 * newer ones win, and equal generations must match byte-for-byte (from
 * the generation field onward) or -EEXIST is returned.
 */
static int process_block_group_item(struct block_group_tree *bg_cache,
                                    struct extent_buffer *leaf,
                                    struct btrfs_key *key, int slot)
{
	struct block_group_record *rec;
	struct block_group_record *exist;
	struct cache_extent *cache;
	int ret = 0;

	rec = btrfs_new_block_group_record(leaf, key, slot);
	if (!rec->cache.size)
		goto free_out;
again:
	cache = lookup_cache_extent(&bg_cache->tree,
				    rec->cache.start,
				    rec->cache.size);
	if (cache) {
		exist = container_of(cache, struct block_group_record, cache);

		/* check the generation and replace if needed */
		if (exist->generation > rec->generation)
			goto free_out;
		if (exist->generation == rec->generation) {
			/* compare everything from the generation field on */
			int offset = offsetof(struct block_group_record,
					      generation);
			/*
			 * According to the current kernel code, the following
			 * case is impossible, or there is something wrong in
			 * the kernel code.
			 */
			if (memcmp(((void *)exist) + offset,
				   ((void *)rec) + offset,
				   sizeof(*rec) - offset))
				ret = -EEXIST;
			goto free_out;
		}
		remove_cache_extent(&bg_cache->tree, cache);
		list_del_init(&exist->list);
		free(exist);
		/*
		 * We must do search again to avoid the following cache.
		 * /--old bg 1--//--old bg 2--/
		 *        /--new bg--/
		 */
		goto again;
	}

	ret = insert_block_group_record(bg_cache, rec);
	BUG_ON(ret);
out:
	return ret;
free_out:
	free(rec);
	goto out;
}
277
/*
 * Insert the chunk item at @slot of @leaf into @chunk_cache.  Collision
 * handling mirrors process_block_group_item(): newer generation replaces
 * older, equal generations must be identical (stripe count plus all
 * fields from the generation member through the stripe array) or
 * -EEXIST is returned.
 */
static int process_chunk_item(struct cache_tree *chunk_cache,
                              struct extent_buffer *leaf, struct btrfs_key *key,
                              int slot)
{
	struct chunk_record *rec;
	struct chunk_record *exist;
	struct cache_extent *cache;
	int ret = 0;

	/* NOTE(review): rec is assumed non-NULL here -- presumably the
	 * helper aborts on allocation failure; confirm in its definition. */
	rec = btrfs_new_chunk_record(leaf, key, slot);
	if (!rec->cache.size)
		goto free_out;
again:
	cache = lookup_cache_extent(chunk_cache, rec->offset, rec->length);
	if (cache) {
		exist = container_of(cache, struct chunk_record, cache);

		if (exist->generation > rec->generation)
			goto free_out;
		if (exist->generation == rec->generation) {
			int num_stripes = rec->num_stripes;
			/* record size depends on the stripe count */
			int rec_size = btrfs_chunk_record_size(num_stripes);
			int offset = offsetof(struct chunk_record, generation);

			if (exist->num_stripes != rec->num_stripes ||
			    memcmp(((void *)exist) + offset,
				   ((void *)rec) + offset,
				   rec_size - offset))
				ret = -EEXIST;
			goto free_out;
		}
		/* cached record is older: replace it and search again */
		remove_cache_extent(chunk_cache, cache);
		free(exist);
		goto again;
	}
	ret = insert_cache_extent(chunk_cache, &rec->cache);
	BUG_ON(ret);
out:
	return ret;
free_out:
	free(rec);
	goto out;
}
321
/*
 * Insert the device extent item at @slot of @leaf into @devext_cache.
 * Lookup is keyed by (devid, physical offset, length) via the two-key
 * cache-extent search.  Generation collisions are resolved the same way
 * as in process_block_group_item(): equal generations must match from
 * the generation field onward or -EEXIST is returned; older cached
 * records are unlinked from both list heads, freed and the lookup retried.
 */
static int process_device_extent_item(struct device_extent_tree *devext_cache,
                                      struct extent_buffer *leaf,
                                      struct btrfs_key *key, int slot)
{
	struct device_extent_record *rec;
	struct device_extent_record *exist;
	struct cache_extent *cache;
	int ret = 0;

	rec = btrfs_new_device_extent_record(leaf, key, slot);
	if (!rec->cache.size)
		goto free_out;
again:
	cache = lookup_cache_extent2(&devext_cache->tree,
				     rec->cache.objectid,
				     rec->cache.start,
				     rec->cache.size);
	if (cache) {
		exist = container_of(cache, struct device_extent_record, cache);
		if (exist->generation > rec->generation)
			goto free_out;
		if (exist->generation == rec->generation) {
			int offset = offsetof(struct device_extent_record,
					      generation);
			if (memcmp(((void *)exist) + offset,
				   ((void *)rec) + offset,
				   sizeof(*rec) - offset))
				ret = -EEXIST;
			goto free_out;
		}
		/* stale record: drop it from both lists and retry */
		remove_cache_extent(&devext_cache->tree, cache);
		list_del_init(&exist->chunk_list);
		list_del_init(&exist->device_list);
		free(exist);
		goto again;
	}

	ret = insert_device_extent_record(devext_cache, rec);
	BUG_ON(ret);
out:
	return ret;
free_out:
	free(rec);
	goto out;
}
367
368 static void print_block_group_info(struct block_group_record *rec, char *prefix)
369 {
370         if (prefix)
371                 printf("%s", prefix);
372         printf("Block Group: start = %llu, len = %llu, flag = %llx\n",
373                rec->objectid, rec->offset, rec->flags);
374 }
375
376 static void print_block_group_tree(struct block_group_tree *tree)
377 {
378         struct cache_extent *cache;
379         struct block_group_record *rec;
380
381         printf("All Block Groups:\n");
382         for (cache = first_cache_extent(&tree->tree); cache;
383              cache = next_cache_extent(cache)) {
384                 rec = container_of(cache, struct block_group_record, cache);
385                 print_block_group_info(rec, "\t");
386         }
387         printf("\n");
388 }
389
390 static void print_stripe_info(struct stripe *data, char *prefix1, char *prefix2,
391                               int index)
392 {
393         if (prefix1)
394                 printf("%s", prefix1);
395         if (prefix2)
396                 printf("%s", prefix2);
397         printf("[%2d] Stripe: devid = %llu, offset = %llu\n",
398                index, data->devid, data->offset);
399 }
400
401 static void print_chunk_self_info(struct chunk_record *rec, char *prefix)
402 {
403         int i;
404
405         if (prefix)
406                 printf("%s", prefix);
407         printf("Chunk: start = %llu, len = %llu, type = %llx, num_stripes = %u\n",
408                rec->offset, rec->length, rec->type_flags, rec->num_stripes);
409         if (prefix)
410                 printf("%s", prefix);
411         printf("    Stripes list:\n");
412         for (i = 0; i < rec->num_stripes; i++)
413                 print_stripe_info(&rec->stripes[i], prefix, "    ", i);
414 }
415
416 static void print_chunk_tree(struct cache_tree *tree)
417 {
418         struct cache_extent *n;
419         struct chunk_record *entry;
420
421         printf("All Chunks:\n");
422         for (n = first_cache_extent(tree); n;
423              n = next_cache_extent(n)) {
424                 entry = container_of(n, struct chunk_record, cache);
425                 print_chunk_self_info(entry, "\t");
426         }
427         printf("\n");
428 }
429
430 static void print_device_extent_info(struct device_extent_record *rec,
431                                      char *prefix)
432 {
433         if (prefix)
434                 printf("%s", prefix);
435         printf("Device extent: devid = %llu, start = %llu, len = %llu, chunk offset = %llu\n",
436                rec->objectid, rec->offset, rec->length, rec->chunk_offset);
437 }
438
439 static void print_device_extent_tree(struct device_extent_tree *tree)
440 {
441         struct cache_extent *n;
442         struct device_extent_record *entry;
443
444         printf("All Device Extents:\n");
445         for (n = first_cache_extent(&tree->tree); n;
446              n = next_cache_extent(n)) {
447                 entry = container_of(n, struct device_extent_record, cache);
448                 print_device_extent_info(entry, "\t");
449         }
450         printf("\n");
451 }
452
453 static void print_device_info(struct btrfs_device *device, char *prefix)
454 {
455         if (prefix)
456                 printf("%s", prefix);
457         printf("Device: id = %llu, name = %s\n",
458                device->devid, device->name);
459 }
460
461 static void print_all_devices(struct list_head *devices)
462 {
463         struct btrfs_device *dev;
464
465         printf("All Devices:\n");
466         list_for_each_entry(dev, devices, dev_list)
467                 print_device_info(dev, "\t");
468         printf("\n");
469 }
470
471 static void print_scan_result(struct recover_control *rc)
472 {
473         if (!rc->verbose)
474                 return;
475
476         printf("DEVICE SCAN RESULT:\n");
477         printf("Filesystem Information:\n");
478         printf("\tsectorsize: %d\n", rc->sectorsize);
479         printf("\tleafsize: %d\n", rc->leafsize);
480         printf("\ttree root generation: %llu\n", rc->generation);
481         printf("\tchunk root generation: %llu\n", rc->chunk_root_generation);
482         printf("\n");
483
484         print_all_devices(&rc->fs_devices->devices);
485         print_block_group_tree(&rc->bg);
486         print_chunk_tree(&rc->chunk);
487         print_device_extent_tree(&rc->devext);
488 }
489
490 static void print_chunk_info(struct chunk_record *chunk, char *prefix)
491 {
492         struct device_extent_record *devext;
493         int i;
494
495         print_chunk_self_info(chunk, prefix);
496         if (prefix)
497                 printf("%s", prefix);
498         if (chunk->bg_rec)
499                 print_block_group_info(chunk->bg_rec, "    ");
500         else
501                 printf("    No block group.\n");
502         if (prefix)
503                 printf("%s", prefix);
504         if (list_empty(&chunk->dextents)) {
505                 printf("    No device extent.\n");
506         } else {
507                 printf("    Device extent list:\n");
508                 i = 0;
509                 list_for_each_entry(devext, &chunk->dextents, chunk_list) {
510                         if (prefix)
511                                 printf("%s", prefix);
512                         printf("%s[%2d]", "        ", i);
513                         print_device_extent_info(devext, NULL);
514                         i++;
515                 }
516         }
517 }
518
519 static void print_check_result(struct recover_control *rc)
520 {
521         struct chunk_record *chunk;
522         struct block_group_record *bg;
523         struct device_extent_record *devext;
524         int total = 0;
525         int good = 0;
526         int bad = 0;
527
528         if (!rc->verbose)
529                 return;
530
531         printf("CHECK RESULT:\n");
532         printf("Healthy Chunks:\n");
533         list_for_each_entry(chunk, &rc->good_chunks, list) {
534                 print_chunk_info(chunk, "  ");
535                 good++;
536                 total++;
537         }
538         printf("Bad Chunks:\n");
539         list_for_each_entry(chunk, &rc->bad_chunks, list) {
540                 print_chunk_info(chunk, "  ");
541                 bad++;
542                 total++;
543         }
544         printf("\n");
545         printf("Total Chunks:\t%d\n", total);
546         printf("  Heathy:\t%d\n", good);
547         printf("  Bad:\t%d\n", bad);
548
549         printf("\n");
550         printf("Orphan Block Groups:\n");
551         list_for_each_entry(bg, &rc->bg.block_groups, list)
552                 print_block_group_info(bg, "  ");
553
554         printf("\n");
555         printf("Orphan Device Extents:\n");
556         list_for_each_entry(devext, &rc->devext.no_chunk_orphans, chunk_list)
557                 print_device_extent_info(devext, "  ");
558 }
559
560 static int check_chunk_by_metadata(struct recover_control *rc,
561                                    struct btrfs_root *root,
562                                    struct chunk_record *chunk, int bg_only)
563 {
564         int ret;
565         int i;
566         int slot;
567         struct btrfs_path path;
568         struct btrfs_key key;
569         struct btrfs_root *dev_root;
570         struct stripe *stripe;
571         struct btrfs_dev_extent *dev_extent;
572         struct btrfs_block_group_item *bg_ptr;
573         struct extent_buffer *l;
574
575         btrfs_init_path(&path);
576
577         if (bg_only)
578                 goto bg_check;
579
580         dev_root = root->fs_info->dev_root;
581         for (i = 0; i < chunk->num_stripes; i++) {
582                 stripe = &chunk->stripes[i];
583
584                 key.objectid = stripe->devid;
585                 key.offset = stripe->offset;
586                 key.type = BTRFS_DEV_EXTENT_KEY;
587
588                 ret = btrfs_search_slot(NULL, dev_root, &key, &path, 0, 0);
589                 if (ret < 0) {
590                         fprintf(stderr, "Search device extent failed(%d)\n",
591                                 ret);
592                         btrfs_release_path(&path);
593                         return ret;
594                 } else if (ret > 0) {
595                         if (rc->verbose)
596                                 fprintf(stderr,
597                                         "No device extent[%llu, %llu]\n",
598                                         stripe->devid, stripe->offset);
599                         btrfs_release_path(&path);
600                         return -ENOENT;
601                 }
602                 l = path.nodes[0];
603                 slot = path.slots[0];
604                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
605                 if (chunk->offset !=
606                     btrfs_dev_extent_chunk_offset(l, dev_extent)) {
607                         if (rc->verbose)
608                                 fprintf(stderr,
609                                         "Device tree unmatch with chunks dev_extent[%llu, %llu], chunk[%llu, %llu]\n",
610                                         btrfs_dev_extent_chunk_offset(l,
611                                                                 dev_extent),
612                                         btrfs_dev_extent_length(l, dev_extent),
613                                         chunk->offset, chunk->length);
614                         btrfs_release_path(&path);
615                         return -ENOENT;
616                 }
617                 btrfs_release_path(&path);
618         }
619
620 bg_check:
621         key.objectid = chunk->offset;
622         key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
623         key.offset = chunk->length;
624
625         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, &path,
626                                 0, 0);
627         if (ret < 0) {
628                 fprintf(stderr, "Search block group failed(%d)\n", ret);
629                 btrfs_release_path(&path);
630                 return ret;
631         } else if (ret > 0) {
632                 if (rc->verbose)
633                         fprintf(stderr, "No block group[%llu, %llu]\n",
634                                 key.objectid, key.offset);
635                 btrfs_release_path(&path);
636                 return -ENOENT;
637         }
638
639         l = path.nodes[0];
640         slot = path.slots[0];
641         bg_ptr = btrfs_item_ptr(l, slot, struct btrfs_block_group_item);
642         if (chunk->type_flags != btrfs_disk_block_group_flags(l, bg_ptr)) {
643                 if (rc->verbose)
644                         fprintf(stderr,
645                                 "Chunk[%llu, %llu]'s type(%llu) is differemt with Block Group's type(%llu)\n",
646                                 chunk->offset, chunk->length, chunk->type_flags,
647                                 btrfs_disk_block_group_flags(l, bg_ptr));
648                 btrfs_release_path(&path);
649                 return -ENOENT;
650         }
651         btrfs_release_path(&path);
652         return 0;
653 }
654
655 static int check_all_chunks_by_metadata(struct recover_control *rc,
656                                         struct btrfs_root *root)
657 {
658         struct chunk_record *chunk;
659         struct chunk_record *next;
660         LIST_HEAD(orphan_chunks);
661         int ret = 0;
662         int err;
663
664         list_for_each_entry_safe(chunk, next, &rc->good_chunks, list) {
665                 err = check_chunk_by_metadata(rc, root, chunk, 0);
666                 if (err) {
667                         if (err == -ENOENT)
668                                 list_move_tail(&chunk->list, &orphan_chunks);
669                         else if (err && !ret)
670                                 ret = err;
671                 }
672         }
673
674         list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
675                 err = check_chunk_by_metadata(rc, root, chunk, 1);
676                 if (err == -ENOENT)
677                         list_move_tail(&chunk->list, &orphan_chunks);
678                 else if (err && !ret)
679                         ret = err;
680         }
681
682         list_for_each_entry(chunk, &rc->bad_chunks, list) {
683                 err = check_chunk_by_metadata(rc, root, chunk, 1);
684                 if (err != -ENOENT && !ret)
685                         ret = err ? err : -EINVAL;
686         }
687         list_splice(&orphan_chunks, &rc->bad_chunks);
688         return ret;
689 }
690
/*
 * Walk every item of @leaf and feed block group, chunk and device extent
 * items into the corresponding caches in @rc.  Each cache update is done
 * under rc_lock because several device-scan threads call this
 * concurrently; the lock is scoped per item so unrelated key types pay
 * no locking cost.  Stops and returns the first non-zero status.
 */
static int extract_metadata_record(struct recover_control *rc,
                                   struct extent_buffer *leaf)
{
	struct btrfs_key key;
	int ret = 0;
	int i;
	u32 nritems;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		btrfs_item_key_to_cpu(leaf, &key, i);
		switch (key.type) {
		case BTRFS_BLOCK_GROUP_ITEM_KEY:
			pthread_mutex_lock(&rc->rc_lock);
			ret = process_block_group_item(&rc->bg, leaf, &key, i);
			pthread_mutex_unlock(&rc->rc_lock);
			break;
		case BTRFS_CHUNK_ITEM_KEY:
			pthread_mutex_lock(&rc->rc_lock);
			ret = process_chunk_item(&rc->chunk, leaf, &key, i);
			pthread_mutex_unlock(&rc->rc_lock);
			break;
		case BTRFS_DEV_EXTENT_KEY:
			pthread_mutex_lock(&rc->rc_lock);
			ret = process_device_extent_item(&rc->devext, leaf,
							 &key, i);
			pthread_mutex_unlock(&rc->rc_lock);
			break;
		}
		if (ret)
			break;
	}
	return ret;
}
725
726 static inline int is_super_block_address(u64 offset)
727 {
728         int i;
729
730         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
731                 if (offset == btrfs_sb_offset(i))
732                         return 1;
733         }
734         return 0;
735 }
736
737 static int scan_one_device(void *dev_scan_struct)
738 {
739         struct extent_buffer *buf;
740         u64 bytenr;
741         int ret = 0;
742         struct device_scan *dev_scan = (struct device_scan *)dev_scan_struct;
743         struct recover_control *rc = dev_scan->rc;
744         struct btrfs_device *device = dev_scan->dev;
745         int fd = dev_scan->fd;
746         int oldtype;
747
748         ret = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &oldtype);
749         if (ret)
750                 return 1;
751
752         buf = malloc(sizeof(*buf) + rc->leafsize);
753         if (!buf)
754                 return -ENOMEM;
755         buf->len = rc->leafsize;
756
757         bytenr = 0;
758         while (1) {
759                 if (is_super_block_address(bytenr))
760                         bytenr += rc->sectorsize;
761
762                 if (pread64(fd, buf->data, rc->leafsize, bytenr) <
763                     rc->leafsize)
764                         break;
765
766                 if (memcmp_extent_buffer(buf, rc->fs_devices->fsid,
767                                          btrfs_header_fsid(),
768                                          BTRFS_FSID_SIZE)) {
769                         bytenr += rc->sectorsize;
770                         continue;
771                 }
772
773                 if (verify_tree_block_csum_silent(buf, rc->csum_size)) {
774                         bytenr += rc->sectorsize;
775                         continue;
776                 }
777
778                 pthread_mutex_lock(&rc->rc_lock);
779                 ret = process_extent_buffer(&rc->eb_cache, buf, device, bytenr);
780                 pthread_mutex_unlock(&rc->rc_lock);
781                 if (ret)
782                         goto out;
783
784                 if (btrfs_header_level(buf) != 0)
785                         goto next_node;
786
787                 switch (btrfs_header_owner(buf)) {
788                 case BTRFS_EXTENT_TREE_OBJECTID:
789                 case BTRFS_DEV_TREE_OBJECTID:
790                         /* different tree use different generation */
791                         if (btrfs_header_generation(buf) > rc->generation)
792                                 break;
793                         ret = extract_metadata_record(rc, buf);
794                         if (ret)
795                                 goto out;
796                         break;
797                 case BTRFS_CHUNK_TREE_OBJECTID:
798                         if (btrfs_header_generation(buf) >
799                             rc->chunk_root_generation)
800                                 break;
801                         ret = extract_metadata_record(rc, buf);
802                         if (ret)
803                                 goto out;
804                         break;
805                 }
806 next_node:
807                 bytenr += rc->leafsize;
808         }
809 out:
810         close(fd);
811         free(buf);
812         return ret;
813 }
814
815 static int scan_devices(struct recover_control *rc)
816 {
817         int ret = 0;
818         int fd;
819         struct btrfs_device *dev;
820         struct device_scan *dev_scans;
821         pthread_t *t_scans;
822         int *t_rets;
823         int devnr = 0;
824         int devidx = 0;
825         int cancel_from = 0;
826         int cancel_to = 0;
827         int i;
828
829         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list)
830                 devnr++;
831         dev_scans = (struct device_scan *)malloc(sizeof(struct device_scan)
832                                                  * devnr);
833         if (!dev_scans)
834                 return -ENOMEM;
835         t_scans = (pthread_t *)malloc(sizeof(pthread_t) * devnr);
836         if (!t_scans)
837                 return -ENOMEM;
838         t_rets = (int *)malloc(sizeof(int) * devnr);
839         if (!t_rets)
840                 return -ENOMEM;
841
842         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
843                 fd = open(dev->name, O_RDONLY);
844                 if (fd < 0) {
845                         fprintf(stderr, "Failed to open device %s\n",
846                                 dev->name);
847                         ret = 1;
848                         goto out2;
849                 }
850                 dev_scans[devidx].rc = rc;
851                 dev_scans[devidx].dev = dev;
852                 dev_scans[devidx].fd = fd;
853                 ret = pthread_create(&t_scans[devidx], NULL,
854                                      (void *)scan_one_device,
855                                      (void *)&dev_scans[devidx]);
856                 if (ret) {
857                         cancel_from = 0;
858                         cancel_to = devidx - 1;
859                         goto out1;
860                 }
861                 devidx++;
862         }
863
864         i = 0;
865         while (i < devidx) {
866                 ret = pthread_join(t_scans[i], (void **)&t_rets[i]);
867                 if (ret || t_rets[i]) {
868                         ret = 1;
869                         cancel_from = i + 1;
870                         cancel_to = devnr - 1;
871                         goto out1;
872                 }
873                 i++;
874         }
875 out1:
876         while (ret && (cancel_from <= cancel_to)) {
877                 pthread_cancel(t_scans[cancel_from]);
878                 cancel_from++;
879         }
880 out2:
881         free(dev_scans);
882         free(t_scans);
883         free(t_rets);
884         return !!ret;
885 }
886
887 static int build_device_map_by_chunk_record(struct btrfs_root *root,
888                                             struct chunk_record *chunk)
889 {
890         int ret = 0;
891         int i;
892         u64 devid;
893         u8 uuid[BTRFS_UUID_SIZE];
894         u16 num_stripes;
895         struct btrfs_mapping_tree *map_tree;
896         struct map_lookup *map;
897         struct stripe *stripe;
898
899         map_tree = &root->fs_info->mapping_tree;
900         num_stripes = chunk->num_stripes;
901         map = malloc(btrfs_map_lookup_size(num_stripes));
902         if (!map)
903                 return -ENOMEM;
904         map->ce.start = chunk->offset;
905         map->ce.size = chunk->length;
906         map->num_stripes = num_stripes;
907         map->io_width = chunk->io_width;
908         map->io_align = chunk->io_align;
909         map->sector_size = chunk->sector_size;
910         map->stripe_len = chunk->stripe_len;
911         map->type = chunk->type_flags;
912         map->sub_stripes = chunk->sub_stripes;
913
914         for (i = 0, stripe = chunk->stripes; i < num_stripes; i++, stripe++) {
915                 devid = stripe->devid;
916                 memcpy(uuid, stripe->dev_uuid, BTRFS_UUID_SIZE);
917                 map->stripes[i].physical = stripe->offset;
918                 map->stripes[i].dev = btrfs_find_device(root, devid,
919                                                         uuid, NULL);
920                 if (!map->stripes[i].dev) {
921                         kfree(map);
922                         return -EIO;
923                 }
924         }
925
926         ret = insert_cache_extent(&map_tree->cache_tree, &map->ce);
927         return ret;
928 }
929
930 static int build_device_maps_by_chunk_records(struct recover_control *rc,
931                                               struct btrfs_root *root)
932 {
933         int ret = 0;
934         struct chunk_record *chunk;
935
936         list_for_each_entry(chunk, &rc->good_chunks, list) {
937                 ret = build_device_map_by_chunk_record(root, chunk);
938                 if (ret)
939                         return ret;
940         }
941         return ret;
942 }
943
/*
 * Delete every extent item of a block group from the extent tree and account
 * the freed space back into super->bytes_used.
 *
 * Walks the extent tree leaf by leaf over [bg->objectid, bg->objectid +
 * bg->offset), batching contiguous deletable items (del_s/del_nr) into one
 * btrfs_del_items() call per pass.  BLOCK_GROUP_ITEM entries are kept.
 *
 * Returns 0 on success, or a negative error from the search/delete calls.
 */
static int block_group_remove_all_extent_items(struct btrfs_trans_handle *trans,
                                               struct btrfs_root *root,
                                               struct block_group_record *bg)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        struct btrfs_path path;
        struct extent_buffer *leaf;
        u64 start = bg->objectid;
        u64 end = bg->objectid + bg->offset;   /* exclusive end of the bg */
        u64 old_val;
        int nitems;
        int ret;
        int i;
        int del_s, del_nr;   /* start slot and count of the delete batch */

        btrfs_init_path(&path);
        /* all work below happens in the extent tree */
        root = root->fs_info->extent_root;

        key.objectid = start;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
        if (ret < 0)
                goto err;
        else if (ret > 0)
                ret = 0;   /* no exact match is fine, slot points past it */

        leaf = path.nodes[0];
        nitems = btrfs_header_nritems(leaf);
        if (!nitems) {
                /* The tree is empty. */
                ret = 0;
                goto err;
        }

        if (path.slots[0] >= nitems) {
                /* landed past the last item, step to the next leaf */
                ret = btrfs_next_leaf(root, &path);
                if (ret < 0)
                        goto err;
                if (ret > 0) {
                        ret = 0;   /* no more leaves: nothing left to delete */
                        goto err;
                }
                leaf = path.nodes[0];
                btrfs_item_key_to_cpu(leaf, &key, 0);
                if (key.objectid >= end)
                        goto err;   /* next leaf starts beyond the bg */
                btrfs_release_path(&path);
                goto again;   /* re-search from the new key */
        }

        del_nr = 0;
        del_s = -1;
        for (i = path.slots[0]; i < nitems; i++) {
                btrfs_item_key_to_cpu(leaf, &key, i);
                if (key.objectid >= end)
                        break;

                if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        /*
                         * Keep the block group item itself: skip it if the
                         * batch hasn't started, otherwise flush the batch
                         * collected so far (batches must be contiguous).
                         */
                        if (del_nr == 0)
                                continue;
                        else
                                break;
                }

                if (del_s == -1)
                        del_s = i;
                del_nr++;
                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        /*
                         * Account the space being released back into the
                         * superblock: metadata items cover one leaf,
                         * extent items store their length in key.offset.
                         */
                        old_val = btrfs_super_bytes_used(fs_info->super_copy);
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                old_val += root->leafsize;
                        else
                                old_val += key.offset;
                        btrfs_set_super_bytes_used(fs_info->super_copy,
                                                   old_val);
                }
        }

        if (del_nr) {
                ret = btrfs_del_items(trans, root, &path, del_s, del_nr);
                if (ret)
                        goto err;
        }

        if (key.objectid < end) {
                if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        /* step past the kept block group item */
                        key.objectid += root->sectorsize;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = 0;
                }
                btrfs_release_path(&path);
                goto again;
        }
err:
        btrfs_release_path(&path);
        return ret;
}
1045
1046 static int block_group_free_all_extent(struct btrfs_trans_handle *trans,
1047                                        struct btrfs_root *root,
1048                                        struct block_group_record *bg)
1049 {
1050         struct btrfs_block_group_cache *cache;
1051         struct btrfs_fs_info *info;
1052         u64 start;
1053         u64 end;
1054
1055         info = root->fs_info;
1056         cache = btrfs_lookup_block_group(info, bg->objectid);
1057         if (!cache)
1058                 return -ENOENT;
1059
1060         start = cache->key.objectid;
1061         end = start + cache->key.offset - 1;
1062
1063         set_extent_bits(&info->block_group_cache, start, end,
1064                         BLOCK_GROUP_DIRTY, GFP_NOFS);
1065         set_extent_dirty(&info->free_space_cache, start, end, GFP_NOFS);
1066
1067         btrfs_set_block_group_used(&cache->item, 0);
1068
1069         return 0;
1070 }
1071
1072 static int remove_chunk_extent_item(struct btrfs_trans_handle *trans,
1073                                     struct recover_control *rc,
1074                                     struct btrfs_root *root)
1075 {
1076         struct chunk_record *chunk;
1077         int ret = 0;
1078
1079         list_for_each_entry(chunk, &rc->good_chunks, list) {
1080                 if (!(chunk->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1081                         continue;
1082                 ret = block_group_remove_all_extent_items(trans, root,
1083                                                           chunk->bg_rec);
1084                 if (ret)
1085                         return ret;
1086
1087                 ret = block_group_free_all_extent(trans, root, chunk->bg_rec);
1088                 if (ret)
1089                         return ret;
1090         }
1091         return ret;
1092 }
1093
1094 static int __rebuild_chunk_root(struct btrfs_trans_handle *trans,
1095                                 struct recover_control *rc,
1096                                 struct btrfs_root *root)
1097 {
1098         u64 min_devid = -1;
1099         struct btrfs_device *dev;
1100         struct extent_buffer *cow;
1101         struct btrfs_disk_key disk_key;
1102         int ret = 0;
1103
1104         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
1105                 if (min_devid > dev->devid)
1106                         min_devid = dev->devid;
1107         }
1108         disk_key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1109         disk_key.type = BTRFS_DEV_ITEM_KEY;
1110         disk_key.offset = min_devid;
1111
1112         cow = btrfs_alloc_free_block(trans, root, root->nodesize,
1113                                      BTRFS_CHUNK_TREE_OBJECTID,
1114                                      &disk_key, 0, 0, 0);
1115         btrfs_set_header_bytenr(cow, cow->start);
1116         btrfs_set_header_generation(cow, trans->transid);
1117         btrfs_set_header_nritems(cow, 0);
1118         btrfs_set_header_level(cow, 0);
1119         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1120         btrfs_set_header_owner(cow, BTRFS_CHUNK_TREE_OBJECTID);
1121         write_extent_buffer(cow, root->fs_info->fsid,
1122                         btrfs_header_fsid(), BTRFS_FSID_SIZE);
1123
1124         write_extent_buffer(cow, root->fs_info->chunk_tree_uuid,
1125                         btrfs_header_chunk_tree_uuid(cow),
1126                         BTRFS_UUID_SIZE);
1127
1128         root->node = cow;
1129         btrfs_mark_buffer_dirty(cow);
1130
1131         return ret;
1132 }
1133
1134 static int __rebuild_device_items(struct btrfs_trans_handle *trans,
1135                                   struct recover_control *rc,
1136                                   struct btrfs_root *root)
1137 {
1138         struct btrfs_device *dev;
1139         struct btrfs_key key;
1140         struct btrfs_dev_item *dev_item;
1141         int ret = 0;
1142
1143         dev_item = malloc(sizeof(struct btrfs_dev_item));
1144         if (!dev_item)
1145                 return -ENOMEM;
1146
1147         list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
1148                 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1149                 key.type = BTRFS_DEV_ITEM_KEY;
1150                 key.offset = dev->devid;
1151
1152                 btrfs_set_stack_device_generation(dev_item, 0);
1153                 btrfs_set_stack_device_type(dev_item, dev->type);
1154                 btrfs_set_stack_device_id(dev_item, dev->devid);
1155                 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
1156                 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
1157                 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
1158                 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
1159                 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
1160                 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
1161                 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
1162
1163                 ret = btrfs_insert_item(trans, root, &key,
1164                                         dev_item, sizeof(*dev_item));
1165         }
1166
1167         free(dev_item);
1168         return ret;
1169 }
1170
1171 static int __rebuild_chunk_items(struct btrfs_trans_handle *trans,
1172                                  struct recover_control *rc,
1173                                  struct btrfs_root *root)
1174 {
1175         struct btrfs_key key;
1176         struct btrfs_chunk *chunk = NULL;
1177         struct btrfs_root *chunk_root;
1178         struct chunk_record *chunk_rec;
1179         int ret;
1180
1181         chunk_root = root->fs_info->chunk_root;
1182
1183         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1184                 chunk = create_chunk_item(chunk_rec);
1185                 if (!chunk)
1186                         return -ENOMEM;
1187
1188                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1189                 key.type = BTRFS_CHUNK_ITEM_KEY;
1190                 key.offset = chunk_rec->offset;
1191
1192                 ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
1193                                 btrfs_chunk_item_size(chunk->num_stripes));
1194                 free(chunk);
1195                 if (ret)
1196                         return ret;
1197         }
1198         return 0;
1199 }
1200
1201 static int rebuild_chunk_tree(struct btrfs_trans_handle *trans,
1202                               struct recover_control *rc,
1203                               struct btrfs_root *root)
1204 {
1205         int ret = 0;
1206
1207         root = root->fs_info->chunk_root;
1208
1209         ret = __rebuild_chunk_root(trans, rc, root);
1210         if (ret)
1211                 return ret;
1212
1213         ret = __rebuild_device_items(trans, rc, root);
1214         if (ret)
1215                 return ret;
1216
1217         ret = __rebuild_chunk_items(trans, rc, root);
1218
1219         return ret;
1220 }
1221
1222 static int rebuild_sys_array(struct recover_control *rc,
1223                              struct btrfs_root *root)
1224 {
1225         struct btrfs_chunk *chunk;
1226         struct btrfs_key key;
1227         struct chunk_record *chunk_rec;
1228         int ret = 0;
1229         u16 num_stripes;
1230
1231         btrfs_set_super_sys_array_size(root->fs_info->super_copy, 0);
1232
1233         list_for_each_entry(chunk_rec, &rc->good_chunks, list) {
1234                 if (!(chunk_rec->type_flags & BTRFS_BLOCK_GROUP_SYSTEM))
1235                         continue;
1236
1237                 num_stripes = chunk_rec->num_stripes;
1238                 chunk = create_chunk_item(chunk_rec);
1239                 if (!chunk) {
1240                         ret = -ENOMEM;
1241                         break;
1242                 }
1243
1244                 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1245                 key.type = BTRFS_CHUNK_ITEM_KEY;
1246                 key.offset = chunk_rec->offset;
1247
1248                 ret = btrfs_add_system_chunk(NULL, root, &key, chunk,
1249                                 btrfs_chunk_item_size(num_stripes));
1250                 free(chunk);
1251                 if (ret)
1252                         break;
1253         }
1254         return ret;
1255
1256 }
1257
/*
 * Open the filesystem without trusting the on-disk chunk tree: the device
 * maps come from the recovered chunk records in @rc instead of the chunk
 * tree items.
 *
 * Returns the fs_root on success or an ERR_PTR on failure; error paths
 * unwind in reverse order of setup (roots -> caches -> devices -> fs_info).
 */
static struct btrfs_root *
open_ctree_with_broken_chunk(struct recover_control *rc)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_super_block *disk_super;
	struct extent_buffer *eb;
	u32 sectorsize;
	u32 nodesize;
	u32 leafsize;
	u32 stripesize;
	int ret;

	fs_info = btrfs_new_fs_info(1, BTRFS_SUPER_INFO_OFFSET);
	if (!fs_info) {
		fprintf(stderr, "Failed to allocate memory for fs_info\n");
		return ERR_PTR(-ENOMEM);
	}
	/* tells the rest of the code to skip normal chunk tree reads */
	fs_info->is_chunk_recover = 1;

	fs_info->fs_devices = rc->fs_devices;
	ret = btrfs_open_devices(fs_info->fs_devices, O_RDWR);
	if (ret)
		goto out;

	disk_super = fs_info->super_copy;
	ret = btrfs_read_dev_super(fs_info->fs_devices->latest_bdev,
				   disk_super, fs_info->super_bytenr, 1);
	if (ret) {
		fprintf(stderr, "No valid btrfs found\n");
		goto out_devices;
	}

	memcpy(fs_info->fsid, &disk_super->fsid, BTRFS_FSID_SIZE);

	ret = btrfs_check_fs_compatibility(disk_super, 1);
	if (ret)
		goto out_devices;

	nodesize = btrfs_super_nodesize(disk_super);
	leafsize = btrfs_super_leafsize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
	stripesize = btrfs_super_stripesize(disk_super);

	/* set up the chunk root by hand; it won't be read from disk */
	__setup_root(nodesize, leafsize, sectorsize, stripesize,
		     fs_info->chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);

	/* replace the chunk tree lookups with the recovered records */
	ret = build_device_maps_by_chunk_records(rc, fs_info->chunk_root);
	if (ret)
		goto out_cleanup;

	ret = btrfs_setup_all_roots(fs_info, 0, 0);
	if (ret)
		goto out_failed;

	eb = fs_info->tree_root->node;
	read_extent_buffer(eb, fs_info->chunk_tree_uuid,
			   btrfs_header_chunk_tree_uuid(eb),
			   BTRFS_UUID_SIZE);

	return fs_info->fs_root;
out_failed:
	btrfs_release_all_roots(fs_info);
out_cleanup:
	btrfs_cleanup_all_caches(fs_info);
out_devices:
	btrfs_close_devices(fs_info->fs_devices);
out:
	btrfs_free_fs_info(fs_info);
	return ERR_PTR(ret);
}
1328
1329 static int recover_prepare(struct recover_control *rc, char *path)
1330 {
1331         int ret;
1332         int fd;
1333         struct btrfs_super_block *sb;
1334         struct btrfs_fs_devices *fs_devices;
1335
1336         ret = 0;
1337         fd = open(path, O_RDONLY);
1338         if (fd < 0) {
1339                 fprintf(stderr, "open %s\n error.\n", path);
1340                 return -1;
1341         }
1342
1343         sb = malloc(BTRFS_SUPER_INFO_SIZE);
1344         if (!sb) {
1345                 fprintf(stderr, "allocating memory for sb failed.\n");
1346                 ret = -ENOMEM;
1347                 goto fail_close_fd;
1348         }
1349
1350         ret = btrfs_read_dev_super(fd, sb, BTRFS_SUPER_INFO_OFFSET, 1);
1351         if (ret) {
1352                 fprintf(stderr, "read super block error\n");
1353                 goto fail_free_sb;
1354         }
1355
1356         rc->sectorsize = btrfs_super_sectorsize(sb);
1357         rc->leafsize = btrfs_super_leafsize(sb);
1358         rc->generation = btrfs_super_generation(sb);
1359         rc->chunk_root_generation = btrfs_super_chunk_root_generation(sb);
1360         rc->csum_size = btrfs_super_csum_size(sb);
1361
1362         /* if seed, the result of scanning below will be partial */
1363         if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_SEEDING) {
1364                 fprintf(stderr, "this device is seed device\n");
1365                 ret = -1;
1366                 goto fail_free_sb;
1367         }
1368
1369         ret = btrfs_scan_fs_devices(fd, path, &fs_devices, 0, 1);
1370         if (ret)
1371                 goto fail_free_sb;
1372
1373         rc->fs_devices = fs_devices;
1374
1375         if (rc->verbose)
1376                 print_all_devices(&rc->fs_devices->devices);
1377
1378 fail_free_sb:
1379         free(sb);
1380 fail_close_fd:
1381         close(fd);
1382         return ret;
1383 }
1384
1385 static int btrfs_get_device_extents(u64 chunk_object,
1386                                     struct list_head *orphan_devexts,
1387                                     struct list_head *ret_list)
1388 {
1389         struct device_extent_record *devext;
1390         struct device_extent_record *next;
1391         int count = 0;
1392
1393         list_for_each_entry_safe(devext, next, orphan_devexts, chunk_list) {
1394                 if (devext->chunk_offset == chunk_object) {
1395                         list_move_tail(&devext->chunk_list, ret_list);
1396                         count++;
1397                 }
1398         }
1399         return count;
1400 }
1401
1402 static int calc_num_stripes(u64 type)
1403 {
1404         if (type & (BTRFS_BLOCK_GROUP_RAID0 |
1405                     BTRFS_BLOCK_GROUP_RAID10 |
1406                     BTRFS_BLOCK_GROUP_RAID5 |
1407                     BTRFS_BLOCK_GROUP_RAID6))
1408                 return 0;
1409         else if (type & (BTRFS_BLOCK_GROUP_RAID1 |
1410                          BTRFS_BLOCK_GROUP_DUP))
1411                 return 2;
1412         else
1413                 return 1;
1414 }
1415
1416 static inline int calc_sub_nstripes(u64 type)
1417 {
1418         if (type & BTRFS_BLOCK_GROUP_RAID10)
1419                 return 2;
1420         else
1421                 return 1;
1422 }
1423
1424 static int btrfs_verify_device_extents(struct block_group_record *bg,
1425                                        struct list_head *devexts, int ndevexts)
1426 {
1427         struct device_extent_record *devext;
1428         u64 strpie_length;
1429         int expected_num_stripes;
1430
1431         expected_num_stripes = calc_num_stripes(bg->flags);
1432         if (expected_num_stripes && expected_num_stripes != ndevexts)
1433                 return 1;
1434
1435         strpie_length = calc_stripe_length(bg->flags, bg->offset, ndevexts);
1436         list_for_each_entry(devext, devexts, chunk_list) {
1437                 if (devext->length != strpie_length)
1438                         return 1;
1439         }
1440         return 0;
1441 }
1442
1443 static int btrfs_rebuild_unordered_chunk_stripes(struct recover_control *rc,
1444                                                  struct chunk_record *chunk)
1445 {
1446         struct device_extent_record *devext;
1447         struct btrfs_device *device;
1448         int i;
1449
1450         devext = list_first_entry(&chunk->dextents, struct device_extent_record,
1451                                   chunk_list);
1452         for (i = 0; i < chunk->num_stripes; i++) {
1453                 chunk->stripes[i].devid = devext->objectid;
1454                 chunk->stripes[i].offset = devext->offset;
1455                 device = btrfs_find_device_by_devid(rc->fs_devices,
1456                                                     devext->objectid,
1457                                                     0);
1458                 if (!device)
1459                         return -ENOENT;
1460                 BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
1461                                                   devext->objectid,
1462                                                   1));
1463                 memcpy(chunk->stripes[i].dev_uuid, device->uuid,
1464                        BTRFS_UUID_SIZE);
1465                 devext = list_next_entry(devext, chunk_list);
1466         }
1467         return 0;
1468 }
1469
1470 static int btrfs_calc_stripe_index(struct chunk_record *chunk, u64 logical)
1471 {
1472         u64 offset = logical - chunk->offset;
1473         int stripe_nr;
1474         int nr_data_stripes;
1475         int index;
1476
1477         stripe_nr = offset / chunk->stripe_len;
1478         if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID0) {
1479                 index = stripe_nr % chunk->num_stripes;
1480         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID10) {
1481                 index = stripe_nr % (chunk->num_stripes / chunk->sub_stripes);
1482                 index *= chunk->sub_stripes;
1483         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5) {
1484                 nr_data_stripes = chunk->num_stripes - 1;
1485                 index = stripe_nr % nr_data_stripes;
1486                 stripe_nr /= nr_data_stripes;
1487                 index = (index + stripe_nr) % chunk->num_stripes;
1488         } else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6) {
1489                 nr_data_stripes = chunk->num_stripes - 2;
1490                 index = stripe_nr % nr_data_stripes;
1491                 stripe_nr /= nr_data_stripes;
1492                 index = (index + stripe_nr) % chunk->num_stripes;
1493         } else {
1494                 return -1;
1495         }
1496         return index;
1497 }
1498
1499 /* calc the logical offset which is the start of the next stripe. */
1500 static inline u64 btrfs_next_stripe_logical_offset(struct chunk_record *chunk,
1501                                                    u64 logical)
1502 {
1503         u64 offset = logical - chunk->offset;
1504
1505         offset /= chunk->stripe_len;
1506         offset *= chunk->stripe_len;
1507         offset += chunk->stripe_len;
1508
1509         return offset + chunk->offset;
1510 }
1511
1512 static int is_extent_record_in_device_extent(struct extent_record *er,
1513                                              struct device_extent_record *dext,
1514                                              int *mirror)
1515 {
1516         int i;
1517
1518         for (i = 0; i < er->nmirrors; i++) {
1519                 if (er->devices[i]->devid == dext->objectid &&
1520                     er->offsets[i] >= dext->offset &&
1521                     er->offsets[i] < dext->offset + dext->length) {
1522                         *mirror = i;
1523                         return 1;
1524                 }
1525         }
1526         return 0;
1527 }
1528
/*
 * Rebuild the stripe order of a metadata chunk whose profile is
 * order-sensitive (RAID0/10/5/6).
 *
 * The extent buffers recorded in rc->eb_cache pin which device extent holds
 * which stripe slot: for each cached extent, compute its stripe index from
 * the logical address and match it to the device extent that contains one
 * of its mirrors.  Stripes for which no cached extent exists carry no live
 * data and can be assigned freely afterwards — except for RAID5/6, where
 * the parity rotation makes free reordering unsafe (-EINVAL).
 *
 * Returns 0 on success, -ENOENT/-EINVAL on failure; on failure the device
 * extents are spliced back onto chunk->dextents.
 */
static int
btrfs_rebuild_ordered_meta_chunk_stripes(struct recover_control *rc,
					 struct chunk_record *chunk)
{
	u64 start = chunk->offset;
	u64 end = chunk->offset + chunk->length;
	struct cache_extent *cache;
	struct extent_record *er;
	struct device_extent_record *devext;
	struct device_extent_record *next;
	struct btrfs_device *device;
	LIST_HEAD(devexts);
	int index;
	int mirror;
	int ret;

	cache = lookup_cache_extent(&rc->eb_cache,
				    start, chunk->length);
	if (!cache) {
		/* No used space, we can reorder the stripes freely. */
		ret = btrfs_rebuild_unordered_chunk_stripes(rc, chunk);
		return ret;
	}

	/* park all device extents; matched ones move back one by one */
	list_splice_init(&chunk->dextents, &devexts);
again:
	er = container_of(cache, struct extent_record, cache);
	index = btrfs_calc_stripe_index(chunk, er->cache.start);
	BUG_ON(index == -1);
	/* this stripe slot was already pinned by an earlier extent */
	if (chunk->stripes[index].devid)
		goto next;
	list_for_each_entry_safe(devext, next, &devexts, chunk_list) {
		if (is_extent_record_in_device_extent(er, devext, &mirror)) {
			/*
			 * Pin this stripe slot to the device extent that
			 * holds the mirror, then advance to the next slot
			 * (consecutive mirrors fill consecutive slots).
			 */
			chunk->stripes[index].devid = devext->objectid;
			chunk->stripes[index].offset = devext->offset;
			memcpy(chunk->stripes[index].dev_uuid,
			       er->devices[mirror]->uuid,
			       BTRFS_UUID_SIZE);
			index++;
			list_move(&devext->chunk_list, &chunk->dextents);
		}
	}
next:
	/* jump to the next stripe boundary and look for more extents */
	start = btrfs_next_stripe_logical_offset(chunk, er->cache.start);
	if (start >= end)
		goto no_extent_record;

	cache = lookup_cache_extent(&rc->eb_cache, start, end - start);
	if (cache)
		goto again;
no_extent_record:
	if (list_empty(&devexts))
		return 0;

	if (chunk->type_flags & (BTRFS_BLOCK_GROUP_RAID5 |
				 BTRFS_BLOCK_GROUP_RAID6)) {
		/* Fixme: try to recover the order by the parity block. */
		list_splice_tail(&devexts, &chunk->dextents);
		return -EINVAL;
	}

	/* There is no data on the lost stripes, we can reorder them freely. */
	for (index = 0; index < chunk->num_stripes; index++) {
		if (chunk->stripes[index].devid)
			continue;

		devext = list_first_entry(&devexts,
					  struct device_extent_record,
					   chunk_list);
		list_move(&devext->chunk_list, &chunk->dextents);

		chunk->stripes[index].devid = devext->objectid;
		chunk->stripes[index].offset = devext->offset;
		device = btrfs_find_device_by_devid(rc->fs_devices,
						    devext->objectid,
						    0);
		if (!device) {
			list_splice_tail(&devexts, &chunk->dextents);
			return -EINVAL;
		}
		BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
						  devext->objectid,
						  1));
		memcpy(chunk->stripes[index].dev_uuid, device->uuid,
		       BTRFS_UUID_SIZE);
	}
	return 0;
}
1617
1618 #define BTRFS_ORDERED_RAID      (BTRFS_BLOCK_GROUP_RAID0 |      \
1619                                  BTRFS_BLOCK_GROUP_RAID10 |     \
1620                                  BTRFS_BLOCK_GROUP_RAID5 |      \
1621                                  BTRFS_BLOCK_GROUP_RAID6)
1622
1623 static int btrfs_rebuild_chunk_stripes(struct recover_control *rc,
1624                                        struct chunk_record *chunk)
1625 {
1626         int ret;
1627
1628         /*
1629          * All the data in the system metadata chunk will be dropped,
1630          * so we need not guarantee that the data is right or not, that
1631          * is we can reorder the stripes in the system metadata chunk.
1632          */
1633         if ((chunk->type_flags & BTRFS_BLOCK_GROUP_METADATA) &&
1634             (chunk->type_flags & BTRFS_ORDERED_RAID))
1635                 ret =btrfs_rebuild_ordered_meta_chunk_stripes(rc, chunk);
1636         else if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA) &&
1637                  (chunk->type_flags & BTRFS_ORDERED_RAID))
1638                 ret = 1;        /* Be handled after the fs is opened. */
1639         else
1640                 ret = btrfs_rebuild_unordered_chunk_stripes(rc, chunk);
1641
1642         return ret;
1643 }
1644
/*
 * Advance to the next checksum in the csum tree and read it out.
 *
 * The triple (*leaf, *slot, *csum_offset) identifies the current position:
 * the (*csum_offset)-th checksum inside the csum item at *slot of *leaf.
 * When the current item is exhausted the function steps to the next slot
 * (and, if necessary, the next leaf), updating *leaf, *slot and *key to the
 * new item and resetting *csum_offset to 0.
 *
 * Returns:
 *   0  - a checksum was read into *tree_csum
 *   1  - the csum tree has no further leaves
 *   2  - the position reached or passed @end (logical byte offset)
 *  -1  - error while walking the tree
 */
static int next_csum(struct btrfs_root *root,
		     struct extent_buffer **leaf,
		     struct btrfs_path *path,
		     int *slot,
		     u64 *csum_offset,
		     u32 *tree_csum,
		     u64 end,
		     struct btrfs_key *key)
{
	int ret = 0;
	struct btrfs_root *csum_root = root->fs_info->csum_root;
	struct btrfs_csum_item *csum_item;
	u32 blocksize = root->sectorsize;
	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
	/* number of checksums packed into the current csum item */
	int csums_in_item = btrfs_item_size_nr(*leaf, *slot) / csum_size;

	if (*csum_offset >= csums_in_item) {
		/* current item exhausted: move to the next slot/leaf */
		++(*slot);
		*csum_offset = 0;
		if (*slot >= btrfs_header_nritems(*leaf)) {
			ret = btrfs_next_leaf(csum_root, path);
			if (ret < 0)
				return -1;
			else if (ret > 0)
				return 1;
			*leaf = path->nodes[0];
			*slot = path->slots[0];
		}
		/* refresh the caller's key to match the new item */
		btrfs_item_key_to_cpu(*leaf, key, *slot);
	}

	/* each checksum covers one blocksize-sized block from key->offset */
	if (key->offset + (*csum_offset) * blocksize >= end)
		return 2;
	csum_item = btrfs_item_ptr(*leaf, *slot, struct btrfs_csum_item);
	/* index into the packed checksum array inside the item */
	csum_item = (struct btrfs_csum_item *)((unsigned char *)csum_item
					     + (*csum_offset) * csum_size);
	/* copies csum_size bytes; csum_size fits a u32 for crc32c csums */
	read_extent_buffer(*leaf, tree_csum,
			  (unsigned long)csum_item, csum_size);
	return ret;
}
1685
1686 static u64 calc_data_offset(struct btrfs_key *key,
1687                             struct chunk_record *chunk,
1688                             u64 dev_offset,
1689                             u64 csum_offset,
1690                             u32 blocksize)
1691 {
1692         u64 data_offset;
1693         int logical_stripe_nr;
1694         int dev_stripe_nr;
1695         int nr_data_stripes;
1696
1697         data_offset = key->offset + csum_offset * blocksize - chunk->offset;
1698         nr_data_stripes = chunk->num_stripes;
1699
1700         if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5)
1701                 nr_data_stripes -= 1;
1702         else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6)
1703                 nr_data_stripes -= 2;
1704
1705         logical_stripe_nr = data_offset / chunk->stripe_len;
1706         dev_stripe_nr = logical_stripe_nr / nr_data_stripes;
1707
1708         data_offset -= logical_stripe_nr * chunk->stripe_len;
1709         data_offset += dev_stripe_nr * chunk->stripe_len;
1710
1711         return dev_offset + data_offset;
1712 }
1713
1714 static int check_one_csum(int fd, u64 start, u32 len, u32 tree_csum)
1715 {
1716         char *data;
1717         int ret = 0;
1718         u32 csum_result = ~(u32)0;
1719
1720         data = malloc(len);
1721         if (!data)
1722                 return -1;
1723         ret = pread64(fd, data, len, start);
1724         if (ret < 0 || ret != len) {
1725                 ret = -1;
1726                 goto out;
1727         }
1728         ret = 0;
1729         csum_result = btrfs_csum_data(NULL, data, csum_result, len);
1730         btrfs_csum_final(csum_result, (char *)&csum_result);
1731         if (csum_result != tree_csum)
1732                 ret = 1;
1733 out:
1734         free(data);
1735         return ret;
1736 }
1737
1738 static u64 item_end_offset(struct btrfs_root *root, struct btrfs_key *key,
1739                            struct extent_buffer *leaf, int slot) {
1740         u32 blocksize = root->sectorsize;
1741         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1742
1743         u64 offset = btrfs_item_size_nr(leaf, slot);
1744         offset /= csum_size;
1745         offset *= blocksize;
1746         offset += key->offset;
1747
1748         return offset;
1749 }
1750
1751 static int insert_stripe(struct list_head *devexts,
1752                          struct recover_control *rc,
1753                          struct chunk_record *chunk,
1754                          int index) {
1755         struct device_extent_record *devext;
1756         struct btrfs_device *dev;
1757
1758         devext = list_entry(devexts->next, struct device_extent_record,
1759                             chunk_list);
1760         dev = btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
1761                                         0);
1762         if (!dev)
1763                 return 1;
1764         BUG_ON(btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
1765                                         1));
1766
1767         chunk->stripes[index].devid = devext->objectid;
1768         chunk->stripes[index].offset = devext->offset;
1769         memcpy(chunk->stripes[index].dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
1770
1771         list_move(&devext->chunk_list, &chunk->dextents);
1772
1773         return 0;
1774 }
1775
1776 static inline int count_devext_records(struct list_head *record_list)
1777 {
1778         int num_of_records = 0;
1779         struct device_extent_record *devext;
1780
1781         list_for_each_entry(devext, record_list, chunk_list)
1782                 num_of_records++;
1783
1784         return num_of_records;
1785 }
1786
1787 static int fill_chunk_up(struct chunk_record *chunk, struct list_head *devexts,
1788                          struct recover_control *rc)
1789 {
1790         int ret = 0;
1791         int i;
1792
1793         for (i = 0; i < chunk->num_stripes; i++) {
1794                 if (!chunk->stripes[i].devid) {
1795                         ret = insert_stripe(devexts, rc, chunk, i);
1796                         if (ret)
1797                                 break;
1798                 }
1799         }
1800
1801         return ret;
1802 }
1803
1804 #define EQUAL_STRIPE (1 << 0)
1805
/*
 * Recover the stripe order of an ordered-RAID (RAID0/10/5/6) data chunk.
 *
 * Strategy: walk the chunk stripe by stripe; for every checksummed block
 * inside the current stripe, read the corresponding data from each
 * candidate device extent and drop the extents whose data does not match
 * the checksum.  When exactly one candidate survives, it must be the
 * device extent backing that stripe, so it is pinned into the stripe slot.
 * Extents eliminated for one stripe are re-queued (the "unordered" list)
 * as candidates for the next stripe.
 *
 * Sets EQUAL_STRIPE in *flags when the csum tree ran out before the
 * candidates could be told apart (several stripes hold identical data).
 *
 * Returns 0 on success, non-zero when the chunk could not be fully
 * reconstructed or a lower-level operation failed.
 */
static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
                                           struct btrfs_root *root,
                                           struct chunk_record *chunk,
                                           u8 *flags)
{
        int i;
        int ret = 0;
        int slot;
        struct btrfs_path path;
        struct btrfs_key prev_key;
        struct btrfs_key key;
        struct btrfs_root *csum_root;
        struct extent_buffer *leaf;
        struct device_extent_record *devext;
        struct device_extent_record *next;
        struct btrfs_device *dev;
        u64 start = chunk->offset;              /* logical start of current stripe */
        u64 end = start + chunk->stripe_len;    /* logical end of current stripe */
        u64 chunk_end = chunk->offset + chunk->length;
        u64 csum_offset = 0;                    /* block index within the csum item */
        u64 data_offset;
        u32 blocksize = root->sectorsize;
        u32 tree_csum;
        int index = 0;
        int num_unordered = 0;
        LIST_HEAD(unordered);   /* extents ruled out for the current stripe */
        LIST_HEAD(candidates);  /* extents still possible for the current stripe */

        csum_root = root->fs_info->csum_root;
        btrfs_init_path(&path);
        list_splice_init(&chunk->dextents, &candidates);
again:
        /* a single remaining candidate needs no csum check; finish in out: */
        if (list_is_last(candidates.next, &candidates))
                goto out;

        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
        key.type = BTRFS_EXTENT_CSUM_KEY;
        key.offset = start;

        ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
        if (ret < 0) {
                fprintf(stderr, "Search csum failed(%d)\n", ret);
                goto fail_out;
        }
        leaf = path.nodes[0];
        slot = path.slots[0];
        if (ret > 0) {
                /*
                 * No exact csum item at `start`; locate the item whose
                 * checksum range covers `start`, or the first item after it.
                 */
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(csum_root, &path);
                        if (ret < 0) {
                                fprintf(stderr,
                                        "Walk tree failed(%d)\n", ret);
                                goto fail_out;
                        } else if (ret > 0) {
                                /* past the last leaf: check if the last item
                                 * still overlaps this stripe */
                                slot = btrfs_header_nritems(leaf) - 1;
                                btrfs_item_key_to_cpu(leaf, &key, slot);
                                if (item_end_offset(root, &key, leaf, slot)
                                                                > start) {
                                        csum_offset = start - key.offset;
                                        csum_offset /= blocksize;
                                        goto next_csum;
                                }
                                goto next_stripe;
                        }
                        leaf = path.nodes[0];
                        slot = path.slots[0];
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);
                ret = btrfs_previous_item(csum_root, &path, 0,
                                          BTRFS_EXTENT_CSUM_KEY);
                if (ret < 0)
                        goto fail_out;
                else if (ret > 0) {
                        /* no previous csum item: use the following one if it
                         * still falls inside this stripe */
                        if (key.offset >= end)
                                goto next_stripe;
                        else
                                goto next_csum;
                }
                leaf = path.nodes[0];
                slot = path.slots[0];

                btrfs_item_key_to_cpu(leaf, &prev_key, slot);
                if (item_end_offset(root, &prev_key, leaf, slot) > start) {
                        /* the previous item's csums overlap `start` */
                        csum_offset = start - prev_key.offset;
                        csum_offset /= blocksize;
                        btrfs_item_key_to_cpu(leaf, &key, slot);
                } else {
                        if (key.offset >= end)
                                goto next_stripe;
                }

                if (key.offset + csum_offset * blocksize > chunk_end)
                        goto out;
        }
next_csum:
        /* note: `next_csum` is both this label and the helper function below;
         * C keeps labels in a separate namespace so both resolve correctly */
        ret = next_csum(root, &leaf, &path, &slot, &csum_offset, &tree_csum,
                        end, &key);
        if (ret < 0) {
                fprintf(stderr, "Fetch csum failed\n");
                goto fail_out;
        } else if (ret == 1) {
                /* ran out of csums with >1 candidate left: stripes are equal */
                if (!(*flags & EQUAL_STRIPE))
                        *flags |= EQUAL_STRIPE;
                goto out;
        } else if (ret == 2)
                goto next_stripe;

        /* eliminate every candidate whose on-disk data fails the checksum */
        list_for_each_entry_safe(devext, next, &candidates, chunk_list) {
                data_offset = calc_data_offset(&key, chunk, devext->offset,
                                               csum_offset, blocksize);
                dev = btrfs_find_device_by_devid(rc->fs_devices,
                                                 devext->objectid, 0);
                if (!dev) {
                        ret = 1;
                        goto fail_out;
                }
                BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
                                                  devext->objectid, 1));

                ret = check_one_csum(dev->fd, data_offset, blocksize,
                                     tree_csum);
                if (ret < 0)
                        goto fail_out;
                else if (ret > 0)
                        list_move(&devext->chunk_list, &unordered);
        }

        if (list_empty(&candidates)) {
                num_unordered = count_devext_records(&unordered);
                /* RAID6 with 2 leftovers: either order works (P/Q stripes) */
                if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6
                                        && num_unordered == 2) {
                        btrfs_release_path(&path);
                        ret = fill_chunk_up(chunk, &unordered, rc);
                        return ret;
                }

                goto next_stripe;
        }

        if (list_is_last(candidates.next, &candidates)) {
                /* exactly one candidate left: it backs this stripe */
                index = btrfs_calc_stripe_index(chunk,
                        key.offset + csum_offset * blocksize);
                BUG_ON(index == -1);
                if (chunk->stripes[index].devid)
                        goto next_stripe;
                ret = insert_stripe(&candidates, rc, chunk, index);
                if (ret)
                        goto fail_out;
        } else {
                /* still ambiguous: try the next checksummed block */
                csum_offset++;
                goto next_csum;
        }
next_stripe:
        /* advance to the next data stripe, recycling ruled-out extents */
        start = btrfs_next_stripe_logical_offset(chunk, start);
        end = min(start + chunk->stripe_len, chunk_end);
        list_splice_init(&unordered, &candidates);
        btrfs_release_path(&path);
        csum_offset = 0;
        if (end < chunk_end)
                goto again;
out:
        ret = 0;
        list_splice_init(&candidates, &unordered);
        num_unordered = count_devext_records(&unordered);
        if (num_unordered == 1) {
                /* one extent left: it belongs in the only unfilled slot */
                for (i = 0; i < chunk->num_stripes; i++) {
                        if (!chunk->stripes[i].devid) {
                                index = i;
                                break;
                        }
                }
                ret = insert_stripe(&unordered, rc, chunk, index);
                if (ret)
                        goto fail_out;
        } else {
                /*
                 * RAID5 with 2 leftovers or RAID6 with 3: the remaining
                 * slots are parity stripes, any order is acceptable.
                 */
                if ((num_unordered == 2 && chunk->type_flags
                        & BTRFS_BLOCK_GROUP_RAID5)
                 || (num_unordered == 3 && chunk->type_flags
                        & BTRFS_BLOCK_GROUP_RAID6)) {
                        ret = fill_chunk_up(chunk, &unordered, rc);
                }
        }
fail_out:
        /* non-zero when an error occurred or extents remain unplaced */
        ret = !!ret || (list_empty(&unordered) ? 0 : 1);
        list_splice_init(&candidates, &chunk->dextents);
        list_splice_init(&unordered, &chunk->dextents);
        btrfs_release_path(&path);

        return ret;
}
1996
1997 static int btrfs_rebuild_ordered_data_chunk_stripes(struct recover_control *rc,
1998                                            struct btrfs_root *root)
1999 {
2000         struct chunk_record *chunk;
2001         struct chunk_record *next;
2002         int ret = 0;
2003         int err;
2004         u8 flags;
2005
2006         list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
2007                 if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA)
2008                  && (chunk->type_flags & BTRFS_ORDERED_RAID)) {
2009                         flags = 0;
2010                         err = rebuild_raid_data_chunk_stripes(rc, root, chunk,
2011                                                               &flags);
2012                         if (err) {
2013                                 list_move(&chunk->list, &rc->bad_chunks);
2014                                 if (flags & EQUAL_STRIPE)
2015                                         fprintf(stderr,
2016                         "Failure: too many equal stripes in chunk[%llu %llu]\n",
2017                                                 chunk->offset, chunk->length);
2018                                 if (!ret)
2019                                         ret = err;
2020                         } else
2021                                 list_move(&chunk->list, &rc->good_chunks);
2022                 }
2023         }
2024         return ret;
2025 }
2026
2027 static int btrfs_recover_chunks(struct recover_control *rc)
2028 {
2029         struct chunk_record *chunk;
2030         struct block_group_record *bg;
2031         struct block_group_record *next;
2032         LIST_HEAD(new_chunks);
2033         LIST_HEAD(devexts);
2034         int nstripes;
2035         int ret;
2036
2037         /* create the chunk by block group */
2038         list_for_each_entry_safe(bg, next, &rc->bg.block_groups, list) {
2039                 nstripes = btrfs_get_device_extents(bg->objectid,
2040                                                     &rc->devext.no_chunk_orphans,
2041                                                     &devexts);
2042                 chunk = malloc(btrfs_chunk_record_size(nstripes));
2043                 if (!chunk)
2044                         return -ENOMEM;
2045                 memset(chunk, 0, btrfs_chunk_record_size(nstripes));
2046                 INIT_LIST_HEAD(&chunk->dextents);
2047                 chunk->bg_rec = bg;
2048                 chunk->cache.start = bg->objectid;
2049                 chunk->cache.size = bg->offset;
2050                 chunk->objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2051                 chunk->type = BTRFS_CHUNK_ITEM_KEY;
2052                 chunk->offset = bg->objectid;
2053                 chunk->generation = bg->generation;
2054                 chunk->length = bg->offset;
2055                 chunk->owner = BTRFS_CHUNK_TREE_OBJECTID;
2056                 chunk->stripe_len = BTRFS_STRIPE_LEN;
2057                 chunk->type_flags = bg->flags;
2058                 chunk->io_width = BTRFS_STRIPE_LEN;
2059                 chunk->io_align = BTRFS_STRIPE_LEN;
2060                 chunk->sector_size = rc->sectorsize;
2061                 chunk->sub_stripes = calc_sub_nstripes(bg->flags);
2062
2063                 ret = insert_cache_extent(&rc->chunk, &chunk->cache);
2064                 BUG_ON(ret);
2065
2066                 if (!nstripes) {
2067                         list_add_tail(&chunk->list, &rc->bad_chunks);
2068                         continue;
2069                 }
2070
2071                 list_splice_init(&devexts, &chunk->dextents);
2072
2073                 ret = btrfs_verify_device_extents(bg, &devexts, nstripes);
2074                 if (ret) {
2075                         list_add_tail(&chunk->list, &rc->bad_chunks);
2076                         continue;
2077                 }
2078
2079                 chunk->num_stripes = nstripes;
2080                 ret = btrfs_rebuild_chunk_stripes(rc, chunk);
2081                 if (ret > 0)
2082                         list_add_tail(&chunk->list, &rc->unrepaired_chunks);
2083                 else if (ret < 0)
2084                         list_add_tail(&chunk->list, &rc->bad_chunks);
2085                 else
2086                         list_add_tail(&chunk->list, &rc->good_chunks);
2087         }
2088         /*
2089          * Don't worry about the lost orphan device extents, they don't
2090          * have its chunk and block group, they must be the old ones that
2091          * we have dropped.
2092          */
2093         return 0;
2094 }
2095
2096 /*
2097  * Return 0 when succesful, < 0 on error and > 0 if aborted by user
2098  */
2099 int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
2100 {
2101         int ret = 0;
2102         struct btrfs_root *root = NULL;
2103         struct btrfs_trans_handle *trans;
2104         struct recover_control rc;
2105
2106         init_recover_control(&rc, verbose, yes);
2107
2108         ret = recover_prepare(&rc, path);
2109         if (ret) {
2110                 fprintf(stderr, "recover prepare error\n");
2111                 return ret;
2112         }
2113
2114         ret = scan_devices(&rc);
2115         if (ret) {
2116                 fprintf(stderr, "scan chunk headers error\n");
2117                 goto fail_rc;
2118         }
2119
2120         if (cache_tree_empty(&rc.chunk) &&
2121             cache_tree_empty(&rc.bg.tree) &&
2122             cache_tree_empty(&rc.devext.tree)) {
2123                 fprintf(stderr, "no recoverable chunk\n");
2124                 goto fail_rc;
2125         }
2126
2127         print_scan_result(&rc);
2128
2129         ret = check_chunks(&rc.chunk, &rc.bg, &rc.devext, &rc.good_chunks,
2130                            &rc.bad_chunks, 1);
2131         print_check_result(&rc);
2132         if (ret) {
2133                 if (!list_empty(&rc.bg.block_groups) ||
2134                     !list_empty(&rc.devext.no_chunk_orphans)) {
2135                         ret = btrfs_recover_chunks(&rc);
2136                         if (ret)
2137                                 goto fail_rc;
2138                 }
2139                 /*
2140                  * If the chunk is healthy, its block group item and device
2141                  * extent item should be written on the disks. So, it is very
2142                  * likely that the bad chunk is a old one that has been
2143                  * droppped from the fs. Don't deal with them now, we will
2144                  * check it after the fs is opened.
2145                  */
2146         } else {
2147                 fprintf(stderr, "Check chunks successfully with no orphans\n");
2148                 goto fail_rc;
2149         }
2150
2151         root = open_ctree_with_broken_chunk(&rc);
2152         if (IS_ERR(root)) {
2153                 fprintf(stderr, "open with broken chunk error\n");
2154                 ret = PTR_ERR(root);
2155                 goto fail_rc;
2156         }
2157
2158         ret = check_all_chunks_by_metadata(&rc, root);
2159         if (ret) {
2160                 fprintf(stderr, "The chunks in memory can not match the metadata of the fs. Repair failed.\n");
2161                 goto fail_close_ctree;
2162         }
2163
2164         ret = btrfs_rebuild_ordered_data_chunk_stripes(&rc, root);
2165         if (ret) {
2166                 fprintf(stderr, "Failed to rebuild ordered chunk stripes.\n");
2167                 goto fail_close_ctree;
2168         }
2169
2170         if (!rc.yes) {
2171                 ret = ask_user("We are going to rebuild the chunk tree on disk, it might destroy the old metadata on the disk, Are you sure?");
2172                 if (!ret) {
2173                         ret = 1;
2174                         goto fail_close_ctree;
2175                 }
2176         }
2177
2178         trans = btrfs_start_transaction(root, 1);
2179         ret = remove_chunk_extent_item(trans, &rc, root);
2180         BUG_ON(ret);
2181
2182         ret = rebuild_chunk_tree(trans, &rc, root);
2183         BUG_ON(ret);
2184
2185         ret = rebuild_sys_array(&rc, root);
2186         BUG_ON(ret);
2187
2188         btrfs_commit_transaction(trans, root);
2189 fail_close_ctree:
2190         close_ctree(root);
2191 fail_rc:
2192         free_recover_control(&rc);
2193         return ret;
2194 }