btrfs-progs: check: introduce function to query tree block level
[platform/upstream/btrfs-progs.git] / cmds-check.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
26 #include <getopt.h>
27 #include <uuid/uuid.h>
28 #include "ctree.h"
29 #include "volumes.h"
30 #include "repair.h"
31 #include "disk-io.h"
32 #include "print-tree.h"
33 #include "task-utils.h"
34 #include "transaction.h"
35 #include "utils.h"
36 #include "commands.h"
37 #include "free-space-cache.h"
38 #include "free-space-tree.h"
39 #include "btrfsck.h"
40 #include "qgroup-verify.h"
41 #include "rbtree-utils.h"
42 #include "backref.h"
43 #include "ulist.h"
44
45 enum task_position {
46         TASK_EXTENTS,
47         TASK_FREE_SPACE,
48         TASK_FS_ROOTS,
49         TASK_NOTHING, /* have to be the last element */
50 };
51
52 struct task_ctx {
53         int progress_enabled;
54         enum task_position tp;
55
56         struct task_info *info;
57 };
58
59 static u64 bytes_used = 0;
60 static u64 total_csum_bytes = 0;
61 static u64 total_btree_bytes = 0;
62 static u64 total_fs_tree_bytes = 0;
63 static u64 total_extent_tree_bytes = 0;
64 static u64 btree_space_waste = 0;
65 static u64 data_bytes_allocated = 0;
66 static u64 data_bytes_referenced = 0;
67 static int found_old_backref = 0;
68 static LIST_HEAD(duplicate_extents);
69 static LIST_HEAD(delete_items);
70 static int no_holes = 0;
71 static int init_extent_tree = 0;
72 static int check_data_csum = 0;
73 static struct btrfs_fs_info *global_info;
74 static struct task_ctx ctx = { 0 };
75 static struct cache_tree *roots_info_cache = NULL;
76
77 struct extent_backref {
78         struct rb_node node;
79         unsigned int is_data:1;
80         unsigned int found_extent_tree:1;
81         unsigned int full_backref:1;
82         unsigned int found_ref:1;
83         unsigned int broken:1;
84 };
85
86 static inline struct extent_backref* rb_node_to_extent_backref(struct rb_node *node)
87 {
88         return rb_entry(node, struct extent_backref, node);
89 }
90
91 struct data_backref {
92         struct extent_backref node;
93         union {
94                 u64 parent;
95                 u64 root;
96         };
97         u64 owner;
98         u64 offset;
99         u64 disk_bytenr;
100         u64 bytes;
101         u64 ram_bytes;
102         u32 num_refs;
103         u32 found_ref;
104 };
105
106 static inline struct data_backref* to_data_backref(struct extent_backref *back)
107 {
108         return container_of(back, struct data_backref, node);
109 }
110
111 static int compare_data_backref(struct rb_node *node1, struct rb_node *node2)
112 {
113         struct extent_backref *ext1 = rb_node_to_extent_backref(node1);
114         struct extent_backref *ext2 = rb_node_to_extent_backref(node2);
115         struct data_backref *back1 = to_data_backref(ext1);
116         struct data_backref *back2 = to_data_backref(ext2);
117
118         WARN_ON(!ext1->is_data);
119         WARN_ON(!ext2->is_data);
120
121         /* parent and root are a union, so this covers both */
122         if (back1->parent > back2->parent)
123                 return 1;
124         if (back1->parent < back2->parent)
125                 return -1;
126
127         /* This is a full backref and the parents match. */
128         if (back1->node.full_backref)
129                 return 0;
130
131         if (back1->owner > back2->owner)
132                 return 1;
133         if (back1->owner < back2->owner)
134                 return -1;
135
136         if (back1->offset > back2->offset)
137                 return 1;
138         if (back1->offset < back2->offset)
139                 return -1;
140
141         if (back1->bytes > back2->bytes)
142                 return 1;
143         if (back1->bytes < back2->bytes)
144                 return -1;
145
146         if (back1->found_ref && back2->found_ref) {
147                 if (back1->disk_bytenr > back2->disk_bytenr)
148                         return 1;
149                 if (back1->disk_bytenr < back2->disk_bytenr)
150                         return -1;
151
152                 if (back1->found_ref > back2->found_ref)
153                         return 1;
154                 if (back1->found_ref < back2->found_ref)
155                         return -1;
156         }
157
158         return 0;
159 }
160
161 /*
162  * Much like data_backref, just removed the undetermined members
163  * and change it to use list_head.
164  * During extent scan, it is stored in root->orphan_data_extent.
165  * During fs tree scan, it is then moved to inode_rec->orphan_data_extents.
166  */
167 struct orphan_data_extent {
168         struct list_head list;
169         u64 root;
170         u64 objectid;
171         u64 offset;
172         u64 disk_bytenr;
173         u64 disk_len;
174 };
175
176 struct tree_backref {
177         struct extent_backref node;
178         union {
179                 u64 parent;
180                 u64 root;
181         };
182 };
183
184 static inline struct tree_backref* to_tree_backref(struct extent_backref *back)
185 {
186         return container_of(back, struct tree_backref, node);
187 }
188
189 static int compare_tree_backref(struct rb_node *node1, struct rb_node *node2)
190 {
191         struct extent_backref *ext1 = rb_node_to_extent_backref(node1);
192         struct extent_backref *ext2 = rb_node_to_extent_backref(node2);
193         struct tree_backref *back1 = to_tree_backref(ext1);
194         struct tree_backref *back2 = to_tree_backref(ext2);
195
196         WARN_ON(ext1->is_data);
197         WARN_ON(ext2->is_data);
198
199         /* parent and root are a union, so this covers both */
200         if (back1->parent > back2->parent)
201                 return 1;
202         if (back1->parent < back2->parent)
203                 return -1;
204
205         return 0;
206 }
207
208 static int compare_extent_backref(struct rb_node *node1, struct rb_node *node2)
209 {
210         struct extent_backref *ext1 = rb_node_to_extent_backref(node1);
211         struct extent_backref *ext2 = rb_node_to_extent_backref(node2);
212
213         if (ext1->is_data > ext2->is_data)
214                 return 1;
215
216         if (ext1->is_data < ext2->is_data)
217                 return -1;
218
219         if (ext1->full_backref > ext2->full_backref)
220                 return 1;
221         if (ext1->full_backref < ext2->full_backref)
222                 return -1;
223
224         if (ext1->is_data)
225                 return compare_data_backref(node1, node2);
226         else
227                 return compare_tree_backref(node1, node2);
228 }
229
230 /* Explicit initialization for extent_record::flag_block_full_backref */
231 enum { FLAG_UNSET = 2 };
232
233 struct extent_record {
234         struct list_head backrefs;
235         struct list_head dups;
236         struct rb_root backref_tree;
237         struct list_head list;
238         struct cache_extent cache;
239         struct btrfs_disk_key parent_key;
240         u64 start;
241         u64 max_size;
242         u64 nr;
243         u64 refs;
244         u64 extent_item_refs;
245         u64 generation;
246         u64 parent_generation;
247         u64 info_objectid;
248         u32 num_duplicates;
249         u8 info_level;
250         unsigned int flag_block_full_backref:2;
251         unsigned int found_rec:1;
252         unsigned int content_checked:1;
253         unsigned int owner_ref_checked:1;
254         unsigned int is_root:1;
255         unsigned int metadata:1;
256         unsigned int bad_full_backref:1;
257         unsigned int crossing_stripes:1;
258         unsigned int wrong_chunk_type:1;
259 };
260
261 static inline struct extent_record* to_extent_record(struct list_head *entry)
262 {
263         return container_of(entry, struct extent_record, list);
264 }
265
266 struct inode_backref {
267         struct list_head list;
268         unsigned int found_dir_item:1;
269         unsigned int found_dir_index:1;
270         unsigned int found_inode_ref:1;
271         unsigned int filetype:8;
272         int errors;
273         unsigned int ref_type;
274         u64 dir;
275         u64 index;
276         u16 namelen;
277         char name[0];
278 };
279
280 static inline struct inode_backref* to_inode_backref(struct list_head *entry)
281 {
282         return list_entry(entry, struct inode_backref, list);
283 }
284
285 struct root_item_record {
286         struct list_head list;
287         u64 objectid;
288         u64 bytenr;
289         u64 last_snapshot;
290         u8 level;
291         u8 drop_level;
292         int level_size;
293         struct btrfs_key drop_key;
294 };
295
296 #define REF_ERR_NO_DIR_ITEM             (1 << 0)
297 #define REF_ERR_NO_DIR_INDEX            (1 << 1)
298 #define REF_ERR_NO_INODE_REF            (1 << 2)
299 #define REF_ERR_DUP_DIR_ITEM            (1 << 3)
300 #define REF_ERR_DUP_DIR_INDEX           (1 << 4)
301 #define REF_ERR_DUP_INODE_REF           (1 << 5)
302 #define REF_ERR_INDEX_UNMATCH           (1 << 6)
303 #define REF_ERR_FILETYPE_UNMATCH        (1 << 7)
304 #define REF_ERR_NAME_TOO_LONG           (1 << 8) // 100
305 #define REF_ERR_NO_ROOT_REF             (1 << 9)
306 #define REF_ERR_NO_ROOT_BACKREF         (1 << 10)
307 #define REF_ERR_DUP_ROOT_REF            (1 << 11)
308 #define REF_ERR_DUP_ROOT_BACKREF        (1 << 12)
309
310 struct file_extent_hole {
311         struct rb_node node;
312         u64 start;
313         u64 len;
314 };
315
316 struct inode_record {
317         struct list_head backrefs;
318         unsigned int checked:1;
319         unsigned int merging:1;
320         unsigned int found_inode_item:1;
321         unsigned int found_dir_item:1;
322         unsigned int found_file_extent:1;
323         unsigned int found_csum_item:1;
324         unsigned int some_csum_missing:1;
325         unsigned int nodatasum:1;
326         int errors;
327
328         u64 ino;
329         u32 nlink;
330         u32 imode;
331         u64 isize;
332         u64 nbytes;
333
334         u32 found_link;
335         u64 found_size;
336         u64 extent_start;
337         u64 extent_end;
338         struct rb_root holes;
339         struct list_head orphan_extents;
340
341         u32 refs;
342 };
343
344 #define I_ERR_NO_INODE_ITEM             (1 << 0)
345 #define I_ERR_NO_ORPHAN_ITEM            (1 << 1)
346 #define I_ERR_DUP_INODE_ITEM            (1 << 2)
347 #define I_ERR_DUP_DIR_INDEX             (1 << 3)
348 #define I_ERR_ODD_DIR_ITEM              (1 << 4)
349 #define I_ERR_ODD_FILE_EXTENT           (1 << 5)
350 #define I_ERR_BAD_FILE_EXTENT           (1 << 6)
351 #define I_ERR_FILE_EXTENT_OVERLAP       (1 << 7)
352 #define I_ERR_FILE_EXTENT_DISCOUNT      (1 << 8) // 100
353 #define I_ERR_DIR_ISIZE_WRONG           (1 << 9)
354 #define I_ERR_FILE_NBYTES_WRONG         (1 << 10) // 400
355 #define I_ERR_ODD_CSUM_ITEM             (1 << 11)
356 #define I_ERR_SOME_CSUM_MISSING         (1 << 12)
357 #define I_ERR_LINK_COUNT_WRONG          (1 << 13)
358 #define I_ERR_FILE_EXTENT_ORPHAN        (1 << 14)
359
360 struct root_backref {
361         struct list_head list;
362         unsigned int found_dir_item:1;
363         unsigned int found_dir_index:1;
364         unsigned int found_back_ref:1;
365         unsigned int found_forward_ref:1;
366         unsigned int reachable:1;
367         int errors;
368         u64 ref_root;
369         u64 dir;
370         u64 index;
371         u16 namelen;
372         char name[0];
373 };
374
375 static inline struct root_backref* to_root_backref(struct list_head *entry)
376 {
377         return list_entry(entry, struct root_backref, list);
378 }
379
380 struct root_record {
381         struct list_head backrefs;
382         struct cache_extent cache;
383         unsigned int found_root_item:1;
384         u64 objectid;
385         u32 found_ref;
386 };
387
388 struct ptr_node {
389         struct cache_extent cache;
390         void *data;
391 };
392
393 struct shared_node {
394         struct cache_extent cache;
395         struct cache_tree root_cache;
396         struct cache_tree inode_cache;
397         struct inode_record *current;
398         u32 refs;
399 };
400
401 struct block_info {
402         u64 start;
403         u32 size;
404 };
405
406 struct walk_control {
407         struct cache_tree shared;
408         struct shared_node *nodes[BTRFS_MAX_LEVEL];
409         int active_node;
410         int root_level;
411 };
412
413 struct bad_item {
414         struct btrfs_key key;
415         u64 root_id;
416         struct list_head list;
417 };
418
419 struct extent_entry {
420         u64 bytenr;
421         u64 bytes;
422         int count;
423         int broken;
424         struct list_head list;
425 };
426
427 struct root_item_info {
428         /* level of the root */
429         u8 level;
430         /* number of nodes at this level, must be 1 for a root */
431         int node_count;
432         u64 bytenr;
433         u64 gen;
434         struct cache_extent cache_extent;
435 };
436
437 /*
438  * Error bit for low memory mode check.
439  *
440  * Currently no caller cares about it yet.  Just internal use for error
441  * classification.
442  */
443 #define BACKREF_MISSING         (1 << 0) /* Backref missing in extent tree */
444 #define BACKREF_MISMATCH        (1 << 1) /* Backref exists but does not match */
445 #define BYTES_UNALIGNED         (1 << 2) /* Some bytes are not aligned */
446
447 static void *print_status_check(void *p)
448 {
449         struct task_ctx *priv = p;
450         const char work_indicator[] = { '.', 'o', 'O', 'o' };
451         uint32_t count = 0;
452         static char *task_position_string[] = {
453                 "checking extents",
454                 "checking free space cache",
455                 "checking fs roots",
456         };
457
458         task_period_start(priv->info, 1000 /* 1s */);
459
460         if (priv->tp == TASK_NOTHING)
461                 return NULL;
462
463         while (1) {
464                 printf("%s [%c]\r", task_position_string[priv->tp],
465                                 work_indicator[count % 4]);
466                 count++;
467                 fflush(stdout);
468                 task_period_wait(priv->info);
469         }
470         return NULL;
471 }
472
473 static int print_status_return(void *p)
474 {
475         printf("\n");
476         fflush(stdout);
477
478         return 0;
479 }
480
481 /* Compatible function to allow reuse of old codes */
482 static u64 first_extent_gap(struct rb_root *holes)
483 {
484         struct file_extent_hole *hole;
485
486         if (RB_EMPTY_ROOT(holes))
487                 return (u64)-1;
488
489         hole = rb_entry(rb_first(holes), struct file_extent_hole, node);
490         return hole->start;
491 }
492
493 static int compare_hole(struct rb_node *node1, struct rb_node *node2)
494 {
495         struct file_extent_hole *hole1;
496         struct file_extent_hole *hole2;
497
498         hole1 = rb_entry(node1, struct file_extent_hole, node);
499         hole2 = rb_entry(node2, struct file_extent_hole, node);
500
501         if (hole1->start > hole2->start)
502                 return -1;
503         if (hole1->start < hole2->start)
504                 return 1;
505         /* Now hole1->start == hole2->start */
506         if (hole1->len >= hole2->len)
507                 /*
508                  * Hole 1 will be merge center
509                  * Same hole will be merged later
510                  */
511                 return -1;
512         /* Hole 2 will be merge center */
513         return 1;
514 }
515
516 /*
517  * Add a hole to the record
518  *
519  * This will do hole merge for copy_file_extent_holes(),
520  * which will ensure there won't be continuous holes.
521  */
522 static int add_file_extent_hole(struct rb_root *holes,
523                                 u64 start, u64 len)
524 {
525         struct file_extent_hole *hole;
526         struct file_extent_hole *prev = NULL;
527         struct file_extent_hole *next = NULL;
528
529         hole = malloc(sizeof(*hole));
530         if (!hole)
531                 return -ENOMEM;
532         hole->start = start;
533         hole->len = len;
534         /* Since compare will not return 0, no -EEXIST will happen */
535         rb_insert(holes, &hole->node, compare_hole);
536
537         /* simple merge with previous hole */
538         if (rb_prev(&hole->node))
539                 prev = rb_entry(rb_prev(&hole->node), struct file_extent_hole,
540                                 node);
541         if (prev && prev->start + prev->len >= hole->start) {
542                 hole->len = hole->start + hole->len - prev->start;
543                 hole->start = prev->start;
544                 rb_erase(&prev->node, holes);
545                 free(prev);
546                 prev = NULL;
547         }
548
549         /* iterate merge with next holes */
550         while (1) {
551                 if (!rb_next(&hole->node))
552                         break;
553                 next = rb_entry(rb_next(&hole->node), struct file_extent_hole,
554                                         node);
555                 if (hole->start + hole->len >= next->start) {
556                         if (hole->start + hole->len <= next->start + next->len)
557                                 hole->len = next->start + next->len -
558                                             hole->start;
559                         rb_erase(&next->node, holes);
560                         free(next);
561                         next = NULL;
562                 } else
563                         break;
564         }
565         return 0;
566 }
567
568 static int compare_hole_range(struct rb_node *node, void *data)
569 {
570         struct file_extent_hole *hole;
571         u64 start;
572
573         hole = (struct file_extent_hole *)data;
574         start = hole->start;
575
576         hole = rb_entry(node, struct file_extent_hole, node);
577         if (start < hole->start)
578                 return -1;
579         if (start >= hole->start && start < hole->start + hole->len)
580                 return 0;
581         return 1;
582 }
583
584 /*
585  * Delete a hole in the record
586  *
587  * This will do the hole split and is much restrict than add.
588  */
589 static int del_file_extent_hole(struct rb_root *holes,
590                                 u64 start, u64 len)
591 {
592         struct file_extent_hole *hole;
593         struct file_extent_hole tmp;
594         u64 prev_start = 0;
595         u64 prev_len = 0;
596         u64 next_start = 0;
597         u64 next_len = 0;
598         struct rb_node *node;
599         int have_prev = 0;
600         int have_next = 0;
601         int ret = 0;
602
603         tmp.start = start;
604         tmp.len = len;
605         node = rb_search(holes, &tmp, compare_hole_range, NULL);
606         if (!node)
607                 return -EEXIST;
608         hole = rb_entry(node, struct file_extent_hole, node);
609         if (start + len > hole->start + hole->len)
610                 return -EEXIST;
611
612         /*
613          * Now there will be no overlap, delete the hole and re-add the
614          * split(s) if they exists.
615          */
616         if (start > hole->start) {
617                 prev_start = hole->start;
618                 prev_len = start - hole->start;
619                 have_prev = 1;
620         }
621         if (hole->start + hole->len > start + len) {
622                 next_start = start + len;
623                 next_len = hole->start + hole->len - start - len;
624                 have_next = 1;
625         }
626         rb_erase(node, holes);
627         free(hole);
628         if (have_prev) {
629                 ret = add_file_extent_hole(holes, prev_start, prev_len);
630                 if (ret < 0)
631                         return ret;
632         }
633         if (have_next) {
634                 ret = add_file_extent_hole(holes, next_start, next_len);
635                 if (ret < 0)
636                         return ret;
637         }
638         return 0;
639 }
640
641 static int copy_file_extent_holes(struct rb_root *dst,
642                                   struct rb_root *src)
643 {
644         struct file_extent_hole *hole;
645         struct rb_node *node;
646         int ret = 0;
647
648         node = rb_first(src);
649         while (node) {
650                 hole = rb_entry(node, struct file_extent_hole, node);
651                 ret = add_file_extent_hole(dst, hole->start, hole->len);
652                 if (ret)
653                         break;
654                 node = rb_next(node);
655         }
656         return ret;
657 }
658
659 static void free_file_extent_holes(struct rb_root *holes)
660 {
661         struct rb_node *node;
662         struct file_extent_hole *hole;
663
664         node = rb_first(holes);
665         while (node) {
666                 hole = rb_entry(node, struct file_extent_hole, node);
667                 rb_erase(node, holes);
668                 free(hole);
669                 node = rb_first(holes);
670         }
671 }
672
673 static void reset_cached_block_groups(struct btrfs_fs_info *fs_info);
674
675 static void record_root_in_trans(struct btrfs_trans_handle *trans,
676                                  struct btrfs_root *root)
677 {
678         if (root->last_trans != trans->transid) {
679                 root->track_dirty = 1;
680                 root->last_trans = trans->transid;
681                 root->commit_root = root->node;
682                 extent_buffer_get(root->node);
683         }
684 }
685
686 static u8 imode_to_type(u32 imode)
687 {
688 #define S_SHIFT 12
689         static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
690                 [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
691                 [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
692                 [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
693                 [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
694                 [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
695                 [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
696                 [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
697         };
698
699         return btrfs_type_by_mode[(imode & S_IFMT) >> S_SHIFT];
700 #undef S_SHIFT
701 }
702
703 static int device_record_compare(struct rb_node *node1, struct rb_node *node2)
704 {
705         struct device_record *rec1;
706         struct device_record *rec2;
707
708         rec1 = rb_entry(node1, struct device_record, node);
709         rec2 = rb_entry(node2, struct device_record, node);
710         if (rec1->devid > rec2->devid)
711                 return -1;
712         else if (rec1->devid < rec2->devid)
713                 return 1;
714         else
715                 return 0;
716 }
717
718 static struct inode_record *clone_inode_rec(struct inode_record *orig_rec)
719 {
720         struct inode_record *rec;
721         struct inode_backref *backref;
722         struct inode_backref *orig;
723         struct inode_backref *tmp;
724         struct orphan_data_extent *src_orphan;
725         struct orphan_data_extent *dst_orphan;
726         size_t size;
727         int ret;
728
729         rec = malloc(sizeof(*rec));
730         if (!rec)
731                 return ERR_PTR(-ENOMEM);
732         memcpy(rec, orig_rec, sizeof(*rec));
733         rec->refs = 1;
734         INIT_LIST_HEAD(&rec->backrefs);
735         INIT_LIST_HEAD(&rec->orphan_extents);
736         rec->holes = RB_ROOT;
737
738         list_for_each_entry(orig, &orig_rec->backrefs, list) {
739                 size = sizeof(*orig) + orig->namelen + 1;
740                 backref = malloc(size);
741                 if (!backref) {
742                         ret = -ENOMEM;
743                         goto cleanup;
744                 }
745                 memcpy(backref, orig, size);
746                 list_add_tail(&backref->list, &rec->backrefs);
747         }
748         list_for_each_entry(src_orphan, &orig_rec->orphan_extents, list) {
749                 dst_orphan = malloc(sizeof(*dst_orphan));
750                 if (!dst_orphan) {
751                         ret = -ENOMEM;
752                         goto cleanup;
753                 }
754                 memcpy(dst_orphan, src_orphan, sizeof(*src_orphan));
755                 list_add_tail(&dst_orphan->list, &rec->orphan_extents);
756         }
757         ret = copy_file_extent_holes(&rec->holes, &orig_rec->holes);
758         BUG_ON(ret < 0);
759
760         return rec;
761
762 cleanup:
763         if (!list_empty(&rec->backrefs))
764                 list_for_each_entry_safe(orig, tmp, &rec->backrefs, list) {
765                         list_del(&orig->list);
766                         free(orig);
767                 }
768
769         if (!list_empty(&rec->orphan_extents))
770                 list_for_each_entry_safe(orig, tmp, &rec->orphan_extents, list) {
771                         list_del(&orig->list);
772                         free(orig);
773                 }
774
775         free(rec);
776
777         return ERR_PTR(ret);
778 }
779
780 static void print_orphan_data_extents(struct list_head *orphan_extents,
781                                       u64 objectid)
782 {
783         struct orphan_data_extent *orphan;
784
785         if (list_empty(orphan_extents))
786                 return;
787         printf("The following data extent is lost in tree %llu:\n",
788                objectid);
789         list_for_each_entry(orphan, orphan_extents, list) {
790                 printf("\tinode: %llu, offset:%llu, disk_bytenr: %llu, disk_len: %llu\n",
791                        orphan->objectid, orphan->offset, orphan->disk_bytenr,
792                        orphan->disk_len);
793         }
794 }
795
796 static void print_inode_error(struct btrfs_root *root, struct inode_record *rec)
797 {
798         u64 root_objectid = root->root_key.objectid;
799         int errors = rec->errors;
800
801         if (!errors)
802                 return;
803         /* reloc root errors, we print its corresponding fs root objectid*/
804         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
805                 root_objectid = root->root_key.offset;
806                 fprintf(stderr, "reloc");
807         }
808         fprintf(stderr, "root %llu inode %llu errors %x",
809                 (unsigned long long) root_objectid,
810                 (unsigned long long) rec->ino, rec->errors);
811
812         if (errors & I_ERR_NO_INODE_ITEM)
813                 fprintf(stderr, ", no inode item");
814         if (errors & I_ERR_NO_ORPHAN_ITEM)
815                 fprintf(stderr, ", no orphan item");
816         if (errors & I_ERR_DUP_INODE_ITEM)
817                 fprintf(stderr, ", dup inode item");
818         if (errors & I_ERR_DUP_DIR_INDEX)
819                 fprintf(stderr, ", dup dir index");
820         if (errors & I_ERR_ODD_DIR_ITEM)
821                 fprintf(stderr, ", odd dir item");
822         if (errors & I_ERR_ODD_FILE_EXTENT)
823                 fprintf(stderr, ", odd file extent");
824         if (errors & I_ERR_BAD_FILE_EXTENT)
825                 fprintf(stderr, ", bad file extent");
826         if (errors & I_ERR_FILE_EXTENT_OVERLAP)
827                 fprintf(stderr, ", file extent overlap");
828         if (errors & I_ERR_FILE_EXTENT_DISCOUNT)
829                 fprintf(stderr, ", file extent discount");
830         if (errors & I_ERR_DIR_ISIZE_WRONG)
831                 fprintf(stderr, ", dir isize wrong");
832         if (errors & I_ERR_FILE_NBYTES_WRONG)
833                 fprintf(stderr, ", nbytes wrong");
834         if (errors & I_ERR_ODD_CSUM_ITEM)
835                 fprintf(stderr, ", odd csum item");
836         if (errors & I_ERR_SOME_CSUM_MISSING)
837                 fprintf(stderr, ", some csum missing");
838         if (errors & I_ERR_LINK_COUNT_WRONG)
839                 fprintf(stderr, ", link count wrong");
840         if (errors & I_ERR_FILE_EXTENT_ORPHAN)
841                 fprintf(stderr, ", orphan file extent");
842         fprintf(stderr, "\n");
843         /* Print the orphan extents if needed */
844         if (errors & I_ERR_FILE_EXTENT_ORPHAN)
845                 print_orphan_data_extents(&rec->orphan_extents, root->objectid);
846
847         /* Print the holes if needed */
848         if (errors & I_ERR_FILE_EXTENT_DISCOUNT) {
849                 struct file_extent_hole *hole;
850                 struct rb_node *node;
851                 int found = 0;
852
853                 node = rb_first(&rec->holes);
854                 fprintf(stderr, "Found file extent holes:\n");
855                 while (node) {
856                         found = 1;
857                         hole = rb_entry(node, struct file_extent_hole, node);
858                         fprintf(stderr, "\tstart: %llu, len: %llu\n",
859                                 hole->start, hole->len);
860                         node = rb_next(node);
861                 }
862                 if (!found)
863                         fprintf(stderr, "\tstart: 0, len: %llu\n",
864                                 round_up(rec->isize, root->sectorsize));
865         }
866 }
867
868 static void print_ref_error(int errors)
869 {
870         if (errors & REF_ERR_NO_DIR_ITEM)
871                 fprintf(stderr, ", no dir item");
872         if (errors & REF_ERR_NO_DIR_INDEX)
873                 fprintf(stderr, ", no dir index");
874         if (errors & REF_ERR_NO_INODE_REF)
875                 fprintf(stderr, ", no inode ref");
876         if (errors & REF_ERR_DUP_DIR_ITEM)
877                 fprintf(stderr, ", dup dir item");
878         if (errors & REF_ERR_DUP_DIR_INDEX)
879                 fprintf(stderr, ", dup dir index");
880         if (errors & REF_ERR_DUP_INODE_REF)
881                 fprintf(stderr, ", dup inode ref");
882         if (errors & REF_ERR_INDEX_UNMATCH)
883                 fprintf(stderr, ", index mismatch");
884         if (errors & REF_ERR_FILETYPE_UNMATCH)
885                 fprintf(stderr, ", filetype mismatch");
886         if (errors & REF_ERR_NAME_TOO_LONG)
887                 fprintf(stderr, ", name too long");
888         if (errors & REF_ERR_NO_ROOT_REF)
889                 fprintf(stderr, ", no root ref");
890         if (errors & REF_ERR_NO_ROOT_BACKREF)
891                 fprintf(stderr, ", no root backref");
892         if (errors & REF_ERR_DUP_ROOT_REF)
893                 fprintf(stderr, ", dup root ref");
894         if (errors & REF_ERR_DUP_ROOT_BACKREF)
895                 fprintf(stderr, ", dup root backref");
896         fprintf(stderr, "\n");
897 }
898
899 static struct inode_record *get_inode_rec(struct cache_tree *inode_cache,
900                                           u64 ino, int mod)
901 {
902         struct ptr_node *node;
903         struct cache_extent *cache;
904         struct inode_record *rec = NULL;
905         int ret;
906
907         cache = lookup_cache_extent(inode_cache, ino, 1);
908         if (cache) {
909                 node = container_of(cache, struct ptr_node, cache);
910                 rec = node->data;
911                 if (mod && rec->refs > 1) {
912                         node->data = clone_inode_rec(rec);
913                         if (IS_ERR(node->data))
914                                 return node->data;
915                         rec->refs--;
916                         rec = node->data;
917                 }
918         } else if (mod) {
919                 rec = calloc(1, sizeof(*rec));
920                 if (!rec)
921                         return ERR_PTR(-ENOMEM);
922                 rec->ino = ino;
923                 rec->extent_start = (u64)-1;
924                 rec->refs = 1;
925                 INIT_LIST_HEAD(&rec->backrefs);
926                 INIT_LIST_HEAD(&rec->orphan_extents);
927                 rec->holes = RB_ROOT;
928
929                 node = malloc(sizeof(*node));
930                 if (!node) {
931                         free(rec);
932                         return ERR_PTR(-ENOMEM);
933                 }
934                 node->cache.start = ino;
935                 node->cache.size = 1;
936                 node->data = rec;
937
938                 if (ino == BTRFS_FREE_INO_OBJECTID)
939                         rec->found_link = 1;
940
941                 ret = insert_cache_extent(inode_cache, &node->cache);
942                 if (ret)
943                         return ERR_PTR(-EEXIST);
944         }
945         return rec;
946 }
947
948 static void free_orphan_data_extents(struct list_head *orphan_extents)
949 {
950         struct orphan_data_extent *orphan;
951
952         while (!list_empty(orphan_extents)) {
953                 orphan = list_entry(orphan_extents->next,
954                                     struct orphan_data_extent, list);
955                 list_del(&orphan->list);
956                 free(orphan);
957         }
958 }
959
960 static void free_inode_rec(struct inode_record *rec)
961 {
962         struct inode_backref *backref;
963
964         if (--rec->refs > 0)
965                 return;
966
967         while (!list_empty(&rec->backrefs)) {
968                 backref = to_inode_backref(rec->backrefs.next);
969                 list_del(&backref->list);
970                 free(backref);
971         }
972         free_orphan_data_extents(&rec->orphan_extents);
973         free_file_extent_holes(&rec->holes);
974         free(rec);
975 }
976
977 static int can_free_inode_rec(struct inode_record *rec)
978 {
979         if (!rec->errors && rec->checked && rec->found_inode_item &&
980             rec->nlink == rec->found_link && list_empty(&rec->backrefs))
981                 return 1;
982         return 0;
983 }
984
985 static void maybe_free_inode_rec(struct cache_tree *inode_cache,
986                                  struct inode_record *rec)
987 {
988         struct cache_extent *cache;
989         struct inode_backref *tmp, *backref;
990         struct ptr_node *node;
991         unsigned char filetype;
992
993         if (!rec->found_inode_item)
994                 return;
995
996         filetype = imode_to_type(rec->imode);
997         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
998                 if (backref->found_dir_item && backref->found_dir_index) {
999                         if (backref->filetype != filetype)
1000                                 backref->errors |= REF_ERR_FILETYPE_UNMATCH;
1001                         if (!backref->errors && backref->found_inode_ref &&
1002                             rec->nlink == rec->found_link) {
1003                                 list_del(&backref->list);
1004                                 free(backref);
1005                         }
1006                 }
1007         }
1008
1009         if (!rec->checked || rec->merging)
1010                 return;
1011
1012         if (S_ISDIR(rec->imode)) {
1013                 if (rec->found_size != rec->isize)
1014                         rec->errors |= I_ERR_DIR_ISIZE_WRONG;
1015                 if (rec->found_file_extent)
1016                         rec->errors |= I_ERR_ODD_FILE_EXTENT;
1017         } else if (S_ISREG(rec->imode) || S_ISLNK(rec->imode)) {
1018                 if (rec->found_dir_item)
1019                         rec->errors |= I_ERR_ODD_DIR_ITEM;
1020                 if (rec->found_size != rec->nbytes)
1021                         rec->errors |= I_ERR_FILE_NBYTES_WRONG;
1022                 if (rec->nlink > 0 && !no_holes &&
1023                     (rec->extent_end < rec->isize ||
1024                      first_extent_gap(&rec->holes) < rec->isize))
1025                         rec->errors |= I_ERR_FILE_EXTENT_DISCOUNT;
1026         }
1027
1028         if (S_ISREG(rec->imode) || S_ISLNK(rec->imode)) {
1029                 if (rec->found_csum_item && rec->nodatasum)
1030                         rec->errors |= I_ERR_ODD_CSUM_ITEM;
1031                 if (rec->some_csum_missing && !rec->nodatasum)
1032                         rec->errors |= I_ERR_SOME_CSUM_MISSING;
1033         }
1034
1035         BUG_ON(rec->refs != 1);
1036         if (can_free_inode_rec(rec)) {
1037                 cache = lookup_cache_extent(inode_cache, rec->ino, 1);
1038                 node = container_of(cache, struct ptr_node, cache);
1039                 BUG_ON(node->data != rec);
1040                 remove_cache_extent(inode_cache, &node->cache);
1041                 free(node);
1042                 free_inode_rec(rec);
1043         }
1044 }
1045
1046 static int check_orphan_item(struct btrfs_root *root, u64 ino)
1047 {
1048         struct btrfs_path path;
1049         struct btrfs_key key;
1050         int ret;
1051
1052         key.objectid = BTRFS_ORPHAN_OBJECTID;
1053         key.type = BTRFS_ORPHAN_ITEM_KEY;
1054         key.offset = ino;
1055
1056         btrfs_init_path(&path);
1057         ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
1058         btrfs_release_path(&path);
1059         if (ret > 0)
1060                 ret = -ENOENT;
1061         return ret;
1062 }
1063
1064 static int process_inode_item(struct extent_buffer *eb,
1065                               int slot, struct btrfs_key *key,
1066                               struct shared_node *active_node)
1067 {
1068         struct inode_record *rec;
1069         struct btrfs_inode_item *item;
1070
1071         rec = active_node->current;
1072         BUG_ON(rec->ino != key->objectid || rec->refs > 1);
1073         if (rec->found_inode_item) {
1074                 rec->errors |= I_ERR_DUP_INODE_ITEM;
1075                 return 1;
1076         }
1077         item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
1078         rec->nlink = btrfs_inode_nlink(eb, item);
1079         rec->isize = btrfs_inode_size(eb, item);
1080         rec->nbytes = btrfs_inode_nbytes(eb, item);
1081         rec->imode = btrfs_inode_mode(eb, item);
1082         if (btrfs_inode_flags(eb, item) & BTRFS_INODE_NODATASUM)
1083                 rec->nodatasum = 1;
1084         rec->found_inode_item = 1;
1085         if (rec->nlink == 0)
1086                 rec->errors |= I_ERR_NO_ORPHAN_ITEM;
1087         maybe_free_inode_rec(&active_node->inode_cache, rec);
1088         return 0;
1089 }
1090
1091 static struct inode_backref *get_inode_backref(struct inode_record *rec,
1092                                                 const char *name,
1093                                                 int namelen, u64 dir)
1094 {
1095         struct inode_backref *backref;
1096
1097         list_for_each_entry(backref, &rec->backrefs, list) {
1098                 if (rec->ino == BTRFS_MULTIPLE_OBJECTIDS)
1099                         break;
1100                 if (backref->dir != dir || backref->namelen != namelen)
1101                         continue;
1102                 if (memcmp(name, backref->name, namelen))
1103                         continue;
1104                 return backref;
1105         }
1106
1107         backref = malloc(sizeof(*backref) + namelen + 1);
1108         if (!backref)
1109                 return NULL;
1110         memset(backref, 0, sizeof(*backref));
1111         backref->dir = dir;
1112         backref->namelen = namelen;
1113         memcpy(backref->name, name, namelen);
1114         backref->name[namelen] = '\0';
1115         list_add_tail(&backref->list, &rec->backrefs);
1116         return backref;
1117 }
1118
1119 static int add_inode_backref(struct cache_tree *inode_cache,
1120                              u64 ino, u64 dir, u64 index,
1121                              const char *name, int namelen,
1122                              int filetype, int itemtype, int errors)
1123 {
1124         struct inode_record *rec;
1125         struct inode_backref *backref;
1126
1127         rec = get_inode_rec(inode_cache, ino, 1);
1128         BUG_ON(IS_ERR(rec));
1129         backref = get_inode_backref(rec, name, namelen, dir);
1130         BUG_ON(!backref);
1131         if (errors)
1132                 backref->errors |= errors;
1133         if (itemtype == BTRFS_DIR_INDEX_KEY) {
1134                 if (backref->found_dir_index)
1135                         backref->errors |= REF_ERR_DUP_DIR_INDEX;
1136                 if (backref->found_inode_ref && backref->index != index)
1137                         backref->errors |= REF_ERR_INDEX_UNMATCH;
1138                 if (backref->found_dir_item && backref->filetype != filetype)
1139                         backref->errors |= REF_ERR_FILETYPE_UNMATCH;
1140
1141                 backref->index = index;
1142                 backref->filetype = filetype;
1143                 backref->found_dir_index = 1;
1144         } else if (itemtype == BTRFS_DIR_ITEM_KEY) {
1145                 rec->found_link++;
1146                 if (backref->found_dir_item)
1147                         backref->errors |= REF_ERR_DUP_DIR_ITEM;
1148                 if (backref->found_dir_index && backref->filetype != filetype)
1149                         backref->errors |= REF_ERR_FILETYPE_UNMATCH;
1150
1151                 backref->filetype = filetype;
1152                 backref->found_dir_item = 1;
1153         } else if ((itemtype == BTRFS_INODE_REF_KEY) ||
1154                    (itemtype == BTRFS_INODE_EXTREF_KEY)) {
1155                 if (backref->found_inode_ref)
1156                         backref->errors |= REF_ERR_DUP_INODE_REF;
1157                 if (backref->found_dir_index && backref->index != index)
1158                         backref->errors |= REF_ERR_INDEX_UNMATCH;
1159                 else
1160                         backref->index = index;
1161
1162                 backref->ref_type = itemtype;
1163                 backref->found_inode_ref = 1;
1164         } else {
1165                 BUG_ON(1);
1166         }
1167
1168         maybe_free_inode_rec(inode_cache, rec);
1169         return 0;
1170 }
1171
1172 static int merge_inode_recs(struct inode_record *src, struct inode_record *dst,
1173                             struct cache_tree *dst_cache)
1174 {
1175         struct inode_backref *backref;
1176         u32 dir_count = 0;
1177         int ret = 0;
1178
1179         dst->merging = 1;
1180         list_for_each_entry(backref, &src->backrefs, list) {
1181                 if (backref->found_dir_index) {
1182                         add_inode_backref(dst_cache, dst->ino, backref->dir,
1183                                         backref->index, backref->name,
1184                                         backref->namelen, backref->filetype,
1185                                         BTRFS_DIR_INDEX_KEY, backref->errors);
1186                 }
1187                 if (backref->found_dir_item) {
1188                         dir_count++;
1189                         add_inode_backref(dst_cache, dst->ino,
1190                                         backref->dir, 0, backref->name,
1191                                         backref->namelen, backref->filetype,
1192                                         BTRFS_DIR_ITEM_KEY, backref->errors);
1193                 }
1194                 if (backref->found_inode_ref) {
1195                         add_inode_backref(dst_cache, dst->ino,
1196                                         backref->dir, backref->index,
1197                                         backref->name, backref->namelen, 0,
1198                                         backref->ref_type, backref->errors);
1199                 }
1200         }
1201
1202         if (src->found_dir_item)
1203                 dst->found_dir_item = 1;
1204         if (src->found_file_extent)
1205                 dst->found_file_extent = 1;
1206         if (src->found_csum_item)
1207                 dst->found_csum_item = 1;
1208         if (src->some_csum_missing)
1209                 dst->some_csum_missing = 1;
1210         if (first_extent_gap(&dst->holes) > first_extent_gap(&src->holes)) {
1211                 ret = copy_file_extent_holes(&dst->holes, &src->holes);
1212                 if (ret < 0)
1213                         return ret;
1214         }
1215
1216         BUG_ON(src->found_link < dir_count);
1217         dst->found_link += src->found_link - dir_count;
1218         dst->found_size += src->found_size;
1219         if (src->extent_start != (u64)-1) {
1220                 if (dst->extent_start == (u64)-1) {
1221                         dst->extent_start = src->extent_start;
1222                         dst->extent_end = src->extent_end;
1223                 } else {
1224                         if (dst->extent_end > src->extent_start)
1225                                 dst->errors |= I_ERR_FILE_EXTENT_OVERLAP;
1226                         else if (dst->extent_end < src->extent_start) {
1227                                 ret = add_file_extent_hole(&dst->holes,
1228                                         dst->extent_end,
1229                                         src->extent_start - dst->extent_end);
1230                         }
1231                         if (dst->extent_end < src->extent_end)
1232                                 dst->extent_end = src->extent_end;
1233                 }
1234         }
1235
1236         dst->errors |= src->errors;
1237         if (src->found_inode_item) {
1238                 if (!dst->found_inode_item) {
1239                         dst->nlink = src->nlink;
1240                         dst->isize = src->isize;
1241                         dst->nbytes = src->nbytes;
1242                         dst->imode = src->imode;
1243                         dst->nodatasum = src->nodatasum;
1244                         dst->found_inode_item = 1;
1245                 } else {
1246                         dst->errors |= I_ERR_DUP_INODE_ITEM;
1247                 }
1248         }
1249         dst->merging = 0;
1250
1251         return 0;
1252 }
1253
1254 static int splice_shared_node(struct shared_node *src_node,
1255                               struct shared_node *dst_node)
1256 {
1257         struct cache_extent *cache;
1258         struct ptr_node *node, *ins;
1259         struct cache_tree *src, *dst;
1260         struct inode_record *rec, *conflict;
1261         u64 current_ino = 0;
1262         int splice = 0;
1263         int ret;
1264
1265         if (--src_node->refs == 0)
1266                 splice = 1;
1267         if (src_node->current)
1268                 current_ino = src_node->current->ino;
1269
1270         src = &src_node->root_cache;
1271         dst = &dst_node->root_cache;
1272 again:
1273         cache = search_cache_extent(src, 0);
1274         while (cache) {
1275                 node = container_of(cache, struct ptr_node, cache);
1276                 rec = node->data;
1277                 cache = next_cache_extent(cache);
1278
1279                 if (splice) {
1280                         remove_cache_extent(src, &node->cache);
1281                         ins = node;
1282                 } else {
1283                         ins = malloc(sizeof(*ins));
1284                         BUG_ON(!ins);
1285                         ins->cache.start = node->cache.start;
1286                         ins->cache.size = node->cache.size;
1287                         ins->data = rec;
1288                         rec->refs++;
1289                 }
1290                 ret = insert_cache_extent(dst, &ins->cache);
1291                 if (ret == -EEXIST) {
1292                         conflict = get_inode_rec(dst, rec->ino, 1);
1293                         BUG_ON(IS_ERR(conflict));
1294                         merge_inode_recs(rec, conflict, dst);
1295                         if (rec->checked) {
1296                                 conflict->checked = 1;
1297                                 if (dst_node->current == conflict)
1298                                         dst_node->current = NULL;
1299                         }
1300                         maybe_free_inode_rec(dst, conflict);
1301                         free_inode_rec(rec);
1302                         free(ins);
1303                 } else {
1304                         BUG_ON(ret);
1305                 }
1306         }
1307
1308         if (src == &src_node->root_cache) {
1309                 src = &src_node->inode_cache;
1310                 dst = &dst_node->inode_cache;
1311                 goto again;
1312         }
1313
1314         if (current_ino > 0 && (!dst_node->current ||
1315             current_ino > dst_node->current->ino)) {
1316                 if (dst_node->current) {
1317                         dst_node->current->checked = 1;
1318                         maybe_free_inode_rec(dst, dst_node->current);
1319                 }
1320                 dst_node->current = get_inode_rec(dst, current_ino, 1);
1321                 BUG_ON(IS_ERR(dst_node->current));
1322         }
1323         return 0;
1324 }
1325
1326 static void free_inode_ptr(struct cache_extent *cache)
1327 {
1328         struct ptr_node *node;
1329         struct inode_record *rec;
1330
1331         node = container_of(cache, struct ptr_node, cache);
1332         rec = node->data;
1333         free_inode_rec(rec);
1334         free(node);
1335 }
1336
1337 FREE_EXTENT_CACHE_BASED_TREE(inode_recs, free_inode_ptr);
1338
1339 static struct shared_node *find_shared_node(struct cache_tree *shared,
1340                                             u64 bytenr)
1341 {
1342         struct cache_extent *cache;
1343         struct shared_node *node;
1344
1345         cache = lookup_cache_extent(shared, bytenr, 1);
1346         if (cache) {
1347                 node = container_of(cache, struct shared_node, cache);
1348                 return node;
1349         }
1350         return NULL;
1351 }
1352
1353 static int add_shared_node(struct cache_tree *shared, u64 bytenr, u32 refs)
1354 {
1355         int ret;
1356         struct shared_node *node;
1357
1358         node = calloc(1, sizeof(*node));
1359         if (!node)
1360                 return -ENOMEM;
1361         node->cache.start = bytenr;
1362         node->cache.size = 1;
1363         cache_tree_init(&node->root_cache);
1364         cache_tree_init(&node->inode_cache);
1365         node->refs = refs;
1366
1367         ret = insert_cache_extent(shared, &node->cache);
1368
1369         return ret;
1370 }
1371
1372 static int enter_shared_node(struct btrfs_root *root, u64 bytenr, u32 refs,
1373                              struct walk_control *wc, int level)
1374 {
1375         struct shared_node *node;
1376         struct shared_node *dest;
1377         int ret;
1378
1379         if (level == wc->active_node)
1380                 return 0;
1381
1382         BUG_ON(wc->active_node <= level);
1383         node = find_shared_node(&wc->shared, bytenr);
1384         if (!node) {
1385                 ret = add_shared_node(&wc->shared, bytenr, refs);
1386                 BUG_ON(ret);
1387                 node = find_shared_node(&wc->shared, bytenr);
1388                 wc->nodes[level] = node;
1389                 wc->active_node = level;
1390                 return 0;
1391         }
1392
1393         if (wc->root_level == wc->active_node &&
1394             btrfs_root_refs(&root->root_item) == 0) {
1395                 if (--node->refs == 0) {
1396                         free_inode_recs_tree(&node->root_cache);
1397                         free_inode_recs_tree(&node->inode_cache);
1398                         remove_cache_extent(&wc->shared, &node->cache);
1399                         free(node);
1400                 }
1401                 return 1;
1402         }
1403
1404         dest = wc->nodes[wc->active_node];
1405         splice_shared_node(node, dest);
1406         if (node->refs == 0) {
1407                 remove_cache_extent(&wc->shared, &node->cache);
1408                 free(node);
1409         }
1410         return 1;
1411 }
1412
1413 static int leave_shared_node(struct btrfs_root *root,
1414                              struct walk_control *wc, int level)
1415 {
1416         struct shared_node *node;
1417         struct shared_node *dest;
1418         int i;
1419
1420         if (level == wc->root_level)
1421                 return 0;
1422
1423         for (i = level + 1; i < BTRFS_MAX_LEVEL; i++) {
1424                 if (wc->nodes[i])
1425                         break;
1426         }
1427         BUG_ON(i >= BTRFS_MAX_LEVEL);
1428
1429         node = wc->nodes[wc->active_node];
1430         wc->nodes[wc->active_node] = NULL;
1431         wc->active_node = i;
1432
1433         dest = wc->nodes[wc->active_node];
1434         if (wc->active_node < wc->root_level ||
1435             btrfs_root_refs(&root->root_item) > 0) {
1436                 BUG_ON(node->refs <= 1);
1437                 splice_shared_node(node, dest);
1438         } else {
1439                 BUG_ON(node->refs < 2);
1440                 node->refs--;
1441         }
1442         return 0;
1443 }
1444
1445 /*
1446  * Returns:
1447  * < 0 - on error
1448  * 1   - if the root with id child_root_id is a child of root parent_root_id
1449  * 0   - if the root child_root_id isn't a child of the root parent_root_id but
1450  *       has other root(s) as parent(s)
1451  * 2   - if the root child_root_id doesn't have any parent roots
1452  */
1453 static int is_child_root(struct btrfs_root *root, u64 parent_root_id,
1454                          u64 child_root_id)
1455 {
1456         struct btrfs_path path;
1457         struct btrfs_key key;
1458         struct extent_buffer *leaf;
1459         int has_parent = 0;
1460         int ret;
1461
1462         btrfs_init_path(&path);
1463
1464         key.objectid = parent_root_id;
1465         key.type = BTRFS_ROOT_REF_KEY;
1466         key.offset = child_root_id;
1467         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
1468                                 0, 0);
1469         if (ret < 0)
1470                 return ret;
1471         btrfs_release_path(&path);
1472         if (!ret)
1473                 return 1;
1474
1475         key.objectid = child_root_id;
1476         key.type = BTRFS_ROOT_BACKREF_KEY;
1477         key.offset = 0;
1478         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
1479                                 0, 0);
1480         if (ret < 0)
1481                 goto out;
1482
1483         while (1) {
1484                 leaf = path.nodes[0];
1485                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
1486                         ret = btrfs_next_leaf(root->fs_info->tree_root, &path);
1487                         if (ret)
1488                                 break;
1489                         leaf = path.nodes[0];
1490                 }
1491
1492                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1493                 if (key.objectid != child_root_id ||
1494                     key.type != BTRFS_ROOT_BACKREF_KEY)
1495                         break;
1496
1497                 has_parent = 1;
1498
1499                 if (key.offset == parent_root_id) {
1500                         btrfs_release_path(&path);
1501                         return 1;
1502                 }
1503
1504                 path.slots[0]++;
1505         }
1506 out:
1507         btrfs_release_path(&path);
1508         if (ret < 0)
1509                 return ret;
1510         return has_parent ? 0 : 2;
1511 }
1512
1513 static int process_dir_item(struct btrfs_root *root,
1514                             struct extent_buffer *eb,
1515                             int slot, struct btrfs_key *key,
1516                             struct shared_node *active_node)
1517 {
1518         u32 total;
1519         u32 cur = 0;
1520         u32 len;
1521         u32 name_len;
1522         u32 data_len;
1523         int error;
1524         int nritems = 0;
1525         int filetype;
1526         struct btrfs_dir_item *di;
1527         struct inode_record *rec;
1528         struct cache_tree *root_cache;
1529         struct cache_tree *inode_cache;
1530         struct btrfs_key location;
1531         char namebuf[BTRFS_NAME_LEN];
1532
1533         root_cache = &active_node->root_cache;
1534         inode_cache = &active_node->inode_cache;
1535         rec = active_node->current;
1536         rec->found_dir_item = 1;
1537
1538         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1539         total = btrfs_item_size_nr(eb, slot);
1540         while (cur < total) {
1541                 nritems++;
1542                 btrfs_dir_item_key_to_cpu(eb, di, &location);
1543                 name_len = btrfs_dir_name_len(eb, di);
1544                 data_len = btrfs_dir_data_len(eb, di);
1545                 filetype = btrfs_dir_type(eb, di);
1546
1547                 rec->found_size += name_len;
1548                 if (name_len <= BTRFS_NAME_LEN) {
1549                         len = name_len;
1550                         error = 0;
1551                 } else {
1552                         len = BTRFS_NAME_LEN;
1553                         error = REF_ERR_NAME_TOO_LONG;
1554                 }
1555                 read_extent_buffer(eb, namebuf, (unsigned long)(di + 1), len);
1556
1557                 if (location.type == BTRFS_INODE_ITEM_KEY) {
1558                         add_inode_backref(inode_cache, location.objectid,
1559                                           key->objectid, key->offset, namebuf,
1560                                           len, filetype, key->type, error);
1561                 } else if (location.type == BTRFS_ROOT_ITEM_KEY) {
1562                         add_inode_backref(root_cache, location.objectid,
1563                                           key->objectid, key->offset,
1564                                           namebuf, len, filetype,
1565                                           key->type, error);
1566                 } else {
1567                         fprintf(stderr, "invalid location in dir item %u\n",
1568                                 location.type);
1569                         add_inode_backref(inode_cache, BTRFS_MULTIPLE_OBJECTIDS,
1570                                           key->objectid, key->offset, namebuf,
1571                                           len, filetype, key->type, error);
1572                 }
1573
1574                 len = sizeof(*di) + name_len + data_len;
1575                 di = (struct btrfs_dir_item *)((char *)di + len);
1576                 cur += len;
1577         }
1578         if (key->type == BTRFS_DIR_INDEX_KEY && nritems > 1)
1579                 rec->errors |= I_ERR_DUP_DIR_INDEX;
1580
1581         return 0;
1582 }
1583
1584 static int process_inode_ref(struct extent_buffer *eb,
1585                              int slot, struct btrfs_key *key,
1586                              struct shared_node *active_node)
1587 {
1588         u32 total;
1589         u32 cur = 0;
1590         u32 len;
1591         u32 name_len;
1592         u64 index;
1593         int error;
1594         struct cache_tree *inode_cache;
1595         struct btrfs_inode_ref *ref;
1596         char namebuf[BTRFS_NAME_LEN];
1597
1598         inode_cache = &active_node->inode_cache;
1599
1600         ref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
1601         total = btrfs_item_size_nr(eb, slot);
1602         while (cur < total) {
1603                 name_len = btrfs_inode_ref_name_len(eb, ref);
1604                 index = btrfs_inode_ref_index(eb, ref);
1605                 if (name_len <= BTRFS_NAME_LEN) {
1606                         len = name_len;
1607                         error = 0;
1608                 } else {
1609                         len = BTRFS_NAME_LEN;
1610                         error = REF_ERR_NAME_TOO_LONG;
1611                 }
1612                 read_extent_buffer(eb, namebuf, (unsigned long)(ref + 1), len);
1613                 add_inode_backref(inode_cache, key->objectid, key->offset,
1614                                   index, namebuf, len, 0, key->type, error);
1615
1616                 len = sizeof(*ref) + name_len;
1617                 ref = (struct btrfs_inode_ref *)((char *)ref + len);
1618                 cur += len;
1619         }
1620         return 0;
1621 }
1622
1623 static int process_inode_extref(struct extent_buffer *eb,
1624                                 int slot, struct btrfs_key *key,
1625                                 struct shared_node *active_node)
1626 {
1627         u32 total;
1628         u32 cur = 0;
1629         u32 len;
1630         u32 name_len;
1631         u64 index;
1632         u64 parent;
1633         int error;
1634         struct cache_tree *inode_cache;
1635         struct btrfs_inode_extref *extref;
1636         char namebuf[BTRFS_NAME_LEN];
1637
1638         inode_cache = &active_node->inode_cache;
1639
1640         extref = btrfs_item_ptr(eb, slot, struct btrfs_inode_extref);
1641         total = btrfs_item_size_nr(eb, slot);
1642         while (cur < total) {
1643                 name_len = btrfs_inode_extref_name_len(eb, extref);
1644                 index = btrfs_inode_extref_index(eb, extref);
1645                 parent = btrfs_inode_extref_parent(eb, extref);
1646                 if (name_len <= BTRFS_NAME_LEN) {
1647                         len = name_len;
1648                         error = 0;
1649                 } else {
1650                         len = BTRFS_NAME_LEN;
1651                         error = REF_ERR_NAME_TOO_LONG;
1652                 }
1653                 read_extent_buffer(eb, namebuf,
1654                                    (unsigned long)(extref + 1), len);
1655                 add_inode_backref(inode_cache, key->objectid, parent,
1656                                   index, namebuf, len, 0, key->type, error);
1657
1658                 len = sizeof(*extref) + name_len;
1659                 extref = (struct btrfs_inode_extref *)((char *)extref + len);
1660                 cur += len;
1661         }
1662         return 0;
1663
1664 }
1665
1666 static int count_csum_range(struct btrfs_root *root, u64 start,
1667                             u64 len, u64 *found)
1668 {
1669         struct btrfs_key key;
1670         struct btrfs_path path;
1671         struct extent_buffer *leaf;
1672         int ret;
1673         size_t size;
1674         *found = 0;
1675         u64 csum_end;
1676         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1677
1678         btrfs_init_path(&path);
1679
1680         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1681         key.offset = start;
1682         key.type = BTRFS_EXTENT_CSUM_KEY;
1683
1684         ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
1685                                 &key, &path, 0, 0);
1686         if (ret < 0)
1687                 goto out;
1688         if (ret > 0 && path.slots[0] > 0) {
1689                 leaf = path.nodes[0];
1690                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0] - 1);
1691                 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
1692                     key.type == BTRFS_EXTENT_CSUM_KEY)
1693                         path.slots[0]--;
1694         }
1695
1696         while (len > 0) {
1697                 leaf = path.nodes[0];
1698                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
1699                         ret = btrfs_next_leaf(root->fs_info->csum_root, &path);
1700                         if (ret > 0)
1701                                 break;
1702                         else if (ret < 0)
1703                                 goto out;
1704                         leaf = path.nodes[0];
1705                 }
1706
1707                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1708                 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
1709                     key.type != BTRFS_EXTENT_CSUM_KEY)
1710                         break;
1711
1712                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1713                 if (key.offset >= start + len)
1714                         break;
1715
1716                 if (key.offset > start)
1717                         start = key.offset;
1718
1719                 size = btrfs_item_size_nr(leaf, path.slots[0]);
1720                 csum_end = key.offset + (size / csum_size) * root->sectorsize;
1721                 if (csum_end > start) {
1722                         size = min(csum_end - start, len);
1723                         len -= size;
1724                         start += size;
1725                         *found += size;
1726                 }
1727
1728                 path.slots[0]++;
1729         }
1730 out:
1731         btrfs_release_path(&path);
1732         if (ret < 0)
1733                 return ret;
1734         return 0;
1735 }
1736
1737 static int process_file_extent(struct btrfs_root *root,
1738                                 struct extent_buffer *eb,
1739                                 int slot, struct btrfs_key *key,
1740                                 struct shared_node *active_node)
1741 {
1742         struct inode_record *rec;
1743         struct btrfs_file_extent_item *fi;
1744         u64 num_bytes = 0;
1745         u64 disk_bytenr = 0;
1746         u64 extent_offset = 0;
1747         u64 mask = root->sectorsize - 1;
1748         int extent_type;
1749         int ret;
1750
1751         rec = active_node->current;
1752         BUG_ON(rec->ino != key->objectid || rec->refs > 1);
1753         rec->found_file_extent = 1;
1754
1755         if (rec->extent_start == (u64)-1) {
1756                 rec->extent_start = key->offset;
1757                 rec->extent_end = key->offset;
1758         }
1759
1760         if (rec->extent_end > key->offset)
1761                 rec->errors |= I_ERR_FILE_EXTENT_OVERLAP;
1762         else if (rec->extent_end < key->offset) {
1763                 ret = add_file_extent_hole(&rec->holes, rec->extent_end,
1764                                            key->offset - rec->extent_end);
1765                 if (ret < 0)
1766                         return ret;
1767         }
1768
1769         fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
1770         extent_type = btrfs_file_extent_type(eb, fi);
1771
1772         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1773                 num_bytes = btrfs_file_extent_inline_len(eb, slot, fi);
1774                 if (num_bytes == 0)
1775                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1776                 rec->found_size += num_bytes;
1777                 num_bytes = (num_bytes + mask) & ~mask;
1778         } else if (extent_type == BTRFS_FILE_EXTENT_REG ||
1779                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1780                 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1781                 disk_bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1782                 extent_offset = btrfs_file_extent_offset(eb, fi);
1783                 if (num_bytes == 0 || (num_bytes & mask))
1784                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1785                 if (num_bytes + extent_offset >
1786                     btrfs_file_extent_ram_bytes(eb, fi))
1787                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1788                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
1789                     (btrfs_file_extent_compression(eb, fi) ||
1790                      btrfs_file_extent_encryption(eb, fi) ||
1791                      btrfs_file_extent_other_encoding(eb, fi)))
1792                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1793                 if (disk_bytenr > 0)
1794                         rec->found_size += num_bytes;
1795         } else {
1796                 rec->errors |= I_ERR_BAD_FILE_EXTENT;
1797         }
1798         rec->extent_end = key->offset + num_bytes;
1799
1800         /*
1801          * The data reloc tree will copy full extents into its inode and then
1802          * copy the corresponding csums.  Because the extent it copied could be
1803          * a preallocated extent that hasn't been written to yet there may be no
1804          * csums to copy, ergo we won't have csums for our file extent.  This is
1805          * ok so just don't bother checking csums if the inode belongs to the
1806          * data reloc tree.
1807          */
1808         if (disk_bytenr > 0 &&
1809             btrfs_header_owner(eb) != BTRFS_DATA_RELOC_TREE_OBJECTID) {
1810                 u64 found;
1811                 if (btrfs_file_extent_compression(eb, fi))
1812                         num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1813                 else
1814                         disk_bytenr += extent_offset;
1815
1816                 ret = count_csum_range(root, disk_bytenr, num_bytes, &found);
1817                 if (ret < 0)
1818                         return ret;
1819                 if (extent_type == BTRFS_FILE_EXTENT_REG) {
1820                         if (found > 0)
1821                                 rec->found_csum_item = 1;
1822                         if (found < num_bytes)
1823                                 rec->some_csum_missing = 1;
1824                 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1825                         if (found > 0)
1826                                 rec->errors |= I_ERR_ODD_CSUM_ITEM;
1827                 }
1828         }
1829         return 0;
1830 }
1831
1832 static int process_one_leaf(struct btrfs_root *root, struct extent_buffer *eb,
1833                             struct walk_control *wc)
1834 {
1835         struct btrfs_key key;
1836         u32 nritems;
1837         int i;
1838         int ret = 0;
1839         struct cache_tree *inode_cache;
1840         struct shared_node *active_node;
1841
1842         if (wc->root_level == wc->active_node &&
1843             btrfs_root_refs(&root->root_item) == 0)
1844                 return 0;
1845
1846         active_node = wc->nodes[wc->active_node];
1847         inode_cache = &active_node->inode_cache;
1848         nritems = btrfs_header_nritems(eb);
1849         for (i = 0; i < nritems; i++) {
1850                 btrfs_item_key_to_cpu(eb, &key, i);
1851
1852                 if (key.objectid == BTRFS_FREE_SPACE_OBJECTID)
1853                         continue;
1854                 if (key.type == BTRFS_ORPHAN_ITEM_KEY)
1855                         continue;
1856
1857                 if (active_node->current == NULL ||
1858                     active_node->current->ino < key.objectid) {
1859                         if (active_node->current) {
1860                                 active_node->current->checked = 1;
1861                                 maybe_free_inode_rec(inode_cache,
1862                                                      active_node->current);
1863                         }
1864                         active_node->current = get_inode_rec(inode_cache,
1865                                                              key.objectid, 1);
1866                         BUG_ON(IS_ERR(active_node->current));
1867                 }
1868                 switch (key.type) {
1869                 case BTRFS_DIR_ITEM_KEY:
1870                 case BTRFS_DIR_INDEX_KEY:
1871                         ret = process_dir_item(root, eb, i, &key, active_node);
1872                         break;
1873                 case BTRFS_INODE_REF_KEY:
1874                         ret = process_inode_ref(eb, i, &key, active_node);
1875                         break;
1876                 case BTRFS_INODE_EXTREF_KEY:
1877                         ret = process_inode_extref(eb, i, &key, active_node);
1878                         break;
1879                 case BTRFS_INODE_ITEM_KEY:
1880                         ret = process_inode_item(eb, i, &key, active_node);
1881                         break;
1882                 case BTRFS_EXTENT_DATA_KEY:
1883                         ret = process_file_extent(root, eb, i, &key,
1884                                                   active_node);
1885                         break;
1886                 default:
1887                         break;
1888                 };
1889         }
1890         return ret;
1891 }
1892
1893 static void reada_walk_down(struct btrfs_root *root,
1894                             struct extent_buffer *node, int slot)
1895 {
1896         u64 bytenr;
1897         u64 ptr_gen;
1898         u32 nritems;
1899         u32 blocksize;
1900         int i;
1901         int level;
1902
1903         level = btrfs_header_level(node);
1904         if (level != 1)
1905                 return;
1906
1907         nritems = btrfs_header_nritems(node);
1908         blocksize = root->nodesize;
1909         for (i = slot; i < nritems; i++) {
1910                 bytenr = btrfs_node_blockptr(node, i);
1911                 ptr_gen = btrfs_node_ptr_generation(node, i);
1912                 readahead_tree_block(root, bytenr, blocksize, ptr_gen);
1913         }
1914 }
1915
1916 /*
1917  * Check the child node/leaf by the following condition:
1918  * 1. the first item key of the node/leaf should be the same with the one
1919  *    in parent.
1920  * 2. block in parent node should match the child node/leaf.
1921  * 3. generation of parent node and child's header should be consistent.
1922  *
1923  * Or the child node/leaf pointed by the key in parent is not valid.
1924  *
1925  * We hope to check leaf owner too, but since subvol may share leaves,
1926  * which makes leaf owner check not so strong, key check should be
1927  * sufficient enough for that case.
1928  */
1929 static int check_child_node(struct btrfs_root *root,
1930                             struct extent_buffer *parent, int slot,
1931                             struct extent_buffer *child)
1932 {
1933         struct btrfs_key parent_key;
1934         struct btrfs_key child_key;
1935         int ret = 0;
1936
1937         btrfs_node_key_to_cpu(parent, &parent_key, slot);
1938         if (btrfs_header_level(child) == 0)
1939                 btrfs_item_key_to_cpu(child, &child_key, 0);
1940         else
1941                 btrfs_node_key_to_cpu(child, &child_key, 0);
1942
1943         if (memcmp(&parent_key, &child_key, sizeof(parent_key))) {
1944                 ret = -EINVAL;
1945                 fprintf(stderr,
1946                         "Wrong key of child node/leaf, wanted: (%llu, %u, %llu), have: (%llu, %u, %llu)\n",
1947                         parent_key.objectid, parent_key.type, parent_key.offset,
1948                         child_key.objectid, child_key.type, child_key.offset);
1949         }
1950         if (btrfs_header_bytenr(child) != btrfs_node_blockptr(parent, slot)) {
1951                 ret = -EINVAL;
1952                 fprintf(stderr, "Wrong block of child node/leaf, wanted: %llu, have: %llu\n",
1953                         btrfs_node_blockptr(parent, slot),
1954                         btrfs_header_bytenr(child));
1955         }
1956         if (btrfs_node_ptr_generation(parent, slot) !=
1957             btrfs_header_generation(child)) {
1958                 ret = -EINVAL;
1959                 fprintf(stderr, "Wrong generation of child node/leaf, wanted: %llu, have: %llu\n",
1960                         btrfs_header_generation(child),
1961                         btrfs_node_ptr_generation(parent, slot));
1962         }
1963         return ret;
1964 }
1965
1966 struct node_refs {
1967         u64 bytenr[BTRFS_MAX_LEVEL];
1968         u64 refs[BTRFS_MAX_LEVEL];
1969 };
1970
1971 static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
1972                           struct walk_control *wc, int *level,
1973                           struct node_refs *nrefs)
1974 {
1975         enum btrfs_tree_block_status status;
1976         u64 bytenr;
1977         u64 ptr_gen;
1978         struct extent_buffer *next;
1979         struct extent_buffer *cur;
1980         u32 blocksize;
1981         int ret, err = 0;
1982         u64 refs;
1983
1984         WARN_ON(*level < 0);
1985         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1986
1987         if (path->nodes[*level]->start == nrefs->bytenr[*level]) {
1988                 refs = nrefs->refs[*level];
1989                 ret = 0;
1990         } else {
1991                 ret = btrfs_lookup_extent_info(NULL, root,
1992                                        path->nodes[*level]->start,
1993                                        *level, 1, &refs, NULL);
1994                 if (ret < 0) {
1995                         err = ret;
1996                         goto out;
1997                 }
1998                 nrefs->bytenr[*level] = path->nodes[*level]->start;
1999                 nrefs->refs[*level] = refs;
2000         }
2001
2002         if (refs > 1) {
2003                 ret = enter_shared_node(root, path->nodes[*level]->start,
2004                                         refs, wc, *level);
2005                 if (ret > 0) {
2006                         err = ret;
2007                         goto out;
2008                 }
2009         }
2010
2011         while (*level >= 0) {
2012                 WARN_ON(*level < 0);
2013                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2014                 cur = path->nodes[*level];
2015
2016                 if (btrfs_header_level(cur) != *level)
2017                         WARN_ON(1);
2018
2019                 if (path->slots[*level] >= btrfs_header_nritems(cur))
2020                         break;
2021                 if (*level == 0) {
2022                         ret = process_one_leaf(root, cur, wc);
2023                         if (ret < 0)
2024                                 err = ret;
2025                         break;
2026                 }
2027                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2028                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2029                 blocksize = root->nodesize;
2030
2031                 if (bytenr == nrefs->bytenr[*level - 1]) {
2032                         refs = nrefs->refs[*level - 1];
2033                 } else {
2034                         ret = btrfs_lookup_extent_info(NULL, root, bytenr,
2035                                         *level - 1, 1, &refs, NULL);
2036                         if (ret < 0) {
2037                                 refs = 0;
2038                         } else {
2039                                 nrefs->bytenr[*level - 1] = bytenr;
2040                                 nrefs->refs[*level - 1] = refs;
2041                         }
2042                 }
2043
2044                 if (refs > 1) {
2045                         ret = enter_shared_node(root, bytenr, refs,
2046                                                 wc, *level - 1);
2047                         if (ret > 0) {
2048                                 path->slots[*level]++;
2049                                 continue;
2050                         }
2051                 }
2052
2053                 next = btrfs_find_tree_block(root, bytenr, blocksize);
2054                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
2055                         free_extent_buffer(next);
2056                         reada_walk_down(root, cur, path->slots[*level]);
2057                         next = read_tree_block(root, bytenr, blocksize,
2058                                                ptr_gen);
2059                         if (!extent_buffer_uptodate(next)) {
2060                                 struct btrfs_key node_key;
2061
2062                                 btrfs_node_key_to_cpu(path->nodes[*level],
2063                                                       &node_key,
2064                                                       path->slots[*level]);
2065                                 btrfs_add_corrupt_extent_record(root->fs_info,
2066                                                 &node_key,
2067                                                 path->nodes[*level]->start,
2068                                                 root->nodesize, *level);
2069                                 err = -EIO;
2070                                 goto out;
2071                         }
2072                 }
2073
2074                 ret = check_child_node(root, cur, path->slots[*level], next);
2075                 if (ret) {
2076                         err = ret;
2077                         goto out;
2078                 }
2079
2080                 if (btrfs_is_leaf(next))
2081                         status = btrfs_check_leaf(root, NULL, next);
2082                 else
2083                         status = btrfs_check_node(root, NULL, next);
2084                 if (status != BTRFS_TREE_BLOCK_CLEAN) {
2085                         free_extent_buffer(next);
2086                         err = -EIO;
2087                         goto out;
2088                 }
2089
2090                 *level = *level - 1;
2091                 free_extent_buffer(path->nodes[*level]);
2092                 path->nodes[*level] = next;
2093                 path->slots[*level] = 0;
2094         }
2095 out:
2096         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2097         return err;
2098 }
2099
2100 static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
2101                         struct walk_control *wc, int *level)
2102 {
2103         int i;
2104         struct extent_buffer *leaf;
2105
2106         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2107                 leaf = path->nodes[i];
2108                 if (path->slots[i] + 1 < btrfs_header_nritems(leaf)) {
2109                         path->slots[i]++;
2110                         *level = i;
2111                         return 0;
2112                 } else {
2113                         free_extent_buffer(path->nodes[*level]);
2114                         path->nodes[*level] = NULL;
2115                         BUG_ON(*level > wc->active_node);
2116                         if (*level == wc->active_node)
2117                                 leave_shared_node(root, wc, *level);
2118                         *level = i + 1;
2119                 }
2120         }
2121         return 1;
2122 }
2123
2124 static int check_root_dir(struct inode_record *rec)
2125 {
2126         struct inode_backref *backref;
2127         int ret = -1;
2128
2129         if (!rec->found_inode_item || rec->errors)
2130                 goto out;
2131         if (rec->nlink != 1 || rec->found_link != 0)
2132                 goto out;
2133         if (list_empty(&rec->backrefs))
2134                 goto out;
2135         backref = to_inode_backref(rec->backrefs.next);
2136         if (!backref->found_inode_ref)
2137                 goto out;
2138         if (backref->index != 0 || backref->namelen != 2 ||
2139             memcmp(backref->name, "..", 2))
2140                 goto out;
2141         if (backref->found_dir_index || backref->found_dir_item)
2142                 goto out;
2143         ret = 0;
2144 out:
2145         return ret;
2146 }
2147
2148 static int repair_inode_isize(struct btrfs_trans_handle *trans,
2149                               struct btrfs_root *root, struct btrfs_path *path,
2150                               struct inode_record *rec)
2151 {
2152         struct btrfs_inode_item *ei;
2153         struct btrfs_key key;
2154         int ret;
2155
2156         key.objectid = rec->ino;
2157         key.type = BTRFS_INODE_ITEM_KEY;
2158         key.offset = (u64)-1;
2159
2160         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2161         if (ret < 0)
2162                 goto out;
2163         if (ret) {
2164                 if (!path->slots[0]) {
2165                         ret = -ENOENT;
2166                         goto out;
2167                 }
2168                 path->slots[0]--;
2169                 ret = 0;
2170         }
2171         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2172         if (key.objectid != rec->ino) {
2173                 ret = -ENOENT;
2174                 goto out;
2175         }
2176
2177         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2178                             struct btrfs_inode_item);
2179         btrfs_set_inode_size(path->nodes[0], ei, rec->found_size);
2180         btrfs_mark_buffer_dirty(path->nodes[0]);
2181         rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
2182         printf("reset isize for dir %Lu root %Lu\n", rec->ino,
2183                root->root_key.objectid);
2184 out:
2185         btrfs_release_path(path);
2186         return ret;
2187 }
2188
2189 static int repair_inode_orphan_item(struct btrfs_trans_handle *trans,
2190                                     struct btrfs_root *root,
2191                                     struct btrfs_path *path,
2192                                     struct inode_record *rec)
2193 {
2194         int ret;
2195
2196         ret = btrfs_add_orphan_item(trans, root, path, rec->ino);
2197         btrfs_release_path(path);
2198         if (!ret)
2199                 rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
2200         return ret;
2201 }
2202
2203 static int repair_inode_nbytes(struct btrfs_trans_handle *trans,
2204                                struct btrfs_root *root,
2205                                struct btrfs_path *path,
2206                                struct inode_record *rec)
2207 {
2208         struct btrfs_inode_item *ei;
2209         struct btrfs_key key;
2210         int ret = 0;
2211
2212         key.objectid = rec->ino;
2213         key.type = BTRFS_INODE_ITEM_KEY;
2214         key.offset = 0;
2215
2216         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2217         if (ret) {
2218                 if (ret > 0)
2219                         ret = -ENOENT;
2220                 goto out;
2221         }
2222
2223         /* Since ret == 0, no need to check anything */
2224         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2225                             struct btrfs_inode_item);
2226         btrfs_set_inode_nbytes(path->nodes[0], ei, rec->found_size);
2227         btrfs_mark_buffer_dirty(path->nodes[0]);
2228         rec->errors &= ~I_ERR_FILE_NBYTES_WRONG;
2229         printf("reset nbytes for ino %llu root %llu\n",
2230                rec->ino, root->root_key.objectid);
2231 out:
2232         btrfs_release_path(path);
2233         return ret;
2234 }
2235
2236 static int add_missing_dir_index(struct btrfs_root *root,
2237                                  struct cache_tree *inode_cache,
2238                                  struct inode_record *rec,
2239                                  struct inode_backref *backref)
2240 {
2241         struct btrfs_path *path;
2242         struct btrfs_trans_handle *trans;
2243         struct btrfs_dir_item *dir_item;
2244         struct extent_buffer *leaf;
2245         struct btrfs_key key;
2246         struct btrfs_disk_key disk_key;
2247         struct inode_record *dir_rec;
2248         unsigned long name_ptr;
2249         u32 data_size = sizeof(*dir_item) + backref->namelen;
2250         int ret;
2251
2252         path = btrfs_alloc_path();
2253         if (!path)
2254                 return -ENOMEM;
2255
2256         trans = btrfs_start_transaction(root, 1);
2257         if (IS_ERR(trans)) {
2258                 btrfs_free_path(path);
2259                 return PTR_ERR(trans);
2260         }
2261
2262         fprintf(stderr, "repairing missing dir index item for inode %llu\n",
2263                 (unsigned long long)rec->ino);
2264         key.objectid = backref->dir;
2265         key.type = BTRFS_DIR_INDEX_KEY;
2266         key.offset = backref->index;
2267
2268         ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
2269         BUG_ON(ret);
2270
2271         leaf = path->nodes[0];
2272         dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
2273
2274         disk_key.objectid = cpu_to_le64(rec->ino);
2275         disk_key.type = BTRFS_INODE_ITEM_KEY;
2276         disk_key.offset = 0;
2277
2278         btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
2279         btrfs_set_dir_type(leaf, dir_item, imode_to_type(rec->imode));
2280         btrfs_set_dir_data_len(leaf, dir_item, 0);
2281         btrfs_set_dir_name_len(leaf, dir_item, backref->namelen);
2282         name_ptr = (unsigned long)(dir_item + 1);
2283         write_extent_buffer(leaf, backref->name, name_ptr, backref->namelen);
2284         btrfs_mark_buffer_dirty(leaf);
2285         btrfs_free_path(path);
2286         btrfs_commit_transaction(trans, root);
2287
2288         backref->found_dir_index = 1;
2289         dir_rec = get_inode_rec(inode_cache, backref->dir, 0);
2290         BUG_ON(IS_ERR(dir_rec));
2291         if (!dir_rec)
2292                 return 0;
2293         dir_rec->found_size += backref->namelen;
2294         if (dir_rec->found_size == dir_rec->isize &&
2295             (dir_rec->errors & I_ERR_DIR_ISIZE_WRONG))
2296                 dir_rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
2297         if (dir_rec->found_size != dir_rec->isize)
2298                 dir_rec->errors |= I_ERR_DIR_ISIZE_WRONG;
2299
2300         return 0;
2301 }
2302
2303 static int delete_dir_index(struct btrfs_root *root,
2304                             struct cache_tree *inode_cache,
2305                             struct inode_record *rec,
2306                             struct inode_backref *backref)
2307 {
2308         struct btrfs_trans_handle *trans;
2309         struct btrfs_dir_item *di;
2310         struct btrfs_path *path;
2311         int ret = 0;
2312
2313         path = btrfs_alloc_path();
2314         if (!path)
2315                 return -ENOMEM;
2316
2317         trans = btrfs_start_transaction(root, 1);
2318         if (IS_ERR(trans)) {
2319                 btrfs_free_path(path);
2320                 return PTR_ERR(trans);
2321         }
2322
2323
2324         fprintf(stderr, "Deleting bad dir index [%llu,%u,%llu] root %llu\n",
2325                 (unsigned long long)backref->dir,
2326                 BTRFS_DIR_INDEX_KEY, (unsigned long long)backref->index,
2327                 (unsigned long long)root->objectid);
2328
2329         di = btrfs_lookup_dir_index(trans, root, path, backref->dir,
2330                                     backref->name, backref->namelen,
2331                                     backref->index, -1);
2332         if (IS_ERR(di)) {
2333                 ret = PTR_ERR(di);
2334                 btrfs_free_path(path);
2335                 btrfs_commit_transaction(trans, root);
2336                 if (ret == -ENOENT)
2337                         return 0;
2338                 return ret;
2339         }
2340
2341         if (!di)
2342                 ret = btrfs_del_item(trans, root, path);
2343         else
2344                 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2345         BUG_ON(ret);
2346         btrfs_free_path(path);
2347         btrfs_commit_transaction(trans, root);
2348         return ret;
2349 }
2350
2351 static int create_inode_item(struct btrfs_root *root,
2352                              struct inode_record *rec,
2353                              struct inode_backref *backref, int root_dir)
2354 {
2355         struct btrfs_trans_handle *trans;
2356         struct btrfs_inode_item inode_item;
2357         time_t now = time(NULL);
2358         int ret;
2359
2360         trans = btrfs_start_transaction(root, 1);
2361         if (IS_ERR(trans)) {
2362                 ret = PTR_ERR(trans);
2363                 return ret;
2364         }
2365
2366         fprintf(stderr, "root %llu inode %llu recreating inode item, this may "
2367                 "be incomplete, please check permissions and content after "
2368                 "the fsck completes.\n", (unsigned long long)root->objectid,
2369                 (unsigned long long)rec->ino);
2370
2371         memset(&inode_item, 0, sizeof(inode_item));
2372         btrfs_set_stack_inode_generation(&inode_item, trans->transid);
2373         if (root_dir)
2374                 btrfs_set_stack_inode_nlink(&inode_item, 1);
2375         else
2376                 btrfs_set_stack_inode_nlink(&inode_item, rec->found_link);
2377         btrfs_set_stack_inode_nbytes(&inode_item, rec->found_size);
2378         if (rec->found_dir_item) {
2379                 if (rec->found_file_extent)
2380                         fprintf(stderr, "root %llu inode %llu has both a dir "
2381                                 "item and extents, unsure if it is a dir or a "
2382                                 "regular file so setting it as a directory\n",
2383                                 (unsigned long long)root->objectid,
2384                                 (unsigned long long)rec->ino);
2385                 btrfs_set_stack_inode_mode(&inode_item, S_IFDIR | 0755);
2386                 btrfs_set_stack_inode_size(&inode_item, rec->found_size);
2387         } else if (!rec->found_dir_item) {
2388                 btrfs_set_stack_inode_size(&inode_item, rec->extent_end);
2389                 btrfs_set_stack_inode_mode(&inode_item, S_IFREG | 0755);
2390         }
2391         btrfs_set_stack_timespec_sec(&inode_item.atime, now);
2392         btrfs_set_stack_timespec_nsec(&inode_item.atime, 0);
2393         btrfs_set_stack_timespec_sec(&inode_item.ctime, now);
2394         btrfs_set_stack_timespec_nsec(&inode_item.ctime, 0);
2395         btrfs_set_stack_timespec_sec(&inode_item.mtime, now);
2396         btrfs_set_stack_timespec_nsec(&inode_item.mtime, 0);
2397         btrfs_set_stack_timespec_sec(&inode_item.otime, 0);
2398         btrfs_set_stack_timespec_nsec(&inode_item.otime, 0);
2399
2400         ret = btrfs_insert_inode(trans, root, rec->ino, &inode_item);
2401         BUG_ON(ret);
2402         btrfs_commit_transaction(trans, root);
2403         return 0;
2404 }
2405
2406 static int repair_inode_backrefs(struct btrfs_root *root,
2407                                  struct inode_record *rec,
2408                                  struct cache_tree *inode_cache,
2409                                  int delete)
2410 {
2411         struct inode_backref *tmp, *backref;
2412         u64 root_dirid = btrfs_root_dirid(&root->root_item);
2413         int ret = 0;
2414         int repaired = 0;
2415
2416         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
2417                 if (!delete && rec->ino == root_dirid) {
2418                         if (!rec->found_inode_item) {
2419                                 ret = create_inode_item(root, rec, backref, 1);
2420                                 if (ret)
2421                                         break;
2422                                 repaired++;
2423                         }
2424                 }
2425
2426                 /* Index 0 for root dir's are special, don't mess with it */
2427                 if (rec->ino == root_dirid && backref->index == 0)
2428                         continue;
2429
2430                 if (delete &&
2431                     ((backref->found_dir_index && !backref->found_inode_ref) ||
2432                      (backref->found_dir_index && backref->found_inode_ref &&
2433                       (backref->errors & REF_ERR_INDEX_UNMATCH)))) {
2434                         ret = delete_dir_index(root, inode_cache, rec, backref);
2435                         if (ret)
2436                                 break;
2437                         repaired++;
2438                         list_del(&backref->list);
2439                         free(backref);
2440                 }
2441
2442                 if (!delete && !backref->found_dir_index &&
2443                     backref->found_dir_item && backref->found_inode_ref) {
2444                         ret = add_missing_dir_index(root, inode_cache, rec,
2445                                                     backref);
2446                         if (ret)
2447                                 break;
2448                         repaired++;
2449                         if (backref->found_dir_item &&
2450                             backref->found_dir_index &&
2451                             backref->found_dir_index) {
2452                                 if (!backref->errors &&
2453                                     backref->found_inode_ref) {
2454                                         list_del(&backref->list);
2455                                         free(backref);
2456                                 }
2457                         }
2458                 }
2459
2460                 if (!delete && (!backref->found_dir_index &&
2461                                 !backref->found_dir_item &&
2462                                 backref->found_inode_ref)) {
2463                         struct btrfs_trans_handle *trans;
2464                         struct btrfs_key location;
2465
2466                         ret = check_dir_conflict(root, backref->name,
2467                                                  backref->namelen,
2468                                                  backref->dir,
2469                                                  backref->index);
2470                         if (ret) {
2471                                 /*
2472                                  * let nlink fixing routine to handle it,
2473                                  * which can do it better.
2474                                  */
2475                                 ret = 0;
2476                                 break;
2477                         }
2478                         location.objectid = rec->ino;
2479                         location.type = BTRFS_INODE_ITEM_KEY;
2480                         location.offset = 0;
2481
2482                         trans = btrfs_start_transaction(root, 1);
2483                         if (IS_ERR(trans)) {
2484                                 ret = PTR_ERR(trans);
2485                                 break;
2486                         }
2487                         fprintf(stderr, "adding missing dir index/item pair "
2488                                 "for inode %llu\n",
2489                                 (unsigned long long)rec->ino);
2490                         ret = btrfs_insert_dir_item(trans, root, backref->name,
2491                                                     backref->namelen,
2492                                                     backref->dir, &location,
2493                                                     imode_to_type(rec->imode),
2494                                                     backref->index);
2495                         BUG_ON(ret);
2496                         btrfs_commit_transaction(trans, root);
2497                         repaired++;
2498                 }
2499
2500                 if (!delete && (backref->found_inode_ref &&
2501                                 backref->found_dir_index &&
2502                                 backref->found_dir_item &&
2503                                 !(backref->errors & REF_ERR_INDEX_UNMATCH) &&
2504                                 !rec->found_inode_item)) {
2505                         ret = create_inode_item(root, rec, backref, 0);
2506                         if (ret)
2507                                 break;
2508                         repaired++;
2509                 }
2510
2511         }
2512         return ret ? ret : repaired;
2513 }
2514
2515 /*
2516  * To determine the file type for nlink/inode_item repair
2517  *
2518  * Return 0 if file type is found and BTRFS_FT_* is stored into type.
2519  * Return -ENOENT if file type is not found.
2520  */
2521 static int find_file_type(struct inode_record *rec, u8 *type)
2522 {
2523         struct inode_backref *backref;
2524
2525         /* For inode item recovered case */
2526         if (rec->found_inode_item) {
2527                 *type = imode_to_type(rec->imode);
2528                 return 0;
2529         }
2530
2531         list_for_each_entry(backref, &rec->backrefs, list) {
2532                 if (backref->found_dir_index || backref->found_dir_item) {
2533                         *type = backref->filetype;
2534                         return 0;
2535                 }
2536         }
2537         return -ENOENT;
2538 }
2539
2540 /*
2541  * To determine the file name for nlink repair
2542  *
2543  * Return 0 if file name is found, set name and namelen.
2544  * Return -ENOENT if file name is not found.
2545  */
2546 static int find_file_name(struct inode_record *rec,
2547                           char *name, int *namelen)
2548 {
2549         struct inode_backref *backref;
2550
2551         list_for_each_entry(backref, &rec->backrefs, list) {
2552                 if (backref->found_dir_index || backref->found_dir_item ||
2553                     backref->found_inode_ref) {
2554                         memcpy(name, backref->name, backref->namelen);
2555                         *namelen = backref->namelen;
2556                         return 0;
2557                 }
2558         }
2559         return -ENOENT;
2560 }
2561
2562 /* Reset the nlink of the inode to the correct one */
2563 static int reset_nlink(struct btrfs_trans_handle *trans,
2564                        struct btrfs_root *root,
2565                        struct btrfs_path *path,
2566                        struct inode_record *rec)
2567 {
2568         struct inode_backref *backref;
2569         struct inode_backref *tmp;
2570         struct btrfs_key key;
2571         struct btrfs_inode_item *inode_item;
2572         int ret = 0;
2573
2574         /* We don't believe this either, reset it and iterate backref */
2575         rec->found_link = 0;
2576
2577         /* Remove all backref including the valid ones */
2578         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
2579                 ret = btrfs_unlink(trans, root, rec->ino, backref->dir,
2580                                    backref->index, backref->name,
2581                                    backref->namelen, 0);
2582                 if (ret < 0)
2583                         goto out;
2584
2585                 /* remove invalid backref, so it won't be added back */
2586                 if (!(backref->found_dir_index &&
2587                       backref->found_dir_item &&
2588                       backref->found_inode_ref)) {
2589                         list_del(&backref->list);
2590                         free(backref);
2591                 } else {
2592                         rec->found_link++;
2593                 }
2594         }
2595
2596         /* Set nlink to 0 */
2597         key.objectid = rec->ino;
2598         key.type = BTRFS_INODE_ITEM_KEY;
2599         key.offset = 0;
2600         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2601         if (ret < 0)
2602                 goto out;
2603         if (ret > 0) {
2604                 ret = -ENOENT;
2605                 goto out;
2606         }
2607         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2608                                     struct btrfs_inode_item);
2609         btrfs_set_inode_nlink(path->nodes[0], inode_item, 0);
2610         btrfs_mark_buffer_dirty(path->nodes[0]);
2611         btrfs_release_path(path);
2612
2613         /*
2614          * Add back valid inode_ref/dir_item/dir_index,
2615          * add_link() will handle the nlink inc, so new nlink must be correct
2616          */
2617         list_for_each_entry(backref, &rec->backrefs, list) {
2618                 ret = btrfs_add_link(trans, root, rec->ino, backref->dir,
2619                                      backref->name, backref->namelen,
2620                                      backref->filetype, &backref->index, 1);
2621                 if (ret < 0)
2622                         goto out;
2623         }
2624 out:
2625         btrfs_release_path(path);
2626         return ret;
2627 }
2628
2629 static int repair_inode_nlinks(struct btrfs_trans_handle *trans,
2630                                struct btrfs_root *root,
2631                                struct btrfs_path *path,
2632                                struct inode_record *rec)
2633 {
2634         char *dir_name = "lost+found";
2635         char namebuf[BTRFS_NAME_LEN] = {0};
2636         u64 lost_found_ino;
2637         u32 mode = 0700;
2638         u8 type = 0;
2639         int namelen = 0;
2640         int name_recovered = 0;
2641         int type_recovered = 0;
2642         int ret = 0;
2643
2644         /*
2645          * Get file name and type first before these invalid inode ref
2646          * are deleted by remove_all_invalid_backref()
2647          */
2648         name_recovered = !find_file_name(rec, namebuf, &namelen);
2649         type_recovered = !find_file_type(rec, &type);
2650
2651         if (!name_recovered) {
2652                 printf("Can't get file name for inode %llu, using '%llu' as fallback\n",
2653                        rec->ino, rec->ino);
2654                 namelen = count_digits(rec->ino);
2655                 sprintf(namebuf, "%llu", rec->ino);
2656                 name_recovered = 1;
2657         }
2658         if (!type_recovered) {
2659                 printf("Can't get file type for inode %llu, using FILE as fallback\n",
2660                        rec->ino);
2661                 type = BTRFS_FT_REG_FILE;
2662                 type_recovered = 1;
2663         }
2664
2665         ret = reset_nlink(trans, root, path, rec);
2666         if (ret < 0) {
2667                 fprintf(stderr,
2668                         "Failed to reset nlink for inode %llu: %s\n",
2669                         rec->ino, strerror(-ret));
2670                 goto out;
2671         }
2672
2673         if (rec->found_link == 0) {
2674                 lost_found_ino = root->highest_inode;
2675                 if (lost_found_ino >= BTRFS_LAST_FREE_OBJECTID) {
2676                         ret = -EOVERFLOW;
2677                         goto out;
2678                 }
2679                 lost_found_ino++;
2680                 ret = btrfs_mkdir(trans, root, dir_name, strlen(dir_name),
2681                                   BTRFS_FIRST_FREE_OBJECTID, &lost_found_ino,
2682                                   mode);
2683                 if (ret < 0) {
2684                         fprintf(stderr, "Failed to create '%s' dir: %s\n",
2685                                 dir_name, strerror(-ret));
2686                         goto out;
2687                 }
2688                 ret = btrfs_add_link(trans, root, rec->ino, lost_found_ino,
2689                                      namebuf, namelen, type, NULL, 1);
2690                 /*
2691                  * Add ".INO" suffix several times to handle case where
2692                  * "FILENAME.INO" is already taken by another file.
2693                  */
2694                 while (ret == -EEXIST) {
2695                         /*
2696                          * Conflicting file name, add ".INO" as suffix * +1 for '.'
2697                          */
2698                         if (namelen + count_digits(rec->ino) + 1 >
2699                             BTRFS_NAME_LEN) {
2700                                 ret = -EFBIG;
2701                                 goto out;
2702                         }
2703                         snprintf(namebuf + namelen, BTRFS_NAME_LEN - namelen,
2704                                  ".%llu", rec->ino);
2705                         namelen += count_digits(rec->ino) + 1;
2706                         ret = btrfs_add_link(trans, root, rec->ino,
2707                                              lost_found_ino, namebuf,
2708                                              namelen, type, NULL, 1);
2709                 }
2710                 if (ret < 0) {
2711                         fprintf(stderr,
2712                                 "Failed to link the inode %llu to %s dir: %s\n",
2713                                 rec->ino, dir_name, strerror(-ret));
2714                         goto out;
2715                 }
2716                 /*
2717                  * Just increase the found_link, don't actually add the
2718                  * backref. This will make things easier and this inode
2719                  * record will be freed after the repair is done.
2720                  * So fsck will not report problem about this inode.
2721                  */
2722                 rec->found_link++;
2723                 printf("Moving file '%.*s' to '%s' dir since it has no valid backref\n",
2724                        namelen, namebuf, dir_name);
2725         }
2726         printf("Fixed the nlink of inode %llu\n", rec->ino);
2727 out:
2728         /*
2729          * Clear the flag anyway, or we will loop forever for the same inode
2730          * as it will not be removed from the bad inode list and the dead loop
2731          * happens.
2732          */
2733         rec->errors &= ~I_ERR_LINK_COUNT_WRONG;
2734         btrfs_release_path(path);
2735         return ret;
2736 }
2737
2738 /*
2739  * Check if there is any normal(reg or prealloc) file extent for given
2740  * ino.
2741  * This is used to determine the file type when neither its dir_index/item or
2742  * inode_item exists.
2743  *
2744  * This will *NOT* report error, if any error happens, just consider it does
2745  * not have any normal file extent.
2746  */
2747 static int find_normal_file_extent(struct btrfs_root *root, u64 ino)
2748 {
2749         struct btrfs_path *path;
2750         struct btrfs_key key;
2751         struct btrfs_key found_key;
2752         struct btrfs_file_extent_item *fi;
2753         u8 type;
2754         int ret = 0;
2755
2756         path = btrfs_alloc_path();
2757         if (!path)
2758                 goto out;
2759         key.objectid = ino;
2760         key.type = BTRFS_EXTENT_DATA_KEY;
2761         key.offset = 0;
2762
2763         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2764         if (ret < 0) {
2765                 ret = 0;
2766                 goto out;
2767         }
2768         if (ret && path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2769                 ret = btrfs_next_leaf(root, path);
2770                 if (ret) {
2771                         ret = 0;
2772                         goto out;
2773                 }
2774         }
2775         while (1) {
2776                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2777                                       path->slots[0]);
2778                 if (found_key.objectid != ino ||
2779                     found_key.type != BTRFS_EXTENT_DATA_KEY)
2780                         break;
2781                 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
2782                                     struct btrfs_file_extent_item);
2783                 type = btrfs_file_extent_type(path->nodes[0], fi);
2784                 if (type != BTRFS_FILE_EXTENT_INLINE) {
2785                         ret = 1;
2786                         goto out;
2787                 }
2788         }
2789 out:
2790         btrfs_free_path(path);
2791         return ret;
2792 }
2793
2794 static u32 btrfs_type_to_imode(u8 type)
2795 {
2796         static u32 imode_by_btrfs_type[] = {
2797                 [BTRFS_FT_REG_FILE]     = S_IFREG,
2798                 [BTRFS_FT_DIR]          = S_IFDIR,
2799                 [BTRFS_FT_CHRDEV]       = S_IFCHR,
2800                 [BTRFS_FT_BLKDEV]       = S_IFBLK,
2801                 [BTRFS_FT_FIFO]         = S_IFIFO,
2802                 [BTRFS_FT_SOCK]         = S_IFSOCK,
2803                 [BTRFS_FT_SYMLINK]      = S_IFLNK,
2804         };
2805
2806         return imode_by_btrfs_type[(type)];
2807 }
2808
2809 static int repair_inode_no_item(struct btrfs_trans_handle *trans,
2810                                 struct btrfs_root *root,
2811                                 struct btrfs_path *path,
2812                                 struct inode_record *rec)
2813 {
2814         u8 filetype;
2815         u32 mode = 0700;
2816         int type_recovered = 0;
2817         int ret = 0;
2818
2819         printf("Trying to rebuild inode:%llu\n", rec->ino);
2820
2821         type_recovered = !find_file_type(rec, &filetype);
2822
2823         /*
2824          * Try to determine inode type if type not found.
2825          *
2826          * For found regular file extent, it must be FILE.
2827          * For found dir_item/index, it must be DIR.
2828          *
2829          * For undetermined one, use FILE as fallback.
2830          *
2831          * TODO:
2832          * 1. If found backref(inode_index/item is already handled) to it,
2833          *    it must be DIR.
2834          *    Need new inode-inode ref structure to allow search for that.
2835          */
2836         if (!type_recovered) {
2837                 if (rec->found_file_extent &&
2838                     find_normal_file_extent(root, rec->ino)) {
2839                         type_recovered = 1;
2840                         filetype = BTRFS_FT_REG_FILE;
2841                 } else if (rec->found_dir_item) {
2842                         type_recovered = 1;
2843                         filetype = BTRFS_FT_DIR;
2844                 } else if (!list_empty(&rec->orphan_extents)) {
2845                         type_recovered = 1;
2846                         filetype = BTRFS_FT_REG_FILE;
2847                 } else{
2848                         printf("Can't determine the filetype for inode %llu, assume it is a normal file\n",
2849                                rec->ino);
2850                         type_recovered = 1;
2851                         filetype = BTRFS_FT_REG_FILE;
2852                 }
2853         }
2854
2855         ret = btrfs_new_inode(trans, root, rec->ino,
2856                               mode | btrfs_type_to_imode(filetype));
2857         if (ret < 0)
2858                 goto out;
2859
2860         /*
2861          * Here inode rebuild is done, we only rebuild the inode item,
2862          * don't repair the nlink(like move to lost+found).
2863          * That is the job of nlink repair.
2864          *
2865          * We just fill the record and return
2866          */
2867         rec->found_dir_item = 1;
2868         rec->imode = mode | btrfs_type_to_imode(filetype);
2869         rec->nlink = 0;
2870         rec->errors &= ~I_ERR_NO_INODE_ITEM;
2871         /* Ensure the inode_nlinks repair function will be called */
2872         rec->errors |= I_ERR_LINK_COUNT_WRONG;
2873 out:
2874         return ret;
2875 }
2876
2877 static int repair_inode_orphan_extent(struct btrfs_trans_handle *trans,
2878                                       struct btrfs_root *root,
2879                                       struct btrfs_path *path,
2880                                       struct inode_record *rec)
2881 {
2882         struct orphan_data_extent *orphan;
2883         struct orphan_data_extent *tmp;
2884         int ret = 0;
2885
2886         list_for_each_entry_safe(orphan, tmp, &rec->orphan_extents, list) {
2887                 /*
2888                  * Check for conflicting file extents
2889                  *
2890                  * Here we don't know whether the extents is compressed or not,
2891                  * so we can only assume it not compressed nor data offset,
2892                  * and use its disk_len as extent length.
2893                  */
2894                 ret = btrfs_get_extent(NULL, root, path, orphan->objectid,
2895                                        orphan->offset, orphan->disk_len, 0);
2896                 btrfs_release_path(path);
2897                 if (ret < 0)
2898                         goto out;
2899                 if (!ret) {
2900                         fprintf(stderr,
2901                                 "orphan extent (%llu, %llu) conflicts, delete the orphan\n",
2902                                 orphan->disk_bytenr, orphan->disk_len);
2903                         ret = btrfs_free_extent(trans,
2904                                         root->fs_info->extent_root,
2905                                         orphan->disk_bytenr, orphan->disk_len,
2906                                         0, root->objectid, orphan->objectid,
2907                                         orphan->offset);
2908                         if (ret < 0)
2909                                 goto out;
2910                 }
2911                 ret = btrfs_insert_file_extent(trans, root, orphan->objectid,
2912                                 orphan->offset, orphan->disk_bytenr,
2913                                 orphan->disk_len, orphan->disk_len);
2914                 if (ret < 0)
2915                         goto out;
2916
2917                 /* Update file size info */
2918                 rec->found_size += orphan->disk_len;
2919                 if (rec->found_size == rec->nbytes)
2920                         rec->errors &= ~I_ERR_FILE_NBYTES_WRONG;
2921
2922                 /* Update the file extent hole info too */
2923                 ret = del_file_extent_hole(&rec->holes, orphan->offset,
2924                                            orphan->disk_len);
2925                 if (ret < 0)
2926                         goto out;
2927                 if (RB_EMPTY_ROOT(&rec->holes))
2928                         rec->errors &= ~I_ERR_FILE_EXTENT_DISCOUNT;
2929
2930                 list_del(&orphan->list);
2931                 free(orphan);
2932         }
2933         rec->errors &= ~I_ERR_FILE_EXTENT_ORPHAN;
2934 out:
2935         return ret;
2936 }
2937
2938 static int repair_inode_discount_extent(struct btrfs_trans_handle *trans,
2939                                         struct btrfs_root *root,
2940                                         struct btrfs_path *path,
2941                                         struct inode_record *rec)
2942 {
2943         struct rb_node *node;
2944         struct file_extent_hole *hole;
2945         int found = 0;
2946         int ret = 0;
2947
2948         node = rb_first(&rec->holes);
2949
2950         while (node) {
2951                 found = 1;
2952                 hole = rb_entry(node, struct file_extent_hole, node);
2953                 ret = btrfs_punch_hole(trans, root, rec->ino,
2954                                        hole->start, hole->len);
2955                 if (ret < 0)
2956                         goto out;
2957                 ret = del_file_extent_hole(&rec->holes, hole->start,
2958                                            hole->len);
2959                 if (ret < 0)
2960                         goto out;
2961                 if (RB_EMPTY_ROOT(&rec->holes))
2962                         rec->errors &= ~I_ERR_FILE_EXTENT_DISCOUNT;
2963                 node = rb_first(&rec->holes);
2964         }
2965         /* special case for a file losing all its file extent */
2966         if (!found) {
2967                 ret = btrfs_punch_hole(trans, root, rec->ino, 0,
2968                                        round_up(rec->isize, root->sectorsize));
2969                 if (ret < 0)
2970                         goto out;
2971         }
2972         printf("Fixed discount file extents for inode: %llu in root: %llu\n",
2973                rec->ino, root->objectid);
2974 out:
2975         return ret;
2976 }
2977
2978 static int try_repair_inode(struct btrfs_root *root, struct inode_record *rec)
2979 {
2980         struct btrfs_trans_handle *trans;
2981         struct btrfs_path *path;
2982         int ret = 0;
2983
2984         if (!(rec->errors & (I_ERR_DIR_ISIZE_WRONG |
2985                              I_ERR_NO_ORPHAN_ITEM |
2986                              I_ERR_LINK_COUNT_WRONG |
2987                              I_ERR_NO_INODE_ITEM |
2988                              I_ERR_FILE_EXTENT_ORPHAN |
2989                              I_ERR_FILE_EXTENT_DISCOUNT|
2990                              I_ERR_FILE_NBYTES_WRONG)))
2991                 return rec->errors;
2992
2993         path = btrfs_alloc_path();
2994         if (!path)
2995                 return -ENOMEM;
2996
2997         /*
2998          * For nlink repair, it may create a dir and add link, so
2999          * 2 for parent(256)'s dir_index and dir_item
3000          * 2 for lost+found dir's inode_item and inode_ref
3001          * 1 for the new inode_ref of the file
3002          * 2 for lost+found dir's dir_index and dir_item for the file
3003          */
3004         trans = btrfs_start_transaction(root, 7);
3005         if (IS_ERR(trans)) {
3006                 btrfs_free_path(path);
3007                 return PTR_ERR(trans);
3008         }
3009
3010         if (rec->errors & I_ERR_NO_INODE_ITEM)
3011                 ret = repair_inode_no_item(trans, root, path, rec);
3012         if (!ret && rec->errors & I_ERR_FILE_EXTENT_ORPHAN)
3013                 ret = repair_inode_orphan_extent(trans, root, path, rec);
3014         if (!ret && rec->errors & I_ERR_FILE_EXTENT_DISCOUNT)
3015                 ret = repair_inode_discount_extent(trans, root, path, rec);
3016         if (!ret && rec->errors & I_ERR_DIR_ISIZE_WRONG)
3017                 ret = repair_inode_isize(trans, root, path, rec);
3018         if (!ret && rec->errors & I_ERR_NO_ORPHAN_ITEM)
3019                 ret = repair_inode_orphan_item(trans, root, path, rec);
3020         if (!ret && rec->errors & I_ERR_LINK_COUNT_WRONG)
3021                 ret = repair_inode_nlinks(trans, root, path, rec);
3022         if (!ret && rec->errors & I_ERR_FILE_NBYTES_WRONG)
3023                 ret = repair_inode_nbytes(trans, root, path, rec);
3024         btrfs_commit_transaction(trans, root);
3025         btrfs_free_path(path);
3026         return ret;
3027 }
3028
3029 static int check_inode_recs(struct btrfs_root *root,
3030                             struct cache_tree *inode_cache)
3031 {
3032         struct cache_extent *cache;
3033         struct ptr_node *node;
3034         struct inode_record *rec;
3035         struct inode_backref *backref;
3036         int stage = 0;
3037         int ret = 0;
3038         int err = 0;
3039         u64 error = 0;
3040         u64 root_dirid = btrfs_root_dirid(&root->root_item);
3041
3042         if (btrfs_root_refs(&root->root_item) == 0) {
3043                 if (!cache_tree_empty(inode_cache))
3044                         fprintf(stderr, "warning line %d\n", __LINE__);
3045                 return 0;
3046         }
3047
3048         /*
3049          * We need to record the highest inode number for later 'lost+found'
3050          * dir creation.
3051          * We must select an ino not used/referred by any existing inode, or
3052          * 'lost+found' ino may be a missing ino in a corrupted leaf,
3053          * this may cause 'lost+found' dir has wrong nlinks.
3054          */
3055         cache = last_cache_extent(inode_cache);
3056         if (cache) {
3057                 node = container_of(cache, struct ptr_node, cache);
3058                 rec = node->data;
3059                 if (rec->ino > root->highest_inode)
3060                         root->highest_inode = rec->ino;
3061         }
3062
3063         /*
3064          * We need to repair backrefs first because we could change some of the
3065          * errors in the inode recs.
3066          *
3067          * We also need to go through and delete invalid backrefs first and then
3068          * add the correct ones second.  We do this because we may get EEXIST
3069          * when adding back the correct index because we hadn't yet deleted the
3070          * invalid index.
3071          *
3072          * For example, if we were missing a dir index then the directories
3073          * isize would be wrong, so if we fixed the isize to what we thought it
3074          * would be and then fixed the backref we'd still have a invalid fs, so
3075          * we need to add back the dir index and then check to see if the isize
3076          * is still wrong.
3077          */
3078         while (stage < 3) {
3079                 stage++;
3080                 if (stage == 3 && !err)
3081                         break;
3082
3083                 cache = search_cache_extent(inode_cache, 0);
3084                 while (repair && cache) {
3085                         node = container_of(cache, struct ptr_node, cache);
3086                         rec = node->data;
3087                         cache = next_cache_extent(cache);
3088
3089                         /* Need to free everything up and rescan */
3090                         if (stage == 3) {
3091                                 remove_cache_extent(inode_cache, &node->cache);
3092                                 free(node);
3093                                 free_inode_rec(rec);
3094                                 continue;
3095                         }
3096
3097                         if (list_empty(&rec->backrefs))
3098                                 continue;
3099
3100                         ret = repair_inode_backrefs(root, rec, inode_cache,
3101                                                     stage == 1);
3102                         if (ret < 0) {
3103                                 err = ret;
3104                                 stage = 2;
3105                                 break;
3106                         } if (ret > 0) {
3107                                 err = -EAGAIN;
3108                         }
3109                 }
3110         }
3111         if (err)
3112                 return err;
3113
3114         rec = get_inode_rec(inode_cache, root_dirid, 0);
3115         BUG_ON(IS_ERR(rec));
3116         if (rec) {
3117                 ret = check_root_dir(rec);
3118                 if (ret) {
3119                         fprintf(stderr, "root %llu root dir %llu error\n",
3120                                 (unsigned long long)root->root_key.objectid,
3121                                 (unsigned long long)root_dirid);
3122                         print_inode_error(root, rec);
3123                         error++;
3124                 }
3125         } else {
3126                 if (repair) {
3127                         struct btrfs_trans_handle *trans;
3128
3129                         trans = btrfs_start_transaction(root, 1);
3130                         if (IS_ERR(trans)) {
3131                                 err = PTR_ERR(trans);
3132                                 return err;
3133                         }
3134
3135                         fprintf(stderr,
3136                                 "root %llu missing its root dir, recreating\n",
3137                                 (unsigned long long)root->objectid);
3138
3139                         ret = btrfs_make_root_dir(trans, root, root_dirid);
3140                         BUG_ON(ret);
3141
3142                         btrfs_commit_transaction(trans, root);
3143                         return -EAGAIN;
3144                 }
3145
3146                 fprintf(stderr, "root %llu root dir %llu not found\n",
3147                         (unsigned long long)root->root_key.objectid,
3148                         (unsigned long long)root_dirid);
3149         }
3150
3151         while (1) {
3152                 cache = search_cache_extent(inode_cache, 0);
3153                 if (!cache)
3154                         break;
3155                 node = container_of(cache, struct ptr_node, cache);
3156                 rec = node->data;
3157                 remove_cache_extent(inode_cache, &node->cache);
3158                 free(node);
3159                 if (rec->ino == root_dirid ||
3160                     rec->ino == BTRFS_ORPHAN_OBJECTID) {
3161                         free_inode_rec(rec);
3162                         continue;
3163                 }
3164
3165                 if (rec->errors & I_ERR_NO_ORPHAN_ITEM) {
3166                         ret = check_orphan_item(root, rec->ino);
3167                         if (ret == 0)
3168                                 rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
3169                         if (can_free_inode_rec(rec)) {
3170                                 free_inode_rec(rec);
3171                                 continue;
3172                         }
3173                 }
3174
3175                 if (!rec->found_inode_item)
3176                         rec->errors |= I_ERR_NO_INODE_ITEM;
3177                 if (rec->found_link != rec->nlink)
3178                         rec->errors |= I_ERR_LINK_COUNT_WRONG;
3179                 if (repair) {
3180                         ret = try_repair_inode(root, rec);
3181                         if (ret == 0 && can_free_inode_rec(rec)) {
3182                                 free_inode_rec(rec);
3183                                 continue;
3184                         }
3185                         ret = 0;
3186                 }
3187
3188                 if (!(repair && ret == 0))
3189                         error++;
3190                 print_inode_error(root, rec);
3191                 list_for_each_entry(backref, &rec->backrefs, list) {
3192                         if (!backref->found_dir_item)
3193                                 backref->errors |= REF_ERR_NO_DIR_ITEM;
3194                         if (!backref->found_dir_index)
3195                                 backref->errors |= REF_ERR_NO_DIR_INDEX;
3196                         if (!backref->found_inode_ref)
3197                                 backref->errors |= REF_ERR_NO_INODE_REF;
3198                         fprintf(stderr, "\tunresolved ref dir %llu index %llu"
3199                                 " namelen %u name %s filetype %d errors %x",
3200                                 (unsigned long long)backref->dir,
3201                                 (unsigned long long)backref->index,
3202                                 backref->namelen, backref->name,
3203                                 backref->filetype, backref->errors);
3204                         print_ref_error(backref->errors);
3205                 }
3206                 free_inode_rec(rec);
3207         }
3208         return (error > 0) ? -1 : 0;
3209 }
3210
3211 static struct root_record *get_root_rec(struct cache_tree *root_cache,
3212                                         u64 objectid)
3213 {
3214         struct cache_extent *cache;
3215         struct root_record *rec = NULL;
3216         int ret;
3217
3218         cache = lookup_cache_extent(root_cache, objectid, 1);
3219         if (cache) {
3220                 rec = container_of(cache, struct root_record, cache);
3221         } else {
3222                 rec = calloc(1, sizeof(*rec));
3223                 if (!rec)
3224                         return ERR_PTR(-ENOMEM);
3225                 rec->objectid = objectid;
3226                 INIT_LIST_HEAD(&rec->backrefs);
3227                 rec->cache.start = objectid;
3228                 rec->cache.size = 1;
3229
3230                 ret = insert_cache_extent(root_cache, &rec->cache);
3231                 if (ret)
3232                         return ERR_PTR(-EEXIST);
3233         }
3234         return rec;
3235 }
3236
3237 static struct root_backref *get_root_backref(struct root_record *rec,
3238                                              u64 ref_root, u64 dir, u64 index,
3239                                              const char *name, int namelen)
3240 {
3241         struct root_backref *backref;
3242
3243         list_for_each_entry(backref, &rec->backrefs, list) {
3244                 if (backref->ref_root != ref_root || backref->dir != dir ||
3245                     backref->namelen != namelen)
3246                         continue;
3247                 if (memcmp(name, backref->name, namelen))
3248                         continue;
3249                 return backref;
3250         }
3251
3252         backref = calloc(1, sizeof(*backref) + namelen + 1);
3253         if (!backref)
3254                 return NULL;
3255         backref->ref_root = ref_root;
3256         backref->dir = dir;
3257         backref->index = index;
3258         backref->namelen = namelen;
3259         memcpy(backref->name, name, namelen);
3260         backref->name[namelen] = '\0';
3261         list_add_tail(&backref->list, &rec->backrefs);
3262         return backref;
3263 }
3264
3265 static void free_root_record(struct cache_extent *cache)
3266 {
3267         struct root_record *rec;
3268         struct root_backref *backref;
3269
3270         rec = container_of(cache, struct root_record, cache);
3271         while (!list_empty(&rec->backrefs)) {
3272                 backref = to_root_backref(rec->backrefs.next);
3273                 list_del(&backref->list);
3274                 free(backref);
3275         }
3276
3277         kfree(rec);
3278 }
3279
3280 FREE_EXTENT_CACHE_BASED_TREE(root_recs, free_root_record);
3281
3282 static int add_root_backref(struct cache_tree *root_cache,
3283                             u64 root_id, u64 ref_root, u64 dir, u64 index,
3284                             const char *name, int namelen,
3285                             int item_type, int errors)
3286 {
3287         struct root_record *rec;
3288         struct root_backref *backref;
3289
3290         rec = get_root_rec(root_cache, root_id);
3291         BUG_ON(IS_ERR(rec));
3292         backref = get_root_backref(rec, ref_root, dir, index, name, namelen);
3293         BUG_ON(!backref);
3294
3295         backref->errors |= errors;
3296
3297         if (item_type != BTRFS_DIR_ITEM_KEY) {
3298                 if (backref->found_dir_index || backref->found_back_ref ||
3299                     backref->found_forward_ref) {
3300                         if (backref->index != index)
3301                                 backref->errors |= REF_ERR_INDEX_UNMATCH;
3302                 } else {
3303                         backref->index = index;
3304                 }
3305         }
3306
3307         if (item_type == BTRFS_DIR_ITEM_KEY) {
3308                 if (backref->found_forward_ref)
3309                         rec->found_ref++;
3310                 backref->found_dir_item = 1;
3311         } else if (item_type == BTRFS_DIR_INDEX_KEY) {
3312                 backref->found_dir_index = 1;
3313         } else if (item_type == BTRFS_ROOT_REF_KEY) {
3314                 if (backref->found_forward_ref)
3315                         backref->errors |= REF_ERR_DUP_ROOT_REF;
3316                 else if (backref->found_dir_item)
3317                         rec->found_ref++;
3318                 backref->found_forward_ref = 1;
3319         } else if (item_type == BTRFS_ROOT_BACKREF_KEY) {
3320                 if (backref->found_back_ref)
3321                         backref->errors |= REF_ERR_DUP_ROOT_BACKREF;
3322                 backref->found_back_ref = 1;
3323         } else {
3324                 BUG_ON(1);
3325         }
3326
3327         if (backref->found_forward_ref && backref->found_dir_item)
3328                 backref->reachable = 1;
3329         return 0;
3330 }
3331
3332 static int merge_root_recs(struct btrfs_root *root,
3333                            struct cache_tree *src_cache,
3334                            struct cache_tree *dst_cache)
3335 {
3336         struct cache_extent *cache;
3337         struct ptr_node *node;
3338         struct inode_record *rec;
3339         struct inode_backref *backref;
3340         int ret = 0;
3341
3342         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
3343                 free_inode_recs_tree(src_cache);
3344                 return 0;
3345         }
3346
3347         while (1) {
3348                 cache = search_cache_extent(src_cache, 0);
3349                 if (!cache)
3350                         break;
3351                 node = container_of(cache, struct ptr_node, cache);
3352                 rec = node->data;
3353                 remove_cache_extent(src_cache, &node->cache);
3354                 free(node);
3355
3356                 ret = is_child_root(root, root->objectid, rec->ino);
3357                 if (ret < 0)
3358                         break;
3359                 else if (ret == 0)
3360                         goto skip;
3361
3362                 list_for_each_entry(backref, &rec->backrefs, list) {
3363                         BUG_ON(backref->found_inode_ref);
3364                         if (backref->found_dir_item)
3365                                 add_root_backref(dst_cache, rec->ino,
3366                                         root->root_key.objectid, backref->dir,
3367                                         backref->index, backref->name,
3368                                         backref->namelen, BTRFS_DIR_ITEM_KEY,
3369                                         backref->errors);
3370                         if (backref->found_dir_index)
3371                                 add_root_backref(dst_cache, rec->ino,
3372                                         root->root_key.objectid, backref->dir,
3373                                         backref->index, backref->name,
3374                                         backref->namelen, BTRFS_DIR_INDEX_KEY,
3375                                         backref->errors);
3376                 }
3377 skip:
3378                 free_inode_rec(rec);
3379         }
3380         if (ret < 0)
3381                 return ret;
3382         return 0;
3383 }
3384
3385 static int check_root_refs(struct btrfs_root *root,
3386                            struct cache_tree *root_cache)
3387 {
3388         struct root_record *rec;
3389         struct root_record *ref_root;
3390         struct root_backref *backref;
3391         struct cache_extent *cache;
3392         int loop = 1;
3393         int ret;
3394         int error;
3395         int errors = 0;
3396
3397         rec = get_root_rec(root_cache, BTRFS_FS_TREE_OBJECTID);
3398         BUG_ON(IS_ERR(rec));
3399         rec->found_ref = 1;
3400
3401         /* fixme: this can not detect circular references */
3402         while (loop) {
3403                 loop = 0;
3404                 cache = search_cache_extent(root_cache, 0);
3405                 while (1) {
3406                         if (!cache)
3407                                 break;
3408                         rec = container_of(cache, struct root_record, cache);
3409                         cache = next_cache_extent(cache);
3410
3411                         if (rec->found_ref == 0)
3412                                 continue;
3413
3414                         list_for_each_entry(backref, &rec->backrefs, list) {
3415                                 if (!backref->reachable)
3416                                         continue;
3417
3418                                 ref_root = get_root_rec(root_cache,
3419                                                         backref->ref_root);
3420                                 BUG_ON(IS_ERR(ref_root));
3421                                 if (ref_root->found_ref > 0)
3422                                         continue;
3423
3424                                 backref->reachable = 0;
3425                                 rec->found_ref--;
3426                                 if (rec->found_ref == 0)
3427                                         loop = 1;
3428                         }
3429                 }
3430         }
3431
3432         cache = search_cache_extent(root_cache, 0);
3433         while (1) {
3434                 if (!cache)
3435                         break;
3436                 rec = container_of(cache, struct root_record, cache);
3437                 cache = next_cache_extent(cache);
3438
3439                 if (rec->found_ref == 0 &&
3440                     rec->objectid >= BTRFS_FIRST_FREE_OBJECTID &&
3441                     rec->objectid <= BTRFS_LAST_FREE_OBJECTID) {
3442                         ret = check_orphan_item(root->fs_info->tree_root,
3443                                                 rec->objectid);
3444                         if (ret == 0)
3445                                 continue;
3446
3447                         /*
3448                          * If we don't have a root item then we likely just have
3449                          * a dir item in a snapshot for this root but no actual
3450                          * ref key or anything so it's meaningless.
3451                          */
3452                         if (!rec->found_root_item)
3453                                 continue;
3454                         errors++;
3455                         fprintf(stderr, "fs tree %llu not referenced\n",
3456                                 (unsigned long long)rec->objectid);
3457                 }
3458
3459                 error = 0;
3460                 if (rec->found_ref > 0 && !rec->found_root_item)
3461                         error = 1;
3462                 list_for_each_entry(backref, &rec->backrefs, list) {
3463                         if (!backref->found_dir_item)
3464                                 backref->errors |= REF_ERR_NO_DIR_ITEM;
3465                         if (!backref->found_dir_index)
3466                                 backref->errors |= REF_ERR_NO_DIR_INDEX;
3467                         if (!backref->found_back_ref)
3468                                 backref->errors |= REF_ERR_NO_ROOT_BACKREF;
3469                         if (!backref->found_forward_ref)
3470                                 backref->errors |= REF_ERR_NO_ROOT_REF;
3471                         if (backref->reachable && backref->errors)
3472                                 error = 1;
3473                 }
3474                 if (!error)
3475                         continue;
3476
3477                 errors++;
3478                 fprintf(stderr, "fs tree %llu refs %u %s\n",
3479                         (unsigned long long)rec->objectid, rec->found_ref,
3480                          rec->found_root_item ? "" : "not found");
3481
3482                 list_for_each_entry(backref, &rec->backrefs, list) {
3483                         if (!backref->reachable)
3484                                 continue;
3485                         if (!backref->errors && rec->found_root_item)
3486                                 continue;
3487                         fprintf(stderr, "\tunresolved ref root %llu dir %llu"
3488                                 " index %llu namelen %u name %s errors %x\n",
3489                                 (unsigned long long)backref->ref_root,
3490                                 (unsigned long long)backref->dir,
3491                                 (unsigned long long)backref->index,
3492                                 backref->namelen, backref->name,
3493                                 backref->errors);
3494                         print_ref_error(backref->errors);
3495                 }
3496         }
3497         return errors > 0 ? 1 : 0;
3498 }
3499
3500 static int process_root_ref(struct extent_buffer *eb, int slot,
3501                             struct btrfs_key *key,
3502                             struct cache_tree *root_cache)
3503 {
3504         u64 dirid;
3505         u64 index;
3506         u32 len;
3507         u32 name_len;
3508         struct btrfs_root_ref *ref;
3509         char namebuf[BTRFS_NAME_LEN];
3510         int error;
3511
3512         ref = btrfs_item_ptr(eb, slot, struct btrfs_root_ref);
3513
3514         dirid = btrfs_root_ref_dirid(eb, ref);
3515         index = btrfs_root_ref_sequence(eb, ref);
3516         name_len = btrfs_root_ref_name_len(eb, ref);
3517
3518         if (name_len <= BTRFS_NAME_LEN) {
3519                 len = name_len;
3520                 error = 0;
3521         } else {
3522                 len = BTRFS_NAME_LEN;
3523                 error = REF_ERR_NAME_TOO_LONG;
3524         }
3525         read_extent_buffer(eb, namebuf, (unsigned long)(ref + 1), len);
3526
3527         if (key->type == BTRFS_ROOT_REF_KEY) {
3528                 add_root_backref(root_cache, key->offset, key->objectid, dirid,
3529                                  index, namebuf, len, key->type, error);
3530         } else {
3531                 add_root_backref(root_cache, key->objectid, key->offset, dirid,
3532                                  index, namebuf, len, key->type, error);
3533         }
3534         return 0;
3535 }
3536
3537 static void free_corrupt_block(struct cache_extent *cache)
3538 {
3539         struct btrfs_corrupt_block *corrupt;
3540
3541         corrupt = container_of(cache, struct btrfs_corrupt_block, cache);
3542         free(corrupt);
3543 }
3544
3545 FREE_EXTENT_CACHE_BASED_TREE(corrupt_blocks, free_corrupt_block);
3546
3547 /*
3548  * Repair the btree of the given root.
3549  *
3550  * The fix is to remove the node key in corrupt_blocks cache_tree.
3551  * and rebalance the tree.
3552  * After the fix, the btree should be writeable.
3553  */
3554 static int repair_btree(struct btrfs_root *root,
3555                         struct cache_tree *corrupt_blocks)
3556 {
3557         struct btrfs_trans_handle *trans;
3558         struct btrfs_path *path;
3559         struct btrfs_corrupt_block *corrupt;
3560         struct cache_extent *cache;
3561         struct btrfs_key key;
3562         u64 offset;
3563         int level;
3564         int ret = 0;
3565
3566         if (cache_tree_empty(corrupt_blocks))
3567                 return 0;
3568
3569         path = btrfs_alloc_path();
3570         if (!path)
3571                 return -ENOMEM;
3572
3573         trans = btrfs_start_transaction(root, 1);
3574         if (IS_ERR(trans)) {
3575                 ret = PTR_ERR(trans);
3576                 fprintf(stderr, "Error starting transaction: %s\n",
3577                         strerror(-ret));
3578                 goto out_free_path;
3579         }
3580         cache = first_cache_extent(corrupt_blocks);
3581         while (cache) {
3582                 corrupt = container_of(cache, struct btrfs_corrupt_block,
3583                                        cache);
3584                 level = corrupt->level;
3585                 path->lowest_level = level;
3586                 key.objectid = corrupt->key.objectid;
3587                 key.type = corrupt->key.type;
3588                 key.offset = corrupt->key.offset;
3589
3590                 /*
3591                  * Here we don't want to do any tree balance, since it may
3592                  * cause a balance with corrupted brother leaf/node,
3593                  * so ins_len set to 0 here.
3594                  * Balance will be done after all corrupt node/leaf is deleted.
3595                  */
3596                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3597                 if (ret < 0)
3598                         goto out;
3599                 offset = btrfs_node_blockptr(path->nodes[level],
3600                                              path->slots[level]);
3601
3602                 /* Remove the ptr */
3603                 ret = btrfs_del_ptr(trans, root, path, level,
3604                                     path->slots[level]);
3605                 if (ret < 0)
3606                         goto out;
3607                 /*
3608                  * Remove the corresponding extent
3609                  * return value is not concerned.
3610                  */
3611                 btrfs_release_path(path);
3612                 ret = btrfs_free_extent(trans, root, offset, root->nodesize,
3613                                         0, root->root_key.objectid,
3614                                         level - 1, 0);
3615                 cache = next_cache_extent(cache);
3616         }
3617
3618         /* Balance the btree using btrfs_search_slot() */
3619         cache = first_cache_extent(corrupt_blocks);
3620         while (cache) {
3621                 corrupt = container_of(cache, struct btrfs_corrupt_block,
3622                                        cache);
3623                 memcpy(&key, &corrupt->key, sizeof(key));
3624                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3625                 if (ret < 0)
3626                         goto out;
3627                 /* return will always >0 since it won't find the item */
3628                 ret = 0;
3629                 btrfs_release_path(path);
3630                 cache = next_cache_extent(cache);
3631         }
3632 out:
3633         btrfs_commit_transaction(trans, root);
3634 out_free_path:
3635         btrfs_free_path(path);
3636         return ret;
3637 }
3638
3639 static int check_fs_root(struct btrfs_root *root,
3640                          struct cache_tree *root_cache,
3641                          struct walk_control *wc)
3642 {
3643         int ret = 0;
3644         int err = 0;
3645         int wret;
3646         int level;
3647         struct btrfs_path path;
3648         struct shared_node root_node;
3649         struct root_record *rec;
3650         struct btrfs_root_item *root_item = &root->root_item;
3651         struct cache_tree corrupt_blocks;
3652         struct orphan_data_extent *orphan;
3653         struct orphan_data_extent *tmp;
3654         enum btrfs_tree_block_status status;
3655         struct node_refs nrefs;
3656
3657         /*
3658          * Reuse the corrupt_block cache tree to record corrupted tree block
3659          *
3660          * Unlike the usage in extent tree check, here we do it in a per
3661          * fs/subvol tree base.
3662          */
3663         cache_tree_init(&corrupt_blocks);
3664         root->fs_info->corrupt_blocks = &corrupt_blocks;
3665
3666         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
3667                 rec = get_root_rec(root_cache, root->root_key.objectid);
3668                 BUG_ON(IS_ERR(rec));
3669                 if (btrfs_root_refs(root_item) > 0)
3670                         rec->found_root_item = 1;
3671         }
3672
3673         btrfs_init_path(&path);
3674         memset(&root_node, 0, sizeof(root_node));
3675         cache_tree_init(&root_node.root_cache);
3676         cache_tree_init(&root_node.inode_cache);
3677         memset(&nrefs, 0, sizeof(nrefs));
3678
3679         /* Move the orphan extent record to corresponding inode_record */
3680         list_for_each_entry_safe(orphan, tmp,
3681                                  &root->orphan_data_extents, list) {
3682                 struct inode_record *inode;
3683
3684                 inode = get_inode_rec(&root_node.inode_cache, orphan->objectid,
3685                                       1);
3686                 BUG_ON(IS_ERR(inode));
3687                 inode->errors |= I_ERR_FILE_EXTENT_ORPHAN;
3688                 list_move(&orphan->list, &inode->orphan_extents);
3689         }
3690
3691         level = btrfs_header_level(root->node);
3692         memset(wc->nodes, 0, sizeof(wc->nodes));
3693         wc->nodes[level] = &root_node;
3694         wc->active_node = level;
3695         wc->root_level = level;
3696
3697         /* We may not have checked the root block, lets do that now */
3698         if (btrfs_is_leaf(root->node))
3699                 status = btrfs_check_leaf(root, NULL, root->node);
3700         else
3701                 status = btrfs_check_node(root, NULL, root->node);
3702         if (status != BTRFS_TREE_BLOCK_CLEAN)
3703                 return -EIO;
3704
3705         if (btrfs_root_refs(root_item) > 0 ||
3706             btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3707                 path.nodes[level] = root->node;
3708                 extent_buffer_get(root->node);
3709                 path.slots[level] = 0;
3710         } else {
3711                 struct btrfs_key key;
3712                 struct btrfs_disk_key found_key;
3713
3714                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3715                 level = root_item->drop_level;
3716                 path.lowest_level = level;
3717                 wret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
3718                 if (wret < 0)
3719                         goto skip_walking;
3720                 btrfs_node_key(path.nodes[level], &found_key,
3721                                 path.slots[level]);
3722                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3723                                         sizeof(found_key)));
3724         }
3725
3726         while (1) {
3727                 wret = walk_down_tree(root, &path, wc, &level, &nrefs);
3728                 if (wret < 0)
3729                         ret = wret;
3730                 if (wret != 0)
3731                         break;
3732
3733                 wret = walk_up_tree(root, &path, wc, &level);
3734                 if (wret < 0)
3735                         ret = wret;
3736                 if (wret != 0)
3737                         break;
3738         }
3739 skip_walking:
3740         btrfs_release_path(&path);
3741
3742         if (!cache_tree_empty(&corrupt_blocks)) {
3743                 struct cache_extent *cache;
3744                 struct btrfs_corrupt_block *corrupt;
3745
3746                 printf("The following tree block(s) is corrupted in tree %llu:\n",
3747                        root->root_key.objectid);
3748                 cache = first_cache_extent(&corrupt_blocks);
3749                 while (cache) {
3750                         corrupt = container_of(cache,
3751                                                struct btrfs_corrupt_block,
3752                                                cache);
3753                         printf("\ttree block bytenr: %llu, level: %d, node key: (%llu, %u, %llu)\n",
3754                                cache->start, corrupt->level,
3755                                corrupt->key.objectid, corrupt->key.type,
3756                                corrupt->key.offset);
3757                         cache = next_cache_extent(cache);
3758                 }
3759                 if (repair) {
3760                         printf("Try to repair the btree for root %llu\n",
3761                                root->root_key.objectid);
3762                         ret = repair_btree(root, &corrupt_blocks);
3763                         if (ret < 0)
3764                                 fprintf(stderr, "Failed to repair btree: %s\n",
3765                                         strerror(-ret));
3766                         if (!ret)
3767                                 printf("Btree for root %llu is fixed\n",
3768                                        root->root_key.objectid);
3769                 }
3770         }
3771
3772         err = merge_root_recs(root, &root_node.root_cache, root_cache);
3773         if (err < 0)
3774                 ret = err;
3775
3776         if (root_node.current) {
3777                 root_node.current->checked = 1;
3778                 maybe_free_inode_rec(&root_node.inode_cache,
3779                                 root_node.current);
3780         }
3781
3782         err = check_inode_recs(root, &root_node.inode_cache);
3783         if (!ret)
3784                 ret = err;
3785
3786         free_corrupt_blocks_tree(&corrupt_blocks);
3787         root->fs_info->corrupt_blocks = NULL;
3788         free_orphan_data_extents(&root->orphan_data_extents);
3789         return ret;
3790 }
3791
3792 static int fs_root_objectid(u64 objectid)
3793 {
3794         if (objectid == BTRFS_TREE_RELOC_OBJECTID ||
3795             objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3796                 return 1;
3797         return is_fstree(objectid);
3798 }
3799
3800 static int check_fs_roots(struct btrfs_root *root,
3801                           struct cache_tree *root_cache)
3802 {
3803         struct btrfs_path path;
3804         struct btrfs_key key;
3805         struct walk_control wc;
3806         struct extent_buffer *leaf, *tree_node;
3807         struct btrfs_root *tmp_root;
3808         struct btrfs_root *tree_root = root->fs_info->tree_root;
3809         int ret;
3810         int err = 0;
3811
3812         if (ctx.progress_enabled) {
3813                 ctx.tp = TASK_FS_ROOTS;
3814                 task_start(ctx.info);
3815         }
3816
3817         /*
3818          * Just in case we made any changes to the extent tree that weren't
3819          * reflected into the free space cache yet.
3820          */
3821         if (repair)
3822                 reset_cached_block_groups(root->fs_info);
3823         memset(&wc, 0, sizeof(wc));
3824         cache_tree_init(&wc.shared);
3825         btrfs_init_path(&path);
3826
3827 again:
3828         key.offset = 0;
3829         key.objectid = 0;
3830         key.type = BTRFS_ROOT_ITEM_KEY;
3831         ret = btrfs_search_slot(NULL, tree_root, &key, &path, 0, 0);
3832         if (ret < 0) {
3833                 err = 1;
3834                 goto out;
3835         }
3836         tree_node = tree_root->node;
3837         while (1) {
3838                 if (tree_node != tree_root->node) {
3839                         free_root_recs_tree(root_cache);
3840                         btrfs_release_path(&path);
3841                         goto again;
3842                 }
3843                 leaf = path.nodes[0];
3844                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
3845                         ret = btrfs_next_leaf(tree_root, &path);
3846                         if (ret) {
3847                                 if (ret < 0)
3848                                         err = 1;
3849                                 break;
3850                         }
3851                         leaf = path.nodes[0];
3852                 }
3853                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
3854                 if (key.type == BTRFS_ROOT_ITEM_KEY &&
3855                     fs_root_objectid(key.objectid)) {
3856                         if (key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
3857                                 tmp_root = btrfs_read_fs_root_no_cache(
3858                                                 root->fs_info, &key);
3859                         } else {
3860                                 key.offset = (u64)-1;
3861                                 tmp_root = btrfs_read_fs_root(
3862                                                 root->fs_info, &key);
3863                         }
3864                         if (IS_ERR(tmp_root)) {
3865                                 err = 1;
3866                                 goto next;
3867                         }
3868                         ret = check_fs_root(tmp_root, root_cache, &wc);
3869                         if (ret == -EAGAIN) {
3870                                 free_root_recs_tree(root_cache);
3871                                 btrfs_release_path(&path);
3872                                 goto again;
3873                         }
3874                         if (ret)
3875                                 err = 1;
3876                         if (key.objectid == BTRFS_TREE_RELOC_OBJECTID)
3877                                 btrfs_free_fs_root(tmp_root);
3878                 } else if (key.type == BTRFS_ROOT_REF_KEY ||
3879                            key.type == BTRFS_ROOT_BACKREF_KEY) {
3880                         process_root_ref(leaf, path.slots[0], &key,
3881                                          root_cache);
3882                 }
3883 next:
3884                 path.slots[0]++;
3885         }
3886 out:
3887         btrfs_release_path(&path);
3888         if (err)
3889                 free_extent_cache_tree(&wc.shared);
3890         if (!cache_tree_empty(&wc.shared))
3891                 fprintf(stderr, "warning line %d\n", __LINE__);
3892
3893         task_stop(ctx.info);
3894
3895         return err;
3896 }
3897
3898 static int all_backpointers_checked(struct extent_record *rec, int print_errs)
3899 {
3900         struct rb_node *n;
3901         struct extent_backref *back;
3902         struct tree_backref *tback;
3903         struct data_backref *dback;
3904         u64 found = 0;
3905         int err = 0;
3906
3907         for (n = rb_first(&rec->backref_tree); n; n = rb_next(n)) {
3908                 back = rb_node_to_extent_backref(n);
3909                 if (!back->found_extent_tree) {
3910                         err = 1;
3911                         if (!print_errs)
3912                                 goto out;
3913                         if (back->is_data) {
3914                                 dback = to_data_backref(back);
3915                                 fprintf(stderr, "Backref %llu %s %llu"
3916                                         " owner %llu offset %llu num_refs %lu"
3917                                         " not found in extent tree\n",
3918                                         (unsigned long long)rec->start,
3919                                         back->full_backref ?
3920                                         "parent" : "root",
3921                                         back->full_backref ?
3922                                         (unsigned long long)dback->parent:
3923                                         (unsigned long long)dback->root,
3924                                         (unsigned long long)dback->owner,
3925                                         (unsigned long long)dback->offset,
3926                                         (unsigned long)dback->num_refs);
3927                         } else {
3928                                 tback = to_tree_backref(back);
3929                                 fprintf(stderr, "Backref %llu parent %llu"
3930                                         " root %llu not found in extent tree\n",
3931                                         (unsigned long long)rec->start,
3932                                         (unsigned long long)tback->parent,
3933                                         (unsigned long long)tback->root);
3934                         }
3935                 }
3936                 if (!back->is_data && !back->found_ref) {
3937                         err = 1;
3938                         if (!print_errs)
3939                                 goto out;
3940                         tback = to_tree_backref(back);
3941                         fprintf(stderr, "Backref %llu %s %llu not referenced back %p\n",
3942                                 (unsigned long long)rec->start,
3943                                 back->full_backref ? "parent" : "root",
3944                                 back->full_backref ?
3945                                 (unsigned long long)tback->parent :
3946                                 (unsigned long long)tback->root, back);
3947                 }
3948                 if (back->is_data) {
3949                         dback = to_data_backref(back);
3950                         if (dback->found_ref != dback->num_refs) {
3951                                 err = 1;
3952                                 if (!print_errs)
3953                                         goto out;
3954                                 fprintf(stderr, "Incorrect local backref count"
3955                                         " on %llu %s %llu owner %llu"
3956                                         " offset %llu found %u wanted %u back %p\n",
3957                                         (unsigned long long)rec->start,
3958                                         back->full_backref ?
3959                                         "parent" : "root",
3960                                         back->full_backref ?
3961                                         (unsigned long long)dback->parent:
3962                                         (unsigned long long)dback->root,
3963                                         (unsigned long long)dback->owner,
3964                                         (unsigned long long)dback->offset,
3965                                         dback->found_ref, dback->num_refs, back);
3966                         }
3967                         if (dback->disk_bytenr != rec->start) {
3968                                 err = 1;
3969                                 if (!print_errs)
3970                                         goto out;
3971                                 fprintf(stderr, "Backref disk bytenr does not"
3972                                         " match extent record, bytenr=%llu, "
3973                                         "ref bytenr=%llu\n",
3974                                         (unsigned long long)rec->start,
3975                                         (unsigned long long)dback->disk_bytenr);
3976                         }
3977
3978                         if (dback->bytes != rec->nr) {
3979                                 err = 1;
3980                                 if (!print_errs)
3981                                         goto out;
3982                                 fprintf(stderr, "Backref bytes do not match "
3983                                         "extent backref, bytenr=%llu, ref "
3984                                         "bytes=%llu, backref bytes=%llu\n",
3985                                         (unsigned long long)rec->start,
3986                                         (unsigned long long)rec->nr,
3987                                         (unsigned long long)dback->bytes);
3988                         }
3989                 }
3990                 if (!back->is_data) {
3991                         found += 1;
3992                 } else {
3993                         dback = to_data_backref(back);
3994                         found += dback->found_ref;
3995                 }
3996         }
3997         if (found != rec->refs) {
3998                 err = 1;
3999                 if (!print_errs)
4000                         goto out;
4001                 fprintf(stderr, "Incorrect global backref count "
4002                         "on %llu found %llu wanted %llu\n",
4003                         (unsigned long long)rec->start,
4004                         (unsigned long long)found,
4005                         (unsigned long long)rec->refs);
4006         }
4007 out:
4008         return err;
4009 }
4010
4011 static void __free_one_backref(struct rb_node *node)
4012 {
4013         struct extent_backref *back = rb_node_to_extent_backref(node);
4014
4015         free(back);
4016 }
4017
4018 static void free_all_extent_backrefs(struct extent_record *rec)
4019 {
4020         rb_free_nodes(&rec->backref_tree, __free_one_backref);
4021 }
4022
4023 static void free_extent_record_cache(struct btrfs_fs_info *fs_info,
4024                                      struct cache_tree *extent_cache)
4025 {
4026         struct cache_extent *cache;
4027         struct extent_record *rec;
4028
4029         while (1) {
4030                 cache = first_cache_extent(extent_cache);
4031                 if (!cache)
4032                         break;
4033                 rec = container_of(cache, struct extent_record, cache);
4034                 remove_cache_extent(extent_cache, cache);
4035                 free_all_extent_backrefs(rec);
4036                 free(rec);
4037         }
4038 }
4039
4040 static int maybe_free_extent_rec(struct cache_tree *extent_cache,
4041                                  struct extent_record *rec)
4042 {
4043         if (rec->content_checked && rec->owner_ref_checked &&
4044             rec->extent_item_refs == rec->refs && rec->refs > 0 &&
4045             rec->num_duplicates == 0 && !all_backpointers_checked(rec, 0) &&
4046             !rec->bad_full_backref && !rec->crossing_stripes &&
4047             !rec->wrong_chunk_type) {
4048                 remove_cache_extent(extent_cache, &rec->cache);
4049                 free_all_extent_backrefs(rec);
4050                 list_del_init(&rec->list);
4051                 free(rec);
4052         }
4053         return 0;
4054 }
4055
4056 static int check_owner_ref(struct btrfs_root *root,
4057                             struct extent_record *rec,
4058                             struct extent_buffer *buf)
4059 {
4060         struct extent_backref *node, *tmp;
4061         struct tree_backref *back;
4062         struct btrfs_root *ref_root;
4063         struct btrfs_key key;
4064         struct btrfs_path path;
4065         struct extent_buffer *parent;
4066         int level;
4067         int found = 0;
4068         int ret;
4069
4070         rbtree_postorder_for_each_entry_safe(node, tmp,
4071                                              &rec->backref_tree, node) {
4072                 if (node->is_data)
4073                         continue;
4074                 if (!node->found_ref)
4075                         continue;
4076                 if (node->full_backref)
4077                         continue;
4078                 back = to_tree_backref(node);
4079                 if (btrfs_header_owner(buf) == back->root)
4080                         return 0;
4081         }
4082         BUG_ON(rec->is_root);
4083
4084         /* try to find the block by search corresponding fs tree */
4085         key.objectid = btrfs_header_owner(buf);
4086         key.type = BTRFS_ROOT_ITEM_KEY;
4087         key.offset = (u64)-1;
4088
4089         ref_root = btrfs_read_fs_root(root->fs_info, &key);
4090         if (IS_ERR(ref_root))
4091                 return 1;
4092
4093         level = btrfs_header_level(buf);
4094         if (level == 0)
4095                 btrfs_item_key_to_cpu(buf, &key, 0);
4096         else
4097                 btrfs_node_key_to_cpu(buf, &key, 0);
4098
4099         btrfs_init_path(&path);
4100         path.lowest_level = level + 1;
4101         ret = btrfs_search_slot(NULL, ref_root, &key, &path, 0, 0);
4102         if (ret < 0)
4103                 return 0;
4104
4105         parent = path.nodes[level + 1];
4106         if (parent && buf->start == btrfs_node_blockptr(parent,
4107                                                         path.slots[level + 1]))
4108                 found = 1;
4109
4110         btrfs_release_path(&path);
4111         return found ? 0 : 1;
4112 }
4113
4114 static int is_extent_tree_record(struct extent_record *rec)
4115 {
4116         struct extent_backref *ref, *tmp;
4117         struct tree_backref *back;
4118         int is_extent = 0;
4119
4120         rbtree_postorder_for_each_entry_safe(ref, tmp,
4121                                              &rec->backref_tree, node) {
4122                 if (ref->is_data)
4123                         return 0;
4124                 back = to_tree_backref(ref);
4125                 if (ref->full_backref)
4126                         return 0;
4127                 if (back->root == BTRFS_EXTENT_TREE_OBJECTID)
4128                         is_extent = 1;
4129         }
4130         return is_extent;
4131 }
4132
4133
4134 static int record_bad_block_io(struct btrfs_fs_info *info,
4135                                struct cache_tree *extent_cache,
4136                                u64 start, u64 len)
4137 {
4138         struct extent_record *rec;
4139         struct cache_extent *cache;
4140         struct btrfs_key key;
4141
4142         cache = lookup_cache_extent(extent_cache, start, len);
4143         if (!cache)
4144                 return 0;
4145
4146         rec = container_of(cache, struct extent_record, cache);
4147         if (!is_extent_tree_record(rec))
4148                 return 0;
4149
4150         btrfs_disk_key_to_cpu(&key, &rec->parent_key);
4151         return btrfs_add_corrupt_extent_record(info, &key, start, len, 0);
4152 }
4153
4154 static int swap_values(struct btrfs_root *root, struct btrfs_path *path,
4155                        struct extent_buffer *buf, int slot)
4156 {
4157         if (btrfs_header_level(buf)) {
4158                 struct btrfs_key_ptr ptr1, ptr2;
4159
4160                 read_extent_buffer(buf, &ptr1, btrfs_node_key_ptr_offset(slot),
4161                                    sizeof(struct btrfs_key_ptr));
4162                 read_extent_buffer(buf, &ptr2,
4163                                    btrfs_node_key_ptr_offset(slot + 1),
4164                                    sizeof(struct btrfs_key_ptr));
4165                 write_extent_buffer(buf, &ptr1,
4166                                     btrfs_node_key_ptr_offset(slot + 1),
4167                                     sizeof(struct btrfs_key_ptr));
4168                 write_extent_buffer(buf, &ptr2,
4169                                     btrfs_node_key_ptr_offset(slot),
4170                                     sizeof(struct btrfs_key_ptr));
4171                 if (slot == 0) {
4172                         struct btrfs_disk_key key;
4173                         btrfs_node_key(buf, &key, 0);
4174                         btrfs_fixup_low_keys(root, path, &key,
4175                                              btrfs_header_level(buf) + 1);
4176                 }
4177         } else {
4178                 struct btrfs_item *item1, *item2;
4179                 struct btrfs_key k1, k2;
4180                 char *item1_data, *item2_data;
4181                 u32 item1_offset, item2_offset, item1_size, item2_size;
4182
4183                 item1 = btrfs_item_nr(slot);
4184                 item2 = btrfs_item_nr(slot + 1);
4185                 btrfs_item_key_to_cpu(buf, &k1, slot);
4186                 btrfs_item_key_to_cpu(buf, &k2, slot + 1);
4187                 item1_offset = btrfs_item_offset(buf, item1);
4188                 item2_offset = btrfs_item_offset(buf, item2);
4189                 item1_size = btrfs_item_size(buf, item1);
4190                 item2_size = btrfs_item_size(buf, item2);
4191
4192                 item1_data = malloc(item1_size);
4193                 if (!item1_data)
4194                         return -ENOMEM;
4195                 item2_data = malloc(item2_size);
4196                 if (!item2_data) {
4197                         free(item1_data);
4198                         return -ENOMEM;
4199                 }
4200
4201                 read_extent_buffer(buf, item1_data, item1_offset, item1_size);
4202                 read_extent_buffer(buf, item2_data, item2_offset, item2_size);
4203
4204                 write_extent_buffer(buf, item1_data, item2_offset, item2_size);
4205                 write_extent_buffer(buf, item2_data, item1_offset, item1_size);
4206                 free(item1_data);
4207                 free(item2_data);
4208
4209                 btrfs_set_item_offset(buf, item1, item2_offset);
4210                 btrfs_set_item_offset(buf, item2, item1_offset);
4211                 btrfs_set_item_size(buf, item1, item2_size);
4212                 btrfs_set_item_size(buf, item2, item1_size);
4213
4214                 path->slots[0] = slot;
4215                 btrfs_set_item_key_unsafe(root, path, &k2);
4216                 path->slots[0] = slot + 1;
4217                 btrfs_set_item_key_unsafe(root, path, &k1);
4218         }
4219         return 0;
4220 }
4221
4222 static int fix_key_order(struct btrfs_trans_handle *trans,
4223                          struct btrfs_root *root,
4224                          struct btrfs_path *path)
4225 {
4226         struct extent_buffer *buf;
4227         struct btrfs_key k1, k2;
4228         int i;
4229         int level = path->lowest_level;
4230         int ret = -EIO;
4231
4232         buf = path->nodes[level];
4233         for (i = 0; i < btrfs_header_nritems(buf) - 1; i++) {
4234                 if (level) {
4235                         btrfs_node_key_to_cpu(buf, &k1, i);
4236                         btrfs_node_key_to_cpu(buf, &k2, i + 1);
4237                 } else {
4238                         btrfs_item_key_to_cpu(buf, &k1, i);
4239                         btrfs_item_key_to_cpu(buf, &k2, i + 1);
4240                 }
4241                 if (btrfs_comp_cpu_keys(&k1, &k2) < 0)
4242                         continue;
4243                 ret = swap_values(root, path, buf, i);
4244                 if (ret)
4245                         break;
4246                 btrfs_mark_buffer_dirty(buf);
4247                 i = 0;
4248         }
4249         return ret;
4250 }
4251
4252 static int delete_bogus_item(struct btrfs_trans_handle *trans,
4253                              struct btrfs_root *root,
4254                              struct btrfs_path *path,
4255                              struct extent_buffer *buf, int slot)
4256 {
4257         struct btrfs_key key;
4258         int nritems = btrfs_header_nritems(buf);
4259
4260         btrfs_item_key_to_cpu(buf, &key, slot);
4261
4262         /* These are all the keys we can deal with missing. */
4263         if (key.type != BTRFS_DIR_INDEX_KEY &&
4264             key.type != BTRFS_EXTENT_ITEM_KEY &&
4265             key.type != BTRFS_METADATA_ITEM_KEY &&
4266             key.type != BTRFS_TREE_BLOCK_REF_KEY &&
4267             key.type != BTRFS_EXTENT_DATA_REF_KEY)
4268                 return -1;
4269
4270         printf("Deleting bogus item [%llu,%u,%llu] at slot %d on block %llu\n",
4271                (unsigned long long)key.objectid, key.type,
4272                (unsigned long long)key.offset, slot, buf->start);
4273         memmove_extent_buffer(buf, btrfs_item_nr_offset(slot),
4274                               btrfs_item_nr_offset(slot + 1),
4275                               sizeof(struct btrfs_item) *
4276                               (nritems - slot - 1));
4277         btrfs_set_header_nritems(buf, nritems - 1);
4278         if (slot == 0) {
4279                 struct btrfs_disk_key disk_key;
4280
4281                 btrfs_item_key(buf, &disk_key, 0);
4282                 btrfs_fixup_low_keys(root, path, &disk_key, 1);
4283         }
4284         btrfs_mark_buffer_dirty(buf);
4285         return 0;
4286 }
4287
4288 static int fix_item_offset(struct btrfs_trans_handle *trans,
4289                            struct btrfs_root *root,
4290                            struct btrfs_path *path)
4291 {
4292         struct extent_buffer *buf;
4293         int i;
4294         int ret = 0;
4295
4296         /* We should only get this for leaves */
4297         BUG_ON(path->lowest_level);
4298         buf = path->nodes[0];
4299 again:
4300         for (i = 0; i < btrfs_header_nritems(buf); i++) {
4301                 unsigned int shift = 0, offset;
4302
4303                 if (i == 0 && btrfs_item_end_nr(buf, i) !=
4304                     BTRFS_LEAF_DATA_SIZE(root)) {
4305                         if (btrfs_item_end_nr(buf, i) >
4306                             BTRFS_LEAF_DATA_SIZE(root)) {
4307                                 ret = delete_bogus_item(trans, root, path,
4308                                                         buf, i);
4309                                 if (!ret)
4310                                         goto again;
4311                                 fprintf(stderr, "item is off the end of the "
4312                                         "leaf, can't fix\n");
4313                                 ret = -EIO;
4314                                 break;
4315                         }
4316                         shift = BTRFS_LEAF_DATA_SIZE(root) -
4317                                 btrfs_item_end_nr(buf, i);
4318                 } else if (i > 0 && btrfs_item_end_nr(buf, i) !=
4319                            btrfs_item_offset_nr(buf, i - 1)) {
4320                         if (btrfs_item_end_nr(buf, i) >
4321                             btrfs_item_offset_nr(buf, i - 1)) {
4322                                 ret = delete_bogus_item(trans, root, path,
4323                                                         buf, i);
4324                                 if (!ret)
4325                                         goto again;
4326                                 fprintf(stderr, "items overlap, can't fix\n");
4327                                 ret = -EIO;
4328                                 break;
4329                         }
4330                         shift = btrfs_item_offset_nr(buf, i - 1) -
4331                                 btrfs_item_end_nr(buf, i);
4332                 }
4333                 if (!shift)
4334                         continue;
4335
4336                 printf("Shifting item nr %d by %u bytes in block %llu\n",
4337                        i, shift, (unsigned long long)buf->start);
4338                 offset = btrfs_item_offset_nr(buf, i);
4339                 memmove_extent_buffer(buf,
4340                                       btrfs_leaf_data(buf) + offset + shift,
4341                                       btrfs_leaf_data(buf) + offset,
4342                                       btrfs_item_size_nr(buf, i));
4343                 btrfs_set_item_offset(buf, btrfs_item_nr(i),
4344                                       offset + shift);
4345                 btrfs_mark_buffer_dirty(buf);
4346         }
4347
4348         /*
4349          * We may have moved things, in which case we want to exit so we don't
4350          * write those changes out.  Once we have proper abort functionality in
4351          * progs this can be changed to something nicer.
4352          */
4353         BUG_ON(ret);
4354         return ret;
4355 }
4356
4357 /*
4358  * Attempt to fix basic block failures.  If we can't fix it for whatever reason
4359  * then just return -EIO.
4360  */
4361 static int try_to_fix_bad_block(struct btrfs_root *root,
4362                                 struct extent_buffer *buf,
4363                                 enum btrfs_tree_block_status status)
4364 {
4365         struct btrfs_trans_handle *trans;
4366         struct ulist *roots;
4367         struct ulist_node *node;
4368         struct btrfs_root *search_root;
4369         struct btrfs_path *path;
4370         struct ulist_iterator iter;
4371         struct btrfs_key root_key, key;
4372         int ret;
4373
4374         if (status != BTRFS_TREE_BLOCK_BAD_KEY_ORDER &&
4375             status != BTRFS_TREE_BLOCK_INVALID_OFFSETS)
4376                 return -EIO;
4377
4378         path = btrfs_alloc_path();
4379         if (!path)
4380                 return -EIO;
4381
4382         ret = btrfs_find_all_roots(NULL, root->fs_info, buf->start,
4383                                    0, &roots);
4384         if (ret) {
4385                 btrfs_free_path(path);
4386                 return -EIO;
4387         }
4388
4389         ULIST_ITER_INIT(&iter);
4390         while ((node = ulist_next(roots, &iter))) {
4391                 root_key.objectid = node->val;
4392                 root_key.type = BTRFS_ROOT_ITEM_KEY;
4393                 root_key.offset = (u64)-1;
4394
4395                 search_root = btrfs_read_fs_root(root->fs_info, &root_key);
4396                 if (IS_ERR(root)) {
4397                         ret = -EIO;
4398                         break;
4399                 }
4400
4401
4402                 trans = btrfs_start_transaction(search_root, 0);
4403                 if (IS_ERR(trans)) {
4404                         ret = PTR_ERR(trans);
4405                         break;
4406                 }
4407
4408                 path->lowest_level = btrfs_header_level(buf);
4409                 path->skip_check_block = 1;
4410                 if (path->lowest_level)
4411                         btrfs_node_key_to_cpu(buf, &key, 0);
4412                 else
4413                         btrfs_item_key_to_cpu(buf, &key, 0);
4414                 ret = btrfs_search_slot(trans, search_root, &key, path, 0, 1);
4415                 if (ret) {
4416                         ret = -EIO;
4417                         btrfs_commit_transaction(trans, search_root);
4418                         break;
4419                 }
4420                 if (status == BTRFS_TREE_BLOCK_BAD_KEY_ORDER)
4421                         ret = fix_key_order(trans, search_root, path);
4422                 else if (status == BTRFS_TREE_BLOCK_INVALID_OFFSETS)
4423                         ret = fix_item_offset(trans, search_root, path);
4424                 if (ret) {
4425                         btrfs_commit_transaction(trans, search_root);
4426                         break;
4427                 }
4428                 btrfs_release_path(path);
4429                 btrfs_commit_transaction(trans, search_root);
4430         }
4431         ulist_free(roots);
4432         btrfs_free_path(path);
4433         return ret;
4434 }
4435
4436 static int check_block(struct btrfs_root *root,
4437                        struct cache_tree *extent_cache,
4438                        struct extent_buffer *buf, u64 flags)
4439 {
4440         struct extent_record *rec;
4441         struct cache_extent *cache;
4442         struct btrfs_key key;
4443         enum btrfs_tree_block_status status;
4444         int ret = 0;
4445         int level;
4446
4447         cache = lookup_cache_extent(extent_cache, buf->start, buf->len);
4448         if (!cache)
4449                 return 1;
4450         rec = container_of(cache, struct extent_record, cache);
4451         rec->generation = btrfs_header_generation(buf);
4452
4453         level = btrfs_header_level(buf);
4454         if (btrfs_header_nritems(buf) > 0) {
4455
4456                 if (level == 0)
4457                         btrfs_item_key_to_cpu(buf, &key, 0);
4458                 else
4459                         btrfs_node_key_to_cpu(buf, &key, 0);
4460
4461                 rec->info_objectid = key.objectid;
4462         }
4463         rec->info_level = level;
4464
4465         if (btrfs_is_leaf(buf))
4466                 status = btrfs_check_leaf(root, &rec->parent_key, buf);
4467         else
4468                 status = btrfs_check_node(root, &rec->parent_key, buf);
4469
4470         if (status != BTRFS_TREE_BLOCK_CLEAN) {
4471                 if (repair)
4472                         status = try_to_fix_bad_block(root, buf, status);
4473                 if (status != BTRFS_TREE_BLOCK_CLEAN) {
4474                         ret = -EIO;
4475                         fprintf(stderr, "bad block %llu\n",
4476                                 (unsigned long long)buf->start);
4477                 } else {
4478                         /*
4479                          * Signal to callers we need to start the scan over
4480                          * again since we'll have cowed blocks.
4481                          */
4482                         ret = -EAGAIN;
4483                 }
4484         } else {
4485                 rec->content_checked = 1;
4486                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4487                         rec->owner_ref_checked = 1;
4488                 else {
4489                         ret = check_owner_ref(root, rec, buf);
4490                         if (!ret)
4491                                 rec->owner_ref_checked = 1;
4492                 }
4493         }
4494         if (!ret)
4495                 maybe_free_extent_rec(extent_cache, rec);
4496         return ret;
4497 }
4498
4499
4500 static struct tree_backref *find_tree_backref(struct extent_record *rec,
4501                                                 u64 parent, u64 root)
4502 {
4503         struct rb_node *node;
4504         struct tree_backref *back = NULL;
4505         struct tree_backref match = {
4506                 .node = {
4507                         .is_data = 0,
4508                 },
4509         };
4510
4511         if (parent) {
4512                 match.parent = parent;
4513                 match.node.full_backref = 1;
4514         } else {
4515                 match.root = root;
4516         }
4517
4518         node = rb_search(&rec->backref_tree, &match.node.node,
4519                          (rb_compare_keys)compare_extent_backref, NULL);
4520         if (node)
4521                 back = to_tree_backref(rb_node_to_extent_backref(node));
4522
4523         return back;
4524 }
4525
4526 static struct tree_backref *alloc_tree_backref(struct extent_record *rec,
4527                                                 u64 parent, u64 root)
4528 {
4529         struct tree_backref *ref = malloc(sizeof(*ref));
4530
4531         if (!ref)
4532                 return NULL;
4533         memset(&ref->node, 0, sizeof(ref->node));
4534         if (parent > 0) {
4535                 ref->parent = parent;
4536                 ref->node.full_backref = 1;
4537         } else {
4538                 ref->root = root;
4539                 ref->node.full_backref = 0;
4540         }
4541         rb_insert(&rec->backref_tree, &ref->node.node, compare_extent_backref);
4542
4543         return ref;
4544 }
4545
4546 static struct data_backref *find_data_backref(struct extent_record *rec,
4547                                                 u64 parent, u64 root,
4548                                                 u64 owner, u64 offset,
4549                                                 int found_ref,
4550                                                 u64 disk_bytenr, u64 bytes)
4551 {
4552         struct rb_node *node;
4553         struct data_backref *back = NULL;
4554         struct data_backref match = {
4555                 .node = {
4556                         .is_data = 1,
4557                 },
4558                 .owner = owner,
4559                 .offset = offset,
4560                 .bytes = bytes,
4561                 .found_ref = found_ref,
4562                 .disk_bytenr = disk_bytenr,
4563         };
4564
4565         if (parent) {
4566                 match.parent = parent;
4567                 match.node.full_backref = 1;
4568         } else {
4569                 match.root = root;
4570         }
4571
4572         node = rb_search(&rec->backref_tree, &match.node.node,
4573                          (rb_compare_keys)compare_extent_backref, NULL);
4574         if (node)
4575                 back = to_data_backref(rb_node_to_extent_backref(node));
4576
4577         return back;
4578 }
4579
4580 static struct data_backref *alloc_data_backref(struct extent_record *rec,
4581                                                 u64 parent, u64 root,
4582                                                 u64 owner, u64 offset,
4583                                                 u64 max_size)
4584 {
4585         struct data_backref *ref = malloc(sizeof(*ref));
4586
4587         if (!ref)
4588                 return NULL;
4589         memset(&ref->node, 0, sizeof(ref->node));
4590         ref->node.is_data = 1;
4591
4592         if (parent > 0) {
4593                 ref->parent = parent;
4594                 ref->owner = 0;
4595                 ref->offset = 0;
4596                 ref->node.full_backref = 1;
4597         } else {
4598                 ref->root = root;
4599                 ref->owner = owner;
4600                 ref->offset = offset;
4601                 ref->node.full_backref = 0;
4602         }
4603         ref->bytes = max_size;
4604         ref->found_ref = 0;
4605         ref->num_refs = 0;
4606         rb_insert(&rec->backref_tree, &ref->node.node, compare_extent_backref);
4607         if (max_size > rec->max_size)
4608                 rec->max_size = max_size;
4609         return ref;
4610 }
4611
4612 /* Check if the type of extent matches with its chunk */
4613 static void check_extent_type(struct extent_record *rec)
4614 {
4615         struct btrfs_block_group_cache *bg_cache;
4616
4617         bg_cache = btrfs_lookup_first_block_group(global_info, rec->start);
4618         if (!bg_cache)
4619                 return;
4620
4621         /* data extent, check chunk directly*/
4622         if (!rec->metadata) {
4623                 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_DATA))
4624                         rec->wrong_chunk_type = 1;
4625                 return;
4626         }
4627
4628         /* metadata extent, check the obvious case first */
4629         if (!(bg_cache->flags & (BTRFS_BLOCK_GROUP_SYSTEM |
4630                                  BTRFS_BLOCK_GROUP_METADATA))) {
4631                 rec->wrong_chunk_type = 1;
4632                 return;
4633         }
4634
4635         /*
4636          * Check SYSTEM extent, as it's also marked as metadata, we can only
4637          * make sure it's a SYSTEM extent by its backref
4638          */
4639         if (!RB_EMPTY_ROOT(&rec->backref_tree)) {
4640                 struct extent_backref *node;
4641                 struct tree_backref *tback;
4642                 u64 bg_type;
4643
4644                 node = rb_node_to_extent_backref(rb_first(&rec->backref_tree));
4645                 if (node->is_data) {
4646                         /* tree block shouldn't have data backref */
4647                         rec->wrong_chunk_type = 1;
4648                         return;
4649                 }
4650                 tback = container_of(node, struct tree_backref, node);
4651
4652                 if (tback->root == BTRFS_CHUNK_TREE_OBJECTID)
4653                         bg_type = BTRFS_BLOCK_GROUP_SYSTEM;
4654                 else
4655                         bg_type = BTRFS_BLOCK_GROUP_METADATA;
4656                 if (!(bg_cache->flags & bg_type))
4657                         rec->wrong_chunk_type = 1;
4658         }
4659 }
4660
4661 /*
4662  * Allocate a new extent record, fill default values from @tmpl and insert int
4663  * @extent_cache. Caller is supposed to make sure the [start,nr) is not in
4664  * the cache, otherwise it fails.
4665  */
4666 static int add_extent_rec_nolookup(struct cache_tree *extent_cache,
4667                 struct extent_record *tmpl)
4668 {
4669         struct extent_record *rec;
4670         int ret = 0;
4671
4672         rec = malloc(sizeof(*rec));
4673         if (!rec)
4674                 return -ENOMEM;
4675         rec->start = tmpl->start;
4676         rec->max_size = tmpl->max_size;
4677         rec->nr = max(tmpl->nr, tmpl->max_size);
4678         rec->found_rec = tmpl->found_rec;
4679         rec->content_checked = tmpl->content_checked;
4680         rec->owner_ref_checked = tmpl->owner_ref_checked;
4681         rec->num_duplicates = 0;
4682         rec->metadata = tmpl->metadata;
4683         rec->flag_block_full_backref = FLAG_UNSET;
4684         rec->bad_full_backref = 0;
4685         rec->crossing_stripes = 0;
4686         rec->wrong_chunk_type = 0;
4687         rec->is_root = tmpl->is_root;
4688         rec->refs = tmpl->refs;
4689         rec->extent_item_refs = tmpl->extent_item_refs;
4690         rec->parent_generation = tmpl->parent_generation;
4691         INIT_LIST_HEAD(&rec->backrefs);
4692         INIT_LIST_HEAD(&rec->dups);
4693         INIT_LIST_HEAD(&rec->list);
4694         rec->backref_tree = RB_ROOT;
4695         memcpy(&rec->parent_key, &tmpl->parent_key, sizeof(tmpl->parent_key));
4696         rec->cache.start = tmpl->start;
4697         rec->cache.size = tmpl->nr;
4698         ret = insert_cache_extent(extent_cache, &rec->cache);
4699         BUG_ON(ret);
4700         bytes_used += rec->nr;
4701
4702         if (tmpl->metadata)
4703                 rec->crossing_stripes = check_crossing_stripes(rec->start,
4704                                 global_info->tree_root->nodesize);
4705         check_extent_type(rec);
4706         return ret;
4707 }
4708
4709 /*
4710  * Lookup and modify an extent, some values of @tmpl are interpreted verbatim,
4711  * some are hints:
4712  * - refs              - if found, increase refs
4713  * - is_root           - if found, set
4714  * - content_checked   - if found, set
4715  * - owner_ref_checked - if found, set
4716  *
4717  * If not found, create a new one, initialize and insert.
4718  */
4719 static int add_extent_rec(struct cache_tree *extent_cache,
4720                 struct extent_record *tmpl)
4721 {
4722         struct extent_record *rec;
4723         struct cache_extent *cache;
4724         int ret = 0;
4725         int dup = 0;
4726
4727         cache = lookup_cache_extent(extent_cache, tmpl->start, tmpl->nr);
4728         if (cache) {
4729                 rec = container_of(cache, struct extent_record, cache);
4730                 if (tmpl->refs)
4731                         rec->refs++;
4732                 if (rec->nr == 1)
4733                         rec->nr = max(tmpl->nr, tmpl->max_size);
4734
4735                 /*
4736                  * We need to make sure to reset nr to whatever the extent
4737                  * record says was the real size, this way we can compare it to
4738                  * the backrefs.
4739                  */
4740                 if (tmpl->found_rec) {
4741                         if (tmpl->start != rec->start || rec->found_rec) {
4742                                 struct extent_record *tmp;
4743
4744                                 dup = 1;
4745                                 if (list_empty(&rec->list))
4746                                         list_add_tail(&rec->list,
4747                                                       &duplicate_extents);
4748
4749                                 /*
4750                                  * We have to do this song and dance in case we
4751                                  * find an extent record that falls inside of
4752                                  * our current extent record but does not have
4753                                  * the same objectid.
4754                                  */
4755                                 tmp = malloc(sizeof(*tmp));
4756                                 if (!tmp)
4757                                         return -ENOMEM;
4758                                 tmp->start = tmpl->start;
4759                                 tmp->max_size = tmpl->max_size;
4760                                 tmp->nr = tmpl->nr;
4761                                 tmp->found_rec = 1;
4762                                 tmp->metadata = tmpl->metadata;
4763                                 tmp->extent_item_refs = tmpl->extent_item_refs;
4764                                 INIT_LIST_HEAD(&tmp->list);
4765                                 list_add_tail(&tmp->list, &rec->dups);
4766                                 rec->num_duplicates++;
4767                         } else {
4768                                 rec->nr = tmpl->nr;
4769                                 rec->found_rec = 1;
4770                         }
4771                 }
4772
4773                 if (tmpl->extent_item_refs && !dup) {
4774                         if (rec->extent_item_refs) {
4775                                 fprintf(stderr, "block %llu rec "
4776                                         "extent_item_refs %llu, passed %llu\n",
4777                                         (unsigned long long)tmpl->start,
4778                                         (unsigned long long)
4779                                                         rec->extent_item_refs,
4780                                         (unsigned long long)tmpl->extent_item_refs);
4781                         }
4782                         rec->extent_item_refs = tmpl->extent_item_refs;
4783                 }
4784                 if (tmpl->is_root)
4785                         rec->is_root = 1;
4786                 if (tmpl->content_checked)
4787                         rec->content_checked = 1;
4788                 if (tmpl->owner_ref_checked)
4789                         rec->owner_ref_checked = 1;
4790                 memcpy(&rec->parent_key, &tmpl->parent_key,
4791                                 sizeof(tmpl->parent_key));
4792                 if (tmpl->parent_generation)
4793                         rec->parent_generation = tmpl->parent_generation;
4794                 if (rec->max_size < tmpl->max_size)
4795                         rec->max_size = tmpl->max_size;
4796
4797                 /*
4798                  * A metadata extent can't cross stripe_len boundary, otherwise
4799                  * kernel scrub won't be able to handle it.
4800                  * As now stripe_len is fixed to BTRFS_STRIPE_LEN, just check
4801                  * it.
4802                  */
4803                 if (tmpl->metadata)
4804                         rec->crossing_stripes = check_crossing_stripes(
4805                                 rec->start, global_info->tree_root->nodesize);
4806                 check_extent_type(rec);
4807                 maybe_free_extent_rec(extent_cache, rec);
4808                 return ret;
4809         }
4810
4811         ret = add_extent_rec_nolookup(extent_cache, tmpl);
4812
4813         return ret;
4814 }
4815
4816 static int add_tree_backref(struct cache_tree *extent_cache, u64 bytenr,
4817                             u64 parent, u64 root, int found_ref)
4818 {
4819         struct extent_record *rec;
4820         struct tree_backref *back;
4821         struct cache_extent *cache;
4822
4823         cache = lookup_cache_extent(extent_cache, bytenr, 1);
4824         if (!cache) {
4825                 struct extent_record tmpl;
4826
4827                 memset(&tmpl, 0, sizeof(tmpl));
4828                 tmpl.start = bytenr;
4829                 tmpl.nr = 1;
4830                 tmpl.metadata = 1;
4831
4832                 add_extent_rec_nolookup(extent_cache, &tmpl);
4833
4834                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
4835                 if (!cache)
4836                         abort();
4837         }
4838
4839         rec = container_of(cache, struct extent_record, cache);
4840         if (rec->start != bytenr) {
4841                 abort();
4842         }
4843
4844         back = find_tree_backref(rec, parent, root);
4845         if (!back) {
4846                 back = alloc_tree_backref(rec, parent, root);
4847                 BUG_ON(!back);
4848         }
4849
4850         if (found_ref) {
4851                 if (back->node.found_ref) {
4852                         fprintf(stderr, "Extent back ref already exists "
4853                                 "for %llu parent %llu root %llu \n",
4854                                 (unsigned long long)bytenr,
4855                                 (unsigned long long)parent,
4856                                 (unsigned long long)root);
4857                 }
4858                 back->node.found_ref = 1;
4859         } else {
4860                 if (back->node.found_extent_tree) {
4861                         fprintf(stderr, "Extent back ref already exists "
4862                                 "for %llu parent %llu root %llu \n",
4863                                 (unsigned long long)bytenr,
4864                                 (unsigned long long)parent,
4865                                 (unsigned long long)root);
4866                 }
4867                 back->node.found_extent_tree = 1;
4868         }
4869         check_extent_type(rec);
4870         maybe_free_extent_rec(extent_cache, rec);
4871         return 0;
4872 }
4873
4874 static int add_data_backref(struct cache_tree *extent_cache, u64 bytenr,
4875                             u64 parent, u64 root, u64 owner, u64 offset,
4876                             u32 num_refs, int found_ref, u64 max_size)
4877 {
4878         struct extent_record *rec;
4879         struct data_backref *back;
4880         struct cache_extent *cache;
4881
4882         cache = lookup_cache_extent(extent_cache, bytenr, 1);
4883         if (!cache) {
4884                 struct extent_record tmpl;
4885
4886                 memset(&tmpl, 0, sizeof(tmpl));
4887                 tmpl.start = bytenr;
4888                 tmpl.nr = 1;
4889                 tmpl.max_size = max_size;
4890
4891                 add_extent_rec_nolookup(extent_cache, &tmpl);
4892
4893                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
4894                 if (!cache)
4895                         abort();
4896         }
4897
4898         rec = container_of(cache, struct extent_record, cache);
4899         if (rec->max_size < max_size)
4900                 rec->max_size = max_size;
4901
4902         /*
4903          * If found_ref is set then max_size is the real size and must match the
4904          * existing refs.  So if we have already found a ref then we need to
4905          * make sure that this ref matches the existing one, otherwise we need
4906          * to add a new backref so we can notice that the backrefs don't match
4907          * and we need to figure out who is telling the truth.  This is to
4908          * account for that awful fsync bug I introduced where we'd end up with
4909          * a btrfs_file_extent_item that would have its length include multiple
4910          * prealloc extents or point inside of a prealloc extent.
4911          */
4912         back = find_data_backref(rec, parent, root, owner, offset, found_ref,
4913                                  bytenr, max_size);
4914         if (!back) {
4915                 back = alloc_data_backref(rec, parent, root, owner, offset,
4916                                           max_size);
4917                 BUG_ON(!back);
4918         }
4919
4920         if (found_ref) {
4921                 BUG_ON(num_refs != 1);
4922                 if (back->node.found_ref)
4923                         BUG_ON(back->bytes != max_size);
4924                 back->node.found_ref = 1;
4925                 back->found_ref += 1;
4926                 back->bytes = max_size;
4927                 back->disk_bytenr = bytenr;
4928                 rec->refs += 1;
4929                 rec->content_checked = 1;
4930                 rec->owner_ref_checked = 1;
4931         } else {
4932                 if (back->node.found_extent_tree) {
4933                         fprintf(stderr, "Extent back ref already exists "
4934                                 "for %llu parent %llu root %llu "
4935                                 "owner %llu offset %llu num_refs %lu\n",
4936                                 (unsigned long long)bytenr,
4937                                 (unsigned long long)parent,
4938                                 (unsigned long long)root,
4939                                 (unsigned long long)owner,
4940                                 (unsigned long long)offset,
4941                                 (unsigned long)num_refs);
4942                 }
4943                 back->num_refs = num_refs;
4944                 back->node.found_extent_tree = 1;
4945         }
4946         maybe_free_extent_rec(extent_cache, rec);
4947         return 0;
4948 }
4949
4950 static int add_pending(struct cache_tree *pending,
4951                        struct cache_tree *seen, u64 bytenr, u32 size)
4952 {
4953         int ret;
4954         ret = add_cache_extent(seen, bytenr, size);
4955         if (ret)
4956                 return ret;
4957         add_cache_extent(pending, bytenr, size);
4958         return 0;
4959 }
4960
4961 static int pick_next_pending(struct cache_tree *pending,
4962                         struct cache_tree *reada,
4963                         struct cache_tree *nodes,
4964                         u64 last, struct block_info *bits, int bits_nr,
4965                         int *reada_bits)
4966 {
4967         unsigned long node_start = last;
4968         struct cache_extent *cache;
4969         int ret;
4970
4971         cache = search_cache_extent(reada, 0);
4972         if (cache) {
4973                 bits[0].start = cache->start;
4974                 bits[0].size = cache->size;
4975                 *reada_bits = 1;
4976                 return 1;
4977         }
4978         *reada_bits = 0;
4979         if (node_start > 32768)
4980                 node_start -= 32768;
4981
4982         cache = search_cache_extent(nodes, node_start);
4983         if (!cache)
4984                 cache = search_cache_extent(nodes, 0);
4985
4986         if (!cache) {
4987                  cache = search_cache_extent(pending, 0);
4988                  if (!cache)
4989                          return 0;
4990                  ret = 0;
4991                  do {
4992                          bits[ret].start = cache->start;
4993                          bits[ret].size = cache->size;
4994                          cache = next_cache_extent(cache);
4995                          ret++;
4996                  } while (cache && ret < bits_nr);
4997                  return ret;
4998         }
4999
5000         ret = 0;
5001         do {
5002                 bits[ret].start = cache->start;
5003                 bits[ret].size = cache->size;
5004                 cache = next_cache_extent(cache);
5005                 ret++;
5006         } while (cache && ret < bits_nr);
5007
5008         if (bits_nr - ret > 8) {
5009                 u64 lookup = bits[0].start + bits[0].size;
5010                 struct cache_extent *next;
5011                 next = search_cache_extent(pending, lookup);
5012                 while(next) {
5013                         if (next->start - lookup > 32768)
5014                                 break;
5015                         bits[ret].start = next->start;
5016                         bits[ret].size = next->size;
5017                         lookup = next->start + next->size;
5018                         ret++;
5019                         if (ret == bits_nr)
5020                                 break;
5021                         next = next_cache_extent(next);
5022                         if (!next)
5023                                 break;
5024                 }
5025         }
5026         return ret;
5027 }
5028
5029 static void free_chunk_record(struct cache_extent *cache)
5030 {
5031         struct chunk_record *rec;
5032
5033         rec = container_of(cache, struct chunk_record, cache);
5034         list_del_init(&rec->list);
5035         list_del_init(&rec->dextents);
5036         free(rec);
5037 }
5038
5039 void free_chunk_cache_tree(struct cache_tree *chunk_cache)
5040 {
5041         cache_tree_free_extents(chunk_cache, free_chunk_record);
5042 }
5043
5044 static void free_device_record(struct rb_node *node)
5045 {
5046         struct device_record *rec;
5047
5048         rec = container_of(node, struct device_record, node);
5049         free(rec);
5050 }
5051
5052 FREE_RB_BASED_TREE(device_cache, free_device_record);
5053
5054 int insert_block_group_record(struct block_group_tree *tree,
5055                               struct block_group_record *bg_rec)
5056 {
5057         int ret;
5058
5059         ret = insert_cache_extent(&tree->tree, &bg_rec->cache);
5060         if (ret)
5061                 return ret;
5062
5063         list_add_tail(&bg_rec->list, &tree->block_groups);
5064         return 0;
5065 }
5066
5067 static void free_block_group_record(struct cache_extent *cache)
5068 {
5069         struct block_group_record *rec;
5070
5071         rec = container_of(cache, struct block_group_record, cache);
5072         list_del_init(&rec->list);
5073         free(rec);
5074 }
5075
5076 void free_block_group_tree(struct block_group_tree *tree)
5077 {
5078         cache_tree_free_extents(&tree->tree, free_block_group_record);
5079 }
5080
5081 int insert_device_extent_record(struct device_extent_tree *tree,
5082                                 struct device_extent_record *de_rec)
5083 {
5084         int ret;
5085
5086         /*
5087          * Device extent is a bit different from the other extents, because
5088          * the extents which belong to the different devices may have the
5089          * same start and size, so we need use the special extent cache
5090          * search/insert functions.
5091          */
5092         ret = insert_cache_extent2(&tree->tree, &de_rec->cache);
5093         if (ret)
5094                 return ret;
5095
5096         list_add_tail(&de_rec->chunk_list, &tree->no_chunk_orphans);
5097         list_add_tail(&de_rec->device_list, &tree->no_device_orphans);
5098         return 0;
5099 }
5100
5101 static void free_device_extent_record(struct cache_extent *cache)
5102 {
5103         struct device_extent_record *rec;
5104
5105         rec = container_of(cache, struct device_extent_record, cache);
5106         if (!list_empty(&rec->chunk_list))
5107                 list_del_init(&rec->chunk_list);
5108         if (!list_empty(&rec->device_list))
5109                 list_del_init(&rec->device_list);
5110         free(rec);
5111 }
5112
5113 void free_device_extent_tree(struct device_extent_tree *tree)
5114 {
5115         cache_tree_free_extents(&tree->tree, free_device_extent_record);
5116 }
5117
5118 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5119 static int process_extent_ref_v0(struct cache_tree *extent_cache,
5120                                  struct extent_buffer *leaf, int slot)
5121 {
5122         struct btrfs_extent_ref_v0 *ref0;
5123         struct btrfs_key key;
5124
5125         btrfs_item_key_to_cpu(leaf, &key, slot);
5126         ref0 = btrfs_item_ptr(leaf, slot, struct btrfs_extent_ref_v0);
5127         if (btrfs_ref_objectid_v0(leaf, ref0) < BTRFS_FIRST_FREE_OBJECTID) {
5128                 add_tree_backref(extent_cache, key.objectid, key.offset, 0, 0);
5129         } else {
5130                 add_data_backref(extent_cache, key.objectid, key.offset, 0,
5131                                  0, 0, btrfs_ref_count_v0(leaf, ref0), 0, 0);
5132         }
5133         return 0;
5134 }
5135 #endif
5136
5137 struct chunk_record *btrfs_new_chunk_record(struct extent_buffer *leaf,
5138                                             struct btrfs_key *key,
5139                                             int slot)
5140 {
5141         struct btrfs_chunk *ptr;
5142         struct chunk_record *rec;
5143         int num_stripes, i;
5144
5145         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
5146         num_stripes = btrfs_chunk_num_stripes(leaf, ptr);
5147
5148         rec = calloc(1, btrfs_chunk_record_size(num_stripes));
5149         if (!rec) {
5150                 fprintf(stderr, "memory allocation failed\n");
5151                 exit(-1);
5152         }
5153
5154         INIT_LIST_HEAD(&rec->list);
5155         INIT_LIST_HEAD(&rec->dextents);
5156         rec->bg_rec = NULL;
5157
5158         rec->cache.start = key->offset;
5159         rec->cache.size = btrfs_chunk_length(leaf, ptr);
5160
5161         rec->generation = btrfs_header_generation(leaf);
5162
5163         rec->objectid = key->objectid;
5164         rec->type = key->type;
5165         rec->offset = key->offset;
5166
5167         rec->length = rec->cache.size;
5168         rec->owner = btrfs_chunk_owner(leaf, ptr);
5169         rec->stripe_len = btrfs_chunk_stripe_len(leaf, ptr);
5170         rec->type_flags = btrfs_chunk_type(leaf, ptr);
5171         rec->io_width = btrfs_chunk_io_width(leaf, ptr);
5172         rec->io_align = btrfs_chunk_io_align(leaf, ptr);
5173         rec->sector_size = btrfs_chunk_sector_size(leaf, ptr);
5174         rec->num_stripes = num_stripes;
5175         rec->sub_stripes = btrfs_chunk_sub_stripes(leaf, ptr);
5176
5177         for (i = 0; i < rec->num_stripes; ++i) {
5178                 rec->stripes[i].devid =
5179                         btrfs_stripe_devid_nr(leaf, ptr, i);
5180                 rec->stripes[i].offset =
5181                         btrfs_stripe_offset_nr(leaf, ptr, i);
5182                 read_extent_buffer(leaf, rec->stripes[i].dev_uuid,
5183                                 (unsigned long)btrfs_stripe_dev_uuid_nr(ptr, i),
5184                                 BTRFS_UUID_SIZE);
5185         }
5186
5187         return rec;
5188 }
5189
5190 static int process_chunk_item(struct cache_tree *chunk_cache,
5191                               struct btrfs_key *key, struct extent_buffer *eb,
5192                               int slot)
5193 {
5194         struct chunk_record *rec;
5195         int ret = 0;
5196
5197         rec = btrfs_new_chunk_record(eb, key, slot);
5198         ret = insert_cache_extent(chunk_cache, &rec->cache);
5199         if (ret) {
5200                 fprintf(stderr, "Chunk[%llu, %llu] existed.\n",
5201                         rec->offset, rec->length);
5202                 free(rec);
5203         }
5204
5205         return ret;
5206 }
5207
5208 static int process_device_item(struct rb_root *dev_cache,
5209                 struct btrfs_key *key, struct extent_buffer *eb, int slot)
5210 {
5211         struct btrfs_dev_item *ptr;
5212         struct device_record *rec;
5213         int ret = 0;
5214
5215         ptr = btrfs_item_ptr(eb,
5216                 slot, struct btrfs_dev_item);
5217
5218         rec = malloc(sizeof(*rec));
5219         if (!rec) {
5220                 fprintf(stderr, "memory allocation failed\n");
5221                 return -ENOMEM;
5222         }
5223
5224         rec->devid = key->offset;
5225         rec->generation = btrfs_header_generation(eb);
5226
5227         rec->objectid = key->objectid;
5228         rec->type = key->type;
5229         rec->offset = key->offset;
5230
5231         rec->devid = btrfs_device_id(eb, ptr);
5232         rec->total_byte = btrfs_device_total_bytes(eb, ptr);
5233         rec->byte_used = btrfs_device_bytes_used(eb, ptr);
5234
5235         ret = rb_insert(dev_cache, &rec->node, device_record_compare);
5236         if (ret) {
5237                 fprintf(stderr, "Device[%llu] existed.\n", rec->devid);
5238                 free(rec);
5239         }
5240
5241         return ret;
5242 }
5243
5244 struct block_group_record *
5245 btrfs_new_block_group_record(struct extent_buffer *leaf, struct btrfs_key *key,
5246                              int slot)
5247 {
5248         struct btrfs_block_group_item *ptr;
5249         struct block_group_record *rec;
5250
5251         rec = calloc(1, sizeof(*rec));
5252         if (!rec) {
5253                 fprintf(stderr, "memory allocation failed\n");
5254                 exit(-1);
5255         }
5256
5257         rec->cache.start = key->objectid;
5258         rec->cache.size = key->offset;
5259
5260         rec->generation = btrfs_header_generation(leaf);
5261
5262         rec->objectid = key->objectid;
5263         rec->type = key->type;
5264         rec->offset = key->offset;
5265
5266         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_block_group_item);
5267         rec->flags = btrfs_disk_block_group_flags(leaf, ptr);
5268
5269         INIT_LIST_HEAD(&rec->list);
5270
5271         return rec;
5272 }
5273
5274 static int process_block_group_item(struct block_group_tree *block_group_cache,
5275                                     struct btrfs_key *key,
5276                                     struct extent_buffer *eb, int slot)
5277 {
5278         struct block_group_record *rec;
5279         int ret = 0;
5280
5281         rec = btrfs_new_block_group_record(eb, key, slot);
5282         ret = insert_block_group_record(block_group_cache, rec);
5283         if (ret) {
5284                 fprintf(stderr, "Block Group[%llu, %llu] existed.\n",
5285                         rec->objectid, rec->offset);
5286                 free(rec);
5287         }
5288
5289         return ret;
5290 }
5291
5292 struct device_extent_record *
5293 btrfs_new_device_extent_record(struct extent_buffer *leaf,
5294                                struct btrfs_key *key, int slot)
5295 {
5296         struct device_extent_record *rec;
5297         struct btrfs_dev_extent *ptr;
5298
5299         rec = calloc(1, sizeof(*rec));
5300         if (!rec) {
5301                 fprintf(stderr, "memory allocation failed\n");
5302                 exit(-1);
5303         }
5304
5305         rec->cache.objectid = key->objectid;
5306         rec->cache.start = key->offset;
5307
5308         rec->generation = btrfs_header_generation(leaf);
5309
5310         rec->objectid = key->objectid;
5311         rec->type = key->type;
5312         rec->offset = key->offset;
5313
5314         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
5315         rec->chunk_objecteid =
5316                 btrfs_dev_extent_chunk_objectid(leaf, ptr);
5317         rec->chunk_offset =
5318                 btrfs_dev_extent_chunk_offset(leaf, ptr);
5319         rec->length = btrfs_dev_extent_length(leaf, ptr);
5320         rec->cache.size = rec->length;
5321
5322         INIT_LIST_HEAD(&rec->chunk_list);
5323         INIT_LIST_HEAD(&rec->device_list);
5324
5325         return rec;
5326 }
5327
5328 static int
5329 process_device_extent_item(struct device_extent_tree *dev_extent_cache,
5330                            struct btrfs_key *key, struct extent_buffer *eb,
5331                            int slot)
5332 {
5333         struct device_extent_record *rec;
5334         int ret;
5335
5336         rec = btrfs_new_device_extent_record(eb, key, slot);
5337         ret = insert_device_extent_record(dev_extent_cache, rec);
5338         if (ret) {
5339                 fprintf(stderr,
5340                         "Device extent[%llu, %llu, %llu] existed.\n",
5341                         rec->objectid, rec->offset, rec->length);
5342                 free(rec);
5343         }
5344
5345         return ret;
5346 }
5347
5348 static int process_extent_item(struct btrfs_root *root,
5349                                struct cache_tree *extent_cache,
5350                                struct extent_buffer *eb, int slot)
5351 {
5352         struct btrfs_extent_item *ei;
5353         struct btrfs_extent_inline_ref *iref;
5354         struct btrfs_extent_data_ref *dref;
5355         struct btrfs_shared_data_ref *sref;
5356         struct btrfs_key key;
5357         struct extent_record tmpl;
5358         unsigned long end;
5359         unsigned long ptr;
5360         int type;
5361         u32 item_size = btrfs_item_size_nr(eb, slot);
5362         u64 refs = 0;
5363         u64 offset;
5364         u64 num_bytes;
5365         int metadata = 0;
5366
5367         btrfs_item_key_to_cpu(eb, &key, slot);
5368
5369         if (key.type == BTRFS_METADATA_ITEM_KEY) {
5370                 metadata = 1;
5371                 num_bytes = root->nodesize;
5372         } else {
5373                 num_bytes = key.offset;
5374         }
5375
5376         if (item_size < sizeof(*ei)) {
5377 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5378                 struct btrfs_extent_item_v0 *ei0;
5379                 BUG_ON(item_size != sizeof(*ei0));
5380                 ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
5381                 refs = btrfs_extent_refs_v0(eb, ei0);
5382 #else
5383                 BUG();
5384 #endif
5385                 memset(&tmpl, 0, sizeof(tmpl));
5386                 tmpl.start = key.objectid;
5387                 tmpl.nr = num_bytes;
5388                 tmpl.extent_item_refs = refs;
5389                 tmpl.metadata = metadata;
5390                 tmpl.found_rec = 1;
5391                 tmpl.max_size = num_bytes;
5392
5393                 return add_extent_rec(extent_cache, &tmpl);
5394         }
5395
5396         ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
5397         refs = btrfs_extent_refs(eb, ei);
5398         if (btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)
5399                 metadata = 1;
5400         else
5401                 metadata = 0;
5402
5403         memset(&tmpl, 0, sizeof(tmpl));
5404         tmpl.start = key.objectid;
5405         tmpl.nr = num_bytes;
5406         tmpl.extent_item_refs = refs;
5407         tmpl.metadata = metadata;
5408         tmpl.found_rec = 1;
5409         tmpl.max_size = num_bytes;
5410         add_extent_rec(extent_cache, &tmpl);
5411
5412         ptr = (unsigned long)(ei + 1);
5413         if (btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK &&
5414             key.type == BTRFS_EXTENT_ITEM_KEY)
5415                 ptr += sizeof(struct btrfs_tree_block_info);
5416
5417         end = (unsigned long)ei + item_size;
5418         while (ptr < end) {
5419                 iref = (struct btrfs_extent_inline_ref *)ptr;
5420                 type = btrfs_extent_inline_ref_type(eb, iref);
5421                 offset = btrfs_extent_inline_ref_offset(eb, iref);
5422                 switch (type) {
5423                 case BTRFS_TREE_BLOCK_REF_KEY:
5424                         add_tree_backref(extent_cache, key.objectid,
5425                                          0, offset, 0);
5426                         break;
5427                 case BTRFS_SHARED_BLOCK_REF_KEY:
5428                         add_tree_backref(extent_cache, key.objectid,
5429                                          offset, 0, 0);
5430                         break;
5431                 case BTRFS_EXTENT_DATA_REF_KEY:
5432                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
5433                         add_data_backref(extent_cache, key.objectid, 0,
5434                                         btrfs_extent_data_ref_root(eb, dref),
5435                                         btrfs_extent_data_ref_objectid(eb,
5436                                                                        dref),
5437                                         btrfs_extent_data_ref_offset(eb, dref),
5438                                         btrfs_extent_data_ref_count(eb, dref),
5439                                         0, num_bytes);
5440                         break;
5441                 case BTRFS_SHARED_DATA_REF_KEY:
5442                         sref = (struct btrfs_shared_data_ref *)(iref + 1);
5443                         add_data_backref(extent_cache, key.objectid, offset,
5444                                         0, 0, 0,
5445                                         btrfs_shared_data_ref_count(eb, sref),
5446                                         0, num_bytes);
5447                         break;
5448                 default:
5449                         fprintf(stderr, "corrupt extent record: key %Lu %u %Lu\n",
5450                                 key.objectid, key.type, num_bytes);
5451                         goto out;
5452                 }
5453                 ptr += btrfs_extent_inline_ref_size(type);
5454         }
5455         WARN_ON(ptr > end);
5456 out:
5457         return 0;
5458 }
5459
5460 static int check_cache_range(struct btrfs_root *root,
5461                              struct btrfs_block_group_cache *cache,
5462                              u64 offset, u64 bytes)
5463 {
5464         struct btrfs_free_space *entry;
5465         u64 *logical;
5466         u64 bytenr;
5467         int stripe_len;
5468         int i, nr, ret;
5469
5470         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
5471                 bytenr = btrfs_sb_offset(i);
5472                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
5473                                        cache->key.objectid, bytenr, 0,
5474                                        &logical, &nr, &stripe_len);
5475                 if (ret)
5476                         return ret;
5477
5478                 while (nr--) {
5479                         if (logical[nr] + stripe_len <= offset)
5480                                 continue;
5481                         if (offset + bytes <= logical[nr])
5482                                 continue;
5483                         if (logical[nr] == offset) {
5484                                 if (stripe_len >= bytes) {
5485                                         kfree(logical);
5486                                         return 0;
5487                                 }
5488                                 bytes -= stripe_len;
5489                                 offset += stripe_len;
5490                         } else if (logical[nr] < offset) {
5491                                 if (logical[nr] + stripe_len >=
5492                                     offset + bytes) {
5493                                         kfree(logical);
5494                                         return 0;
5495                                 }
5496                                 bytes = (offset + bytes) -
5497                                         (logical[nr] + stripe_len);
5498                                 offset = logical[nr] + stripe_len;
5499                         } else {
5500                                 /*
5501                                  * Could be tricky, the super may land in the
5502                                  * middle of the area we're checking.  First
5503                                  * check the easiest case, it's at the end.
5504                                  */
5505                                 if (logical[nr] + stripe_len >=
5506                                     bytes + offset) {
5507                                         bytes = logical[nr] - offset;
5508                                         continue;
5509                                 }
5510
5511                                 /* Check the left side */
5512                                 ret = check_cache_range(root, cache,
5513                                                         offset,
5514                                                         logical[nr] - offset);
5515                                 if (ret) {
5516                                         kfree(logical);
5517                                         return ret;
5518                                 }
5519
5520                                 /* Now we continue with the right side */
5521                                 bytes = (offset + bytes) -
5522                                         (logical[nr] + stripe_len);
5523                                 offset = logical[nr] + stripe_len;
5524                         }
5525                 }
5526
5527                 kfree(logical);
5528         }
5529
5530         entry = btrfs_find_free_space(cache->free_space_ctl, offset, bytes);
5531         if (!entry) {
5532                 fprintf(stderr, "There is no free space entry for %Lu-%Lu\n",
5533                         offset, offset+bytes);
5534                 return -EINVAL;
5535         }
5536
5537         if (entry->offset != offset) {
5538                 fprintf(stderr, "Wanted offset %Lu, found %Lu\n", offset,
5539                         entry->offset);
5540                 return -EINVAL;
5541         }
5542
5543         if (entry->bytes != bytes) {
5544                 fprintf(stderr, "Wanted bytes %Lu, found %Lu for off %Lu\n",
5545                         bytes, entry->bytes, offset);
5546                 return -EINVAL;
5547         }
5548
5549         unlink_free_space(cache->free_space_ctl, entry);
5550         free(entry);
5551         return 0;
5552 }
5553
5554 static int verify_space_cache(struct btrfs_root *root,
5555                               struct btrfs_block_group_cache *cache)
5556 {
5557         struct btrfs_path *path;
5558         struct extent_buffer *leaf;
5559         struct btrfs_key key;
5560         u64 last;
5561         int ret = 0;
5562
5563         path = btrfs_alloc_path();
5564         if (!path)
5565                 return -ENOMEM;
5566
5567         root = root->fs_info->extent_root;
5568
5569         last = max_t(u64, cache->key.objectid, BTRFS_SUPER_INFO_OFFSET);
5570
5571         key.objectid = last;
5572         key.offset = 0;
5573         key.type = BTRFS_EXTENT_ITEM_KEY;
5574
5575         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5576         if (ret < 0)
5577                 goto out;
5578         ret = 0;
5579         while (1) {
5580                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5581                         ret = btrfs_next_leaf(root, path);
5582                         if (ret < 0)
5583                                 goto out;
5584                         if (ret > 0) {
5585                                 ret = 0;
5586                                 break;
5587                         }
5588                 }
5589                 leaf = path->nodes[0];
5590                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5591                 if (key.objectid >= cache->key.offset + cache->key.objectid)
5592                         break;
5593                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
5594                     key.type != BTRFS_METADATA_ITEM_KEY) {
5595                         path->slots[0]++;
5596                         continue;
5597                 }
5598
5599                 if (last == key.objectid) {
5600                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
5601                                 last = key.objectid + key.offset;
5602                         else
5603                                 last = key.objectid + root->nodesize;
5604                         path->slots[0]++;
5605                         continue;
5606                 }
5607
5608                 ret = check_cache_range(root, cache, last,
5609                                         key.objectid - last);
5610                 if (ret)
5611                         break;
5612                 if (key.type == BTRFS_EXTENT_ITEM_KEY)
5613                         last = key.objectid + key.offset;
5614                 else
5615                         last = key.objectid + root->nodesize;
5616                 path->slots[0]++;
5617         }
5618
5619         if (last < cache->key.objectid + cache->key.offset)
5620                 ret = check_cache_range(root, cache, last,
5621                                         cache->key.objectid +
5622                                         cache->key.offset - last);
5623
5624 out:
5625         btrfs_free_path(path);
5626
5627         if (!ret &&
5628             !RB_EMPTY_ROOT(&cache->free_space_ctl->free_space_offset)) {
5629                 fprintf(stderr, "There are still entries left in the space "
5630                         "cache\n");
5631                 ret = -EINVAL;
5632         }
5633
5634         return ret;
5635 }
5636
5637 static int check_space_cache(struct btrfs_root *root)
5638 {
5639         struct btrfs_block_group_cache *cache;
5640         u64 start = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
5641         int ret;
5642         int error = 0;
5643
5644         if (btrfs_super_cache_generation(root->fs_info->super_copy) != -1ULL &&
5645             btrfs_super_generation(root->fs_info->super_copy) !=
5646             btrfs_super_cache_generation(root->fs_info->super_copy)) {
5647                 printf("cache and super generation don't match, space cache "
5648                        "will be invalidated\n");
5649                 return 0;
5650         }
5651
5652         if (ctx.progress_enabled) {
5653                 ctx.tp = TASK_FREE_SPACE;
5654                 task_start(ctx.info);
5655         }
5656
5657         while (1) {
5658                 cache = btrfs_lookup_first_block_group(root->fs_info, start);
5659                 if (!cache)
5660                         break;
5661
5662                 start = cache->key.objectid + cache->key.offset;
5663                 if (!cache->free_space_ctl) {
5664                         if (btrfs_init_free_space_ctl(cache,
5665                                                       root->sectorsize)) {
5666                                 ret = -ENOMEM;
5667                                 break;
5668                         }
5669                 } else {
5670                         btrfs_remove_free_space_cache(cache);
5671                 }
5672
5673                 if (btrfs_fs_compat_ro(root->fs_info,
5674                                        BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)) {
5675                         ret = exclude_super_stripes(root, cache);
5676                         if (ret) {
5677                                 fprintf(stderr, "could not exclude super stripes: %s\n",
5678                                         strerror(-ret));
5679                                 error++;
5680                                 continue;
5681                         }
5682                         ret = load_free_space_tree(root->fs_info, cache);
5683                         free_excluded_extents(root, cache);
5684                         if (ret < 0) {
5685                                 fprintf(stderr, "could not load free space tree: %s\n",
5686                                         strerror(-ret));
5687                                 error++;
5688                                 continue;
5689                         }
5690                         error += ret;
5691                 } else {
5692                         ret = load_free_space_cache(root->fs_info, cache);
5693                         if (!ret)
5694                                 continue;
5695                 }
5696
5697                 ret = verify_space_cache(root, cache);
5698                 if (ret) {
5699                         fprintf(stderr, "cache appears valid but isn't %Lu\n",
5700                                 cache->key.objectid);
5701                         error++;
5702                 }
5703         }
5704
5705         task_stop(ctx.info);
5706
5707         return error ? -EINVAL : 0;
5708 }
5709
5710 static int check_extent_csums(struct btrfs_root *root, u64 bytenr,
5711                         u64 num_bytes, unsigned long leaf_offset,
5712                         struct extent_buffer *eb) {
5713
5714         u64 offset = 0;
5715         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
5716         char *data;
5717         unsigned long csum_offset;
5718         u32 csum;
5719         u32 csum_expected;
5720         u64 read_len;
5721         u64 data_checked = 0;
5722         u64 tmp;
5723         int ret = 0;
5724         int mirror;
5725         int num_copies;
5726
5727         if (num_bytes % root->sectorsize)
5728                 return -EINVAL;
5729
5730         data = malloc(num_bytes);
5731         if (!data)
5732                 return -ENOMEM;
5733
5734         while (offset < num_bytes) {
5735                 mirror = 0;
5736 again:
5737                 read_len = num_bytes - offset;
5738                 /* read as much space once a time */
5739                 ret = read_extent_data(root, data + offset,
5740                                 bytenr + offset, &read_len, mirror);
5741                 if (ret)
5742                         goto out;
5743                 data_checked = 0;
5744                 /* verify every 4k data's checksum */
5745                 while (data_checked < read_len) {
5746                         csum = ~(u32)0;
5747                         tmp = offset + data_checked;
5748
5749                         csum = btrfs_csum_data(NULL, (char *)data + tmp,
5750                                                csum, root->sectorsize);
5751                         btrfs_csum_final(csum, (char *)&csum);
5752
5753                         csum_offset = leaf_offset +
5754                                  tmp / root->sectorsize * csum_size;
5755                         read_extent_buffer(eb, (char *)&csum_expected,
5756                                            csum_offset, csum_size);
5757                         /* try another mirror */
5758                         if (csum != csum_expected) {
5759                                 fprintf(stderr, "mirror %d bytenr %llu csum %u expected csum %u\n",
5760                                                 mirror, bytenr + tmp,
5761                                                 csum, csum_expected);
5762                                 num_copies = btrfs_num_copies(
5763                                                 &root->fs_info->mapping_tree,
5764                                                 bytenr, num_bytes);
5765                                 if (mirror < num_copies - 1) {
5766                                         mirror += 1;
5767                                         goto again;
5768                                 }
5769                         }
5770                         data_checked += root->sectorsize;
5771                 }
5772                 offset += read_len;
5773         }
5774 out:
5775         free(data);
5776         return ret;
5777 }
5778
5779 static int check_extent_exists(struct btrfs_root *root, u64 bytenr,
5780                                u64 num_bytes)
5781 {
5782         struct btrfs_path *path;
5783         struct extent_buffer *leaf;
5784         struct btrfs_key key;
5785         int ret;
5786
5787         path = btrfs_alloc_path();
5788         if (!path) {
5789                 fprintf(stderr, "Error allocating path\n");
5790                 return -ENOMEM;
5791         }
5792
5793         key.objectid = bytenr;
5794         key.type = BTRFS_EXTENT_ITEM_KEY;
5795         key.offset = (u64)-1;
5796
5797 again:
5798         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
5799                                 0, 0);
5800         if (ret < 0) {
5801                 fprintf(stderr, "Error looking up extent record %d\n", ret);
5802                 btrfs_free_path(path);
5803                 return ret;
5804         } else if (ret) {
5805                 if (path->slots[0] > 0) {
5806                         path->slots[0]--;
5807                 } else {
5808                         ret = btrfs_prev_leaf(root, path);
5809                         if (ret < 0) {
5810                                 goto out;
5811                         } else if (ret > 0) {
5812                                 ret = 0;
5813                                 goto out;
5814                         }
5815                 }
5816         }
5817
5818         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
5819
5820         /*
5821          * Block group items come before extent items if they have the same
5822          * bytenr, so walk back one more just in case.  Dear future traveller,
5823          * first congrats on mastering time travel.  Now if it's not too much
5824          * trouble could you go back to 2006 and tell Chris to make the
5825          * BLOCK_GROUP_ITEM_KEY (and BTRFS_*_REF_KEY) lower than the
5826          * EXTENT_ITEM_KEY please?
5827          */
5828         while (key.type > BTRFS_EXTENT_ITEM_KEY) {
5829                 if (path->slots[0] > 0) {
5830                         path->slots[0]--;
5831                 } else {
5832                         ret = btrfs_prev_leaf(root, path);
5833                         if (ret < 0) {
5834                                 goto out;
5835                         } else if (ret > 0) {
5836                                 ret = 0;
5837                                 goto out;
5838                         }
5839                 }
5840                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
5841         }
5842
5843         while (num_bytes) {
5844                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5845                         ret = btrfs_next_leaf(root, path);
5846                         if (ret < 0) {
5847                                 fprintf(stderr, "Error going to next leaf "
5848                                         "%d\n", ret);
5849                                 btrfs_free_path(path);
5850                                 return ret;
5851                         } else if (ret) {
5852                                 break;
5853                         }
5854                 }
5855                 leaf = path->nodes[0];
5856                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5857                 if (key.type != BTRFS_EXTENT_ITEM_KEY) {
5858                         path->slots[0]++;
5859                         continue;
5860                 }
5861                 if (key.objectid + key.offset < bytenr) {
5862                         path->slots[0]++;
5863                         continue;
5864                 }
5865                 if (key.objectid > bytenr + num_bytes)
5866                         break;
5867
5868                 if (key.objectid == bytenr) {
5869                         if (key.offset >= num_bytes) {
5870                                 num_bytes = 0;
5871                                 break;
5872                         }
5873                         num_bytes -= key.offset;
5874                         bytenr += key.offset;
5875                 } else if (key.objectid < bytenr) {
5876                         if (key.objectid + key.offset >= bytenr + num_bytes) {
5877                                 num_bytes = 0;
5878                                 break;
5879                         }
5880                         num_bytes = (bytenr + num_bytes) -
5881                                 (key.objectid + key.offset);
5882                         bytenr = key.objectid + key.offset;
5883                 } else {
5884                         if (key.objectid + key.offset < bytenr + num_bytes) {
5885                                 u64 new_start = key.objectid + key.offset;
5886                                 u64 new_bytes = bytenr + num_bytes - new_start;
5887
5888                                 /*
5889                                  * Weird case, the extent is in the middle of
5890                                  * our range, we'll have to search one side
5891                                  * and then the other.  Not sure if this happens
5892                                  * in real life, but no harm in coding it up
5893                                  * anyway just in case.
5894                                  */
5895                                 btrfs_release_path(path);
5896                                 ret = check_extent_exists(root, new_start,
5897                                                           new_bytes);
5898                                 if (ret) {
5899                                         fprintf(stderr, "Right section didn't "
5900                                                 "have a record\n");
5901                                         break;
5902                                 }
5903                                 num_bytes = key.objectid - bytenr;
5904                                 goto again;
5905                         }
5906                         num_bytes = key.objectid - bytenr;
5907                 }
5908                 path->slots[0]++;
5909         }
5910         ret = 0;
5911
5912 out:
5913         if (num_bytes && !ret) {
5914                 fprintf(stderr, "There are no extents for csum range "
5915                         "%Lu-%Lu\n", bytenr, bytenr+num_bytes);
5916                 ret = 1;
5917         }
5918
5919         btrfs_free_path(path);
5920         return ret;
5921 }
5922
5923 static int check_csums(struct btrfs_root *root)
5924 {
5925         struct btrfs_path *path;
5926         struct extent_buffer *leaf;
5927         struct btrfs_key key;
5928         u64 offset = 0, num_bytes = 0;
5929         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
5930         int errors = 0;
5931         int ret;
5932         u64 data_len;
5933         unsigned long leaf_offset;
5934
5935         root = root->fs_info->csum_root;
5936         if (!extent_buffer_uptodate(root->node)) {
5937                 fprintf(stderr, "No valid csum tree found\n");
5938                 return -ENOENT;
5939         }
5940
5941         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
5942         key.type = BTRFS_EXTENT_CSUM_KEY;
5943         key.offset = 0;
5944
5945         path = btrfs_alloc_path();
5946         if (!path)
5947                 return -ENOMEM;
5948
5949         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5950         if (ret < 0) {
5951                 fprintf(stderr, "Error searching csum tree %d\n", ret);
5952                 btrfs_free_path(path);
5953                 return ret;
5954         }
5955
5956         if (ret > 0 && path->slots[0])
5957                 path->slots[0]--;
5958         ret = 0;
5959
5960         while (1) {
5961                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5962                         ret = btrfs_next_leaf(root, path);
5963                         if (ret < 0) {
5964                                 fprintf(stderr, "Error going to next leaf "
5965                                         "%d\n", ret);
5966                                 break;
5967                         }
5968                         if (ret)
5969                                 break;
5970                 }
5971                 leaf = path->nodes[0];
5972
5973                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5974                 if (key.type != BTRFS_EXTENT_CSUM_KEY) {
5975                         path->slots[0]++;
5976                         continue;
5977                 }
5978
5979                 data_len = (btrfs_item_size_nr(leaf, path->slots[0]) /
5980                               csum_size) * root->sectorsize;
5981                 if (!check_data_csum)
5982                         goto skip_csum_check;
5983                 leaf_offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
5984                 ret = check_extent_csums(root, key.offset, data_len,
5985                                          leaf_offset, leaf);
5986                 if (ret)
5987                         break;
5988 skip_csum_check:
5989                 if (!num_bytes) {
5990                         offset = key.offset;
5991                 } else if (key.offset != offset + num_bytes) {
5992                         ret = check_extent_exists(root, offset, num_bytes);
5993                         if (ret) {
5994                                 fprintf(stderr, "Csum exists for %Lu-%Lu but "
5995                                         "there is no extent record\n",
5996                                         offset, offset+num_bytes);
5997                                 errors++;
5998                         }
5999                         offset = key.offset;
6000                         num_bytes = 0;
6001                 }
6002                 num_bytes += data_len;
6003                 path->slots[0]++;
6004         }
6005
6006         btrfs_free_path(path);
6007         return errors;
6008 }
6009
6010 static int is_dropped_key(struct btrfs_key *key,
6011                           struct btrfs_key *drop_key) {
6012         if (key->objectid < drop_key->objectid)
6013                 return 1;
6014         else if (key->objectid == drop_key->objectid) {
6015                 if (key->type < drop_key->type)
6016                         return 1;
6017                 else if (key->type == drop_key->type) {
6018                         if (key->offset < drop_key->offset)
6019                                 return 1;
6020                 }
6021         }
6022         return 0;
6023 }
6024
6025 /*
6026  * Here are the rules for FULL_BACKREF.
6027  *
6028  * 1) If BTRFS_HEADER_FLAG_RELOC is set then we have FULL_BACKREF set.
6029  * 2) If btrfs_header_owner(buf) no longer points to buf then we have
6030  *      FULL_BACKREF set.
6031  * 3) We cowed the block walking down a reloc tree.  This is impossible to tell
6032  *    if it happened after the relocation occurred since we'll have dropped the
6033  *    reloc root, so it's entirely possible to have FULL_BACKREF set on buf and
6034  *    have no real way to know for sure.
6035  *
6036  * We process the blocks one root at a time, and we start from the lowest root
6037  * objectid and go to the highest.  So we can just lookup the owner backref for
6038  * the record and if we don't find it then we know it doesn't exist and we have
6039  * a FULL BACKREF.
6040  *
6041  * FIXME: if we ever start reclaiming root objectid's then we need to fix this
6042  * assumption and simply indicate that we _think_ that the FULL BACKREF needs to
6043  * be set or not and then we can check later once we've gathered all the refs.
6044  */
6045 static int calc_extent_flag(struct btrfs_root *root,
6046                            struct cache_tree *extent_cache,
6047                            struct extent_buffer *buf,
6048                            struct root_item_record *ri,
6049                            u64 *flags)
6050 {
6051         struct extent_record *rec;
6052         struct cache_extent *cache;
6053         struct tree_backref *tback;
6054         u64 owner = 0;
6055
6056         cache = lookup_cache_extent(extent_cache, buf->start, 1);
6057         /* we have added this extent before */
6058         BUG_ON(!cache);
6059         rec = container_of(cache, struct extent_record, cache);
6060
6061         /*
6062          * Except file/reloc tree, we can not have
6063          * FULL BACKREF MODE
6064          */
6065         if (ri->objectid < BTRFS_FIRST_FREE_OBJECTID)
6066                 goto normal;
6067         /*
6068          * root node
6069          */
6070         if (buf->start == ri->bytenr)
6071                 goto normal;
6072
6073         if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
6074                 goto full_backref;
6075
6076         owner = btrfs_header_owner(buf);
6077         if (owner == ri->objectid)
6078                 goto normal;
6079
6080         tback = find_tree_backref(rec, 0, owner);
6081         if (!tback)
6082                 goto full_backref;
6083 normal:
6084         *flags = 0;
6085         if (rec->flag_block_full_backref != FLAG_UNSET &&
6086             rec->flag_block_full_backref != 0)
6087                 rec->bad_full_backref = 1;
6088         return 0;
6089 full_backref:
6090         *flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6091         if (rec->flag_block_full_backref != FLAG_UNSET &&
6092             rec->flag_block_full_backref != 1)
6093                 rec->bad_full_backref = 1;
6094         return 0;
6095 }
6096
6097 static int run_next_block(struct btrfs_root *root,
6098                           struct block_info *bits,
6099                           int bits_nr,
6100                           u64 *last,
6101                           struct cache_tree *pending,
6102                           struct cache_tree *seen,
6103                           struct cache_tree *reada,
6104                           struct cache_tree *nodes,
6105                           struct cache_tree *extent_cache,
6106                           struct cache_tree *chunk_cache,
6107                           struct rb_root *dev_cache,
6108                           struct block_group_tree *block_group_cache,
6109                           struct device_extent_tree *dev_extent_cache,
6110                           struct root_item_record *ri)
6111 {
6112         struct extent_buffer *buf;
6113         struct extent_record *rec = NULL;
6114         u64 bytenr;
6115         u32 size;
6116         u64 parent;
6117         u64 owner;
6118         u64 flags;
6119         u64 ptr;
6120         u64 gen = 0;
6121         int ret = 0;
6122         int i;
6123         int nritems;
6124         struct btrfs_key key;
6125         struct cache_extent *cache;
6126         int reada_bits;
6127
6128         nritems = pick_next_pending(pending, reada, nodes, *last, bits,
6129                                     bits_nr, &reada_bits);
6130         if (nritems == 0)
6131                 return 1;
6132
6133         if (!reada_bits) {
6134                 for(i = 0; i < nritems; i++) {
6135                         ret = add_cache_extent(reada, bits[i].start,
6136                                                bits[i].size);
6137                         if (ret == -EEXIST)
6138                                 continue;
6139
6140                         /* fixme, get the parent transid */
6141                         readahead_tree_block(root, bits[i].start,
6142                                              bits[i].size, 0);
6143                 }
6144         }
6145         *last = bits[0].start;
6146         bytenr = bits[0].start;
6147         size = bits[0].size;
6148
6149         cache = lookup_cache_extent(pending, bytenr, size);
6150         if (cache) {
6151                 remove_cache_extent(pending, cache);
6152                 free(cache);
6153         }
6154         cache = lookup_cache_extent(reada, bytenr, size);
6155         if (cache) {
6156                 remove_cache_extent(reada, cache);
6157                 free(cache);
6158         }
6159         cache = lookup_cache_extent(nodes, bytenr, size);
6160         if (cache) {
6161                 remove_cache_extent(nodes, cache);
6162                 free(cache);
6163         }
6164         cache = lookup_cache_extent(extent_cache, bytenr, size);
6165         if (cache) {
6166                 rec = container_of(cache, struct extent_record, cache);
6167                 gen = rec->parent_generation;
6168         }
6169
6170         /* fixme, get the real parent transid */
6171         buf = read_tree_block(root, bytenr, size, gen);
6172         if (!extent_buffer_uptodate(buf)) {
6173                 record_bad_block_io(root->fs_info,
6174                                     extent_cache, bytenr, size);
6175                 goto out;
6176         }
6177
6178         nritems = btrfs_header_nritems(buf);
6179
6180         flags = 0;
6181         if (!init_extent_tree) {
6182                 ret = btrfs_lookup_extent_info(NULL, root, bytenr,
6183                                        btrfs_header_level(buf), 1, NULL,
6184                                        &flags);
6185                 if (ret < 0) {
6186                         ret = calc_extent_flag(root, extent_cache, buf, ri, &flags);
6187                         if (ret < 0) {
6188                                 fprintf(stderr, "Couldn't calc extent flags\n");
6189                                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6190                         }
6191                 }
6192         } else {
6193                 flags = 0;
6194                 ret = calc_extent_flag(root, extent_cache, buf, ri, &flags);
6195                 if (ret < 0) {
6196                         fprintf(stderr, "Couldn't calc extent flags\n");
6197                         flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6198                 }
6199         }
6200
6201         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6202                 if (ri != NULL &&
6203                     ri->objectid != BTRFS_TREE_RELOC_OBJECTID &&
6204                     ri->objectid == btrfs_header_owner(buf)) {
6205                         /*
6206                          * Ok we got to this block from it's original owner and
6207                          * we have FULL_BACKREF set.  Relocation can leave
6208                          * converted blocks over so this is altogether possible,
6209                          * however it's not possible if the generation > the
6210                          * last snapshot, so check for this case.
6211                          */
6212                         if (!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC) &&
6213                             btrfs_header_generation(buf) > ri->last_snapshot) {
6214                                 flags &= ~BTRFS_BLOCK_FLAG_FULL_BACKREF;
6215                                 rec->bad_full_backref = 1;
6216                         }
6217                 }
6218         } else {
6219                 if (ri != NULL &&
6220                     (ri->objectid == BTRFS_TREE_RELOC_OBJECTID ||
6221                      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
6222                         flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6223                         rec->bad_full_backref = 1;
6224                 }
6225         }
6226
6227         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6228                 rec->flag_block_full_backref = 1;
6229                 parent = bytenr;
6230                 owner = 0;
6231         } else {
6232                 rec->flag_block_full_backref = 0;
6233                 parent = 0;
6234                 owner = btrfs_header_owner(buf);
6235         }
6236
6237         ret = check_block(root, extent_cache, buf, flags);
6238         if (ret)
6239                 goto out;
6240
6241         if (btrfs_is_leaf(buf)) {
6242                 btree_space_waste += btrfs_leaf_free_space(root, buf);
6243                 for (i = 0; i < nritems; i++) {
6244                         struct btrfs_file_extent_item *fi;
6245                         btrfs_item_key_to_cpu(buf, &key, i);
6246                         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
6247                                 process_extent_item(root, extent_cache, buf,
6248                                                     i);
6249                                 continue;
6250                         }
6251                         if (key.type == BTRFS_METADATA_ITEM_KEY) {
6252                                 process_extent_item(root, extent_cache, buf,
6253                                                     i);
6254                                 continue;
6255                         }
6256                         if (key.type == BTRFS_EXTENT_CSUM_KEY) {
6257                                 total_csum_bytes +=
6258                                         btrfs_item_size_nr(buf, i);
6259                                 continue;
6260                         }
6261                         if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6262                                 process_chunk_item(chunk_cache, &key, buf, i);
6263                                 continue;
6264                         }
6265                         if (key.type == BTRFS_DEV_ITEM_KEY) {
6266                                 process_device_item(dev_cache, &key, buf, i);
6267                                 continue;
6268                         }
6269                         if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6270                                 process_block_group_item(block_group_cache,
6271                                         &key, buf, i);
6272                                 continue;
6273                         }
6274                         if (key.type == BTRFS_DEV_EXTENT_KEY) {
6275                                 process_device_extent_item(dev_extent_cache,
6276                                         &key, buf, i);
6277                                 continue;
6278
6279                         }
6280                         if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
6281 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6282                                 process_extent_ref_v0(extent_cache, buf, i);
6283 #else
6284                                 BUG();
6285 #endif
6286                                 continue;
6287                         }
6288
6289                         if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
6290                                 add_tree_backref(extent_cache, key.objectid, 0,
6291                                                  key.offset, 0);
6292                                 continue;
6293                         }
6294                         if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
6295                                 add_tree_backref(extent_cache, key.objectid,
6296                                                  key.offset, 0, 0);
6297                                 continue;
6298                         }
6299                         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
6300                                 struct btrfs_extent_data_ref *ref;
6301                                 ref = btrfs_item_ptr(buf, i,
6302                                                 struct btrfs_extent_data_ref);
6303                                 add_data_backref(extent_cache,
6304                                         key.objectid, 0,
6305                                         btrfs_extent_data_ref_root(buf, ref),
6306                                         btrfs_extent_data_ref_objectid(buf,
6307                                                                        ref),
6308                                         btrfs_extent_data_ref_offset(buf, ref),
6309                                         btrfs_extent_data_ref_count(buf, ref),
6310                                         0, root->sectorsize);
6311                                 continue;
6312                         }
6313                         if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
6314                                 struct btrfs_shared_data_ref *ref;
6315                                 ref = btrfs_item_ptr(buf, i,
6316                                                 struct btrfs_shared_data_ref);
6317                                 add_data_backref(extent_cache,
6318                                         key.objectid, key.offset, 0, 0, 0,
6319                                         btrfs_shared_data_ref_count(buf, ref),
6320                                         0, root->sectorsize);
6321                                 continue;
6322                         }
6323                         if (key.type == BTRFS_ORPHAN_ITEM_KEY) {
6324                                 struct bad_item *bad;
6325
6326                                 if (key.objectid == BTRFS_ORPHAN_OBJECTID)
6327                                         continue;
6328                                 if (!owner)
6329                                         continue;
6330                                 bad = malloc(sizeof(struct bad_item));
6331                                 if (!bad)
6332                                         continue;
6333                                 INIT_LIST_HEAD(&bad->list);
6334                                 memcpy(&bad->key, &key,
6335                                        sizeof(struct btrfs_key));
6336                                 bad->root_id = owner;
6337                                 list_add_tail(&bad->list, &delete_items);
6338                                 continue;
6339                         }
6340                         if (key.type != BTRFS_EXTENT_DATA_KEY)
6341                                 continue;
6342                         fi = btrfs_item_ptr(buf, i,
6343                                             struct btrfs_file_extent_item);
6344                         if (btrfs_file_extent_type(buf, fi) ==
6345                             BTRFS_FILE_EXTENT_INLINE)
6346                                 continue;
6347                         if (btrfs_file_extent_disk_bytenr(buf, fi) == 0)
6348                                 continue;
6349
6350                         data_bytes_allocated +=
6351                                 btrfs_file_extent_disk_num_bytes(buf, fi);
6352                         if (data_bytes_allocated < root->sectorsize) {
6353                                 abort();
6354                         }
6355                         data_bytes_referenced +=
6356                                 btrfs_file_extent_num_bytes(buf, fi);
6357                         add_data_backref(extent_cache,
6358                                 btrfs_file_extent_disk_bytenr(buf, fi),
6359                                 parent, owner, key.objectid, key.offset -
6360                                 btrfs_file_extent_offset(buf, fi), 1, 1,
6361                                 btrfs_file_extent_disk_num_bytes(buf, fi));
6362                 }
6363         } else {
6364                 int level;
6365                 struct btrfs_key first_key;
6366
6367                 first_key.objectid = 0;
6368
6369                 if (nritems > 0)
6370                         btrfs_item_key_to_cpu(buf, &first_key, 0);
6371                 level = btrfs_header_level(buf);
6372                 for (i = 0; i < nritems; i++) {
6373                         struct extent_record tmpl;
6374
6375                         ptr = btrfs_node_blockptr(buf, i);
6376                         size = root->nodesize;
6377                         btrfs_node_key_to_cpu(buf, &key, i);
6378                         if (ri != NULL) {
6379                                 if ((level == ri->drop_level)
6380                                     && is_dropped_key(&key, &ri->drop_key)) {
6381                                         continue;
6382                                 }
6383                         }
6384
6385                         memset(&tmpl, 0, sizeof(tmpl));
6386                         btrfs_cpu_key_to_disk(&tmpl.parent_key, &key);
6387                         tmpl.parent_generation = btrfs_node_ptr_generation(buf, i);
6388                         tmpl.start = ptr;
6389                         tmpl.nr = size;
6390                         tmpl.refs = 1;
6391                         tmpl.metadata = 1;
6392                         tmpl.max_size = size;
6393                         ret = add_extent_rec(extent_cache, &tmpl);
6394                         BUG_ON(ret);
6395
6396                         add_tree_backref(extent_cache, ptr, parent, owner, 1);
6397
6398                         if (level > 1) {
6399                                 add_pending(nodes, seen, ptr, size);
6400                         } else {
6401                                 add_pending(pending, seen, ptr, size);
6402                         }
6403                 }
6404                 btree_space_waste += (BTRFS_NODEPTRS_PER_BLOCK(root) -
6405                                       nritems) * sizeof(struct btrfs_key_ptr);
6406         }
6407         total_btree_bytes += buf->len;
6408         if (fs_root_objectid(btrfs_header_owner(buf)))
6409                 total_fs_tree_bytes += buf->len;
6410         if (btrfs_header_owner(buf) == BTRFS_EXTENT_TREE_OBJECTID)
6411                 total_extent_tree_bytes += buf->len;
6412         if (!found_old_backref &&
6413             btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID &&
6414             btrfs_header_backref_rev(buf) == BTRFS_MIXED_BACKREF_REV &&
6415             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
6416                 found_old_backref = 1;
6417 out:
6418         free_extent_buffer(buf);
6419         return ret;
6420 }
6421
6422 static int add_root_to_pending(struct extent_buffer *buf,
6423                                struct cache_tree *extent_cache,
6424                                struct cache_tree *pending,
6425                                struct cache_tree *seen,
6426                                struct cache_tree *nodes,
6427                                u64 objectid)
6428 {
6429         struct extent_record tmpl;
6430
6431         if (btrfs_header_level(buf) > 0)
6432                 add_pending(nodes, seen, buf->start, buf->len);
6433         else
6434                 add_pending(pending, seen, buf->start, buf->len);
6435
6436         memset(&tmpl, 0, sizeof(tmpl));
6437         tmpl.start = buf->start;
6438         tmpl.nr = buf->len;
6439         tmpl.is_root = 1;
6440         tmpl.refs = 1;
6441         tmpl.metadata = 1;
6442         tmpl.max_size = buf->len;
6443         add_extent_rec(extent_cache, &tmpl);
6444
6445         if (objectid == BTRFS_TREE_RELOC_OBJECTID ||
6446             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
6447                 add_tree_backref(extent_cache, buf->start, buf->start,
6448                                  0, 1);
6449         else
6450                 add_tree_backref(extent_cache, buf->start, 0, objectid, 1);
6451         return 0;
6452 }
6453
6454 /* as we fix the tree, we might be deleting blocks that
6455  * we're tracking for repair.  This hook makes sure we
6456  * remove any backrefs for blocks as we are fixing them.
6457  */
6458 static int free_extent_hook(struct btrfs_trans_handle *trans,
6459                             struct btrfs_root *root,
6460                             u64 bytenr, u64 num_bytes, u64 parent,
6461                             u64 root_objectid, u64 owner, u64 offset,
6462                             int refs_to_drop)
6463 {
6464         struct extent_record *rec;
6465         struct cache_extent *cache;
6466         int is_data;
6467         struct cache_tree *extent_cache = root->fs_info->fsck_extent_cache;
6468
6469         is_data = owner >= BTRFS_FIRST_FREE_OBJECTID;
6470         cache = lookup_cache_extent(extent_cache, bytenr, num_bytes);
6471         if (!cache)
6472                 return 0;
6473
6474         rec = container_of(cache, struct extent_record, cache);
6475         if (is_data) {
6476                 struct data_backref *back;
6477                 back = find_data_backref(rec, parent, root_objectid, owner,
6478                                          offset, 1, bytenr, num_bytes);
6479                 if (!back)
6480                         goto out;
6481                 if (back->node.found_ref) {
6482                         back->found_ref -= refs_to_drop;
6483                         if (rec->refs)
6484                                 rec->refs -= refs_to_drop;
6485                 }
6486                 if (back->node.found_extent_tree) {
6487                         back->num_refs -= refs_to_drop;
6488                         if (rec->extent_item_refs)
6489                                 rec->extent_item_refs -= refs_to_drop;
6490                 }
6491                 if (back->found_ref == 0)
6492                         back->node.found_ref = 0;
6493                 if (back->num_refs == 0)
6494                         back->node.found_extent_tree = 0;
6495
6496                 if (!back->node.found_extent_tree && back->node.found_ref) {
6497                         rb_erase(&back->node.node, &rec->backref_tree);
6498                         free(back);
6499                 }
6500         } else {
6501                 struct tree_backref *back;
6502                 back = find_tree_backref(rec, parent, root_objectid);
6503                 if (!back)
6504                         goto out;
6505                 if (back->node.found_ref) {
6506                         if (rec->refs)
6507                                 rec->refs--;
6508                         back->node.found_ref = 0;
6509                 }
6510                 if (back->node.found_extent_tree) {
6511                         if (rec->extent_item_refs)
6512                                 rec->extent_item_refs--;
6513                         back->node.found_extent_tree = 0;
6514                 }
6515                 if (!back->node.found_extent_tree && back->node.found_ref) {
6516                         rb_erase(&back->node.node, &rec->backref_tree);
6517                         free(back);
6518                 }
6519         }
6520         maybe_free_extent_rec(extent_cache, rec);
6521 out:
6522         return 0;
6523 }
6524
6525 static int delete_extent_records(struct btrfs_trans_handle *trans,
6526                                  struct btrfs_root *root,
6527                                  struct btrfs_path *path,
6528                                  u64 bytenr, u64 new_len)
6529 {
6530         struct btrfs_key key;
6531         struct btrfs_key found_key;
6532         struct extent_buffer *leaf;
6533         int ret;
6534         int slot;
6535
6536
6537         key.objectid = bytenr;
6538         key.type = (u8)-1;
6539         key.offset = (u64)-1;
6540
6541         while(1) {
6542                 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
6543                                         &key, path, 0, 1);
6544                 if (ret < 0)
6545                         break;
6546
6547                 if (ret > 0) {
6548                         ret = 0;
6549                         if (path->slots[0] == 0)
6550                                 break;
6551                         path->slots[0]--;
6552                 }
6553                 ret = 0;
6554
6555                 leaf = path->nodes[0];
6556                 slot = path->slots[0];
6557
6558                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6559                 if (found_key.objectid != bytenr)
6560                         break;
6561
6562                 if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
6563                     found_key.type != BTRFS_METADATA_ITEM_KEY &&
6564                     found_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
6565                     found_key.type != BTRFS_EXTENT_DATA_REF_KEY &&
6566                     found_key.type != BTRFS_EXTENT_REF_V0_KEY &&
6567                     found_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
6568                     found_key.type != BTRFS_SHARED_DATA_REF_KEY) {
6569                         btrfs_release_path(path);
6570                         if (found_key.type == 0) {
6571                                 if (found_key.offset == 0)
6572                                         break;
6573                                 key.offset = found_key.offset - 1;
6574                                 key.type = found_key.type;
6575                         }
6576                         key.type = found_key.type - 1;
6577                         key.offset = (u64)-1;
6578                         continue;
6579                 }
6580
6581                 fprintf(stderr, "repair deleting extent record: key %Lu %u %Lu\n",
6582                         found_key.objectid, found_key.type, found_key.offset);
6583
6584                 ret = btrfs_del_item(trans, root->fs_info->extent_root, path);
6585                 if (ret)
6586                         break;
6587                 btrfs_release_path(path);
6588
6589                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
6590                     found_key.type == BTRFS_METADATA_ITEM_KEY) {
6591                         u64 bytes = (found_key.type == BTRFS_EXTENT_ITEM_KEY) ?
6592                                 found_key.offset : root->nodesize;
6593
6594                         ret = btrfs_update_block_group(trans, root, bytenr,
6595                                                        bytes, 0, 0);
6596                         if (ret)
6597                                 break;
6598                 }
6599         }
6600
6601         btrfs_release_path(path);
6602         return ret;
6603 }
6604
6605 /*
6606  * for a single backref, this will allocate a new extent
6607  * and add the backref to it.
6608  */
6609 static int record_extent(struct btrfs_trans_handle *trans,
6610                          struct btrfs_fs_info *info,
6611                          struct btrfs_path *path,
6612                          struct extent_record *rec,
6613                          struct extent_backref *back,
6614                          int allocated, u64 flags)
6615 {
6616         int ret;
6617         struct btrfs_root *extent_root = info->extent_root;
6618         struct extent_buffer *leaf;
6619         struct btrfs_key ins_key;
6620         struct btrfs_extent_item *ei;
6621         struct tree_backref *tback;
6622         struct data_backref *dback;
6623         struct btrfs_tree_block_info *bi;
6624
6625         if (!back->is_data)
6626                 rec->max_size = max_t(u64, rec->max_size,
6627                                     info->extent_root->nodesize);
6628
6629         if (!allocated) {
6630                 u32 item_size = sizeof(*ei);
6631
6632                 if (!back->is_data)
6633                         item_size += sizeof(*bi);
6634
6635                 ins_key.objectid = rec->start;
6636                 ins_key.offset = rec->max_size;
6637                 ins_key.type = BTRFS_EXTENT_ITEM_KEY;
6638
6639                 ret = btrfs_insert_empty_item(trans, extent_root, path,
6640                                         &ins_key, item_size);
6641                 if (ret)
6642                         goto fail;
6643
6644                 leaf = path->nodes[0];
6645                 ei = btrfs_item_ptr(leaf, path->slots[0],
6646                                     struct btrfs_extent_item);
6647
6648                 btrfs_set_extent_refs(leaf, ei, 0);
6649                 btrfs_set_extent_generation(leaf, ei, rec->generation);
6650
6651                 if (back->is_data) {
6652                         btrfs_set_extent_flags(leaf, ei,
6653                                                BTRFS_EXTENT_FLAG_DATA);
6654                 } else {
6655                         struct btrfs_disk_key copy_key;;
6656
6657                         tback = to_tree_backref(back);
6658                         bi = (struct btrfs_tree_block_info *)(ei + 1);
6659                         memset_extent_buffer(leaf, 0, (unsigned long)bi,
6660                                              sizeof(*bi));
6661
6662                         btrfs_set_disk_key_objectid(&copy_key,
6663                                                     rec->info_objectid);
6664                         btrfs_set_disk_key_type(&copy_key, 0);
6665                         btrfs_set_disk_key_offset(&copy_key, 0);
6666
6667                         btrfs_set_tree_block_level(leaf, bi, rec->info_level);
6668                         btrfs_set_tree_block_key(leaf, bi, &copy_key);
6669
6670                         btrfs_set_extent_flags(leaf, ei,
6671                                                BTRFS_EXTENT_FLAG_TREE_BLOCK | flags);
6672                 }
6673
6674                 btrfs_mark_buffer_dirty(leaf);
6675                 ret = btrfs_update_block_group(trans, extent_root, rec->start,
6676                                                rec->max_size, 1, 0);
6677                 if (ret)
6678                         goto fail;
6679                 btrfs_release_path(path);
6680         }
6681
6682         if (back->is_data) {
6683                 u64 parent;
6684                 int i;
6685
6686                 dback = to_data_backref(back);
6687                 if (back->full_backref)
6688                         parent = dback->parent;
6689                 else
6690                         parent = 0;
6691
6692                 for (i = 0; i < dback->found_ref; i++) {
6693                         /* if parent != 0, we're doing a full backref
6694                          * passing BTRFS_FIRST_FREE_OBJECTID as the owner
6695                          * just makes the backref allocator create a data
6696                          * backref
6697                          */
6698                         ret = btrfs_inc_extent_ref(trans, info->extent_root,
6699                                                    rec->start, rec->max_size,
6700                                                    parent,
6701                                                    dback->root,
6702                                                    parent ?
6703                                                    BTRFS_FIRST_FREE_OBJECTID :
6704                                                    dback->owner,
6705                                                    dback->offset);
6706                         if (ret)
6707                                 break;
6708                 }
6709                 fprintf(stderr, "adding new data backref"
6710                                 " on %llu %s %llu owner %llu"
6711                                 " offset %llu found %d\n",
6712                                 (unsigned long long)rec->start,
6713                                 back->full_backref ?
6714                                 "parent" : "root",
6715                                 back->full_backref ?
6716                                 (unsigned long long)parent :
6717                                 (unsigned long long)dback->root,
6718                                 (unsigned long long)dback->owner,
6719                                 (unsigned long long)dback->offset,
6720                                 dback->found_ref);
6721         } else {
6722                 u64 parent;
6723
6724                 tback = to_tree_backref(back);
6725                 if (back->full_backref)
6726                         parent = tback->parent;
6727                 else
6728                         parent = 0;
6729
6730                 ret = btrfs_inc_extent_ref(trans, info->extent_root,
6731                                            rec->start, rec->max_size,
6732                                            parent, tback->root, 0, 0);
6733                 fprintf(stderr, "adding new tree backref on "
6734                         "start %llu len %llu parent %llu root %llu\n",
6735                         rec->start, rec->max_size, parent, tback->root);
6736         }
6737 fail:
6738         btrfs_release_path(path);
6739         return ret;
6740 }
6741
6742 static struct extent_entry *find_entry(struct list_head *entries,
6743                                        u64 bytenr, u64 bytes)
6744 {
6745         struct extent_entry *entry = NULL;
6746
6747         list_for_each_entry(entry, entries, list) {
6748                 if (entry->bytenr == bytenr && entry->bytes == bytes)
6749                         return entry;
6750         }
6751
6752         return NULL;
6753 }
6754
6755 static struct extent_entry *find_most_right_entry(struct list_head *entries)
6756 {
6757         struct extent_entry *entry, *best = NULL, *prev = NULL;
6758
6759         list_for_each_entry(entry, entries, list) {
6760                 if (!prev) {
6761                         prev = entry;
6762                         continue;
6763                 }
6764
6765                 /*
6766                  * If there are as many broken entries as entries then we know
6767                  * not to trust this particular entry.
6768                  */
6769                 if (entry->broken == entry->count)
6770                         continue;
6771
6772                 /*
6773                  * If our current entry == best then we can't be sure our best
6774                  * is really the best, so we need to keep searching.
6775                  */
6776                 if (best && best->count == entry->count) {
6777                         prev = entry;
6778                         best = NULL;
6779                         continue;
6780                 }
6781
6782                 /* Prev == entry, not good enough, have to keep searching */
6783                 if (!prev->broken && prev->count == entry->count)
6784                         continue;
6785
6786                 if (!best)
6787                         best = (prev->count > entry->count) ? prev : entry;
6788                 else if (best->count < entry->count)
6789                         best = entry;
6790                 prev = entry;
6791         }
6792
6793         return best;
6794 }
6795
6796 static int repair_ref(struct btrfs_fs_info *info, struct btrfs_path *path,
6797                       struct data_backref *dback, struct extent_entry *entry)
6798 {
6799         struct btrfs_trans_handle *trans;
6800         struct btrfs_root *root;
6801         struct btrfs_file_extent_item *fi;
6802         struct extent_buffer *leaf;
6803         struct btrfs_key key;
6804         u64 bytenr, bytes;
6805         int ret, err;
6806
6807         key.objectid = dback->root;
6808         key.type = BTRFS_ROOT_ITEM_KEY;
6809         key.offset = (u64)-1;
6810         root = btrfs_read_fs_root(info, &key);
6811         if (IS_ERR(root)) {
6812                 fprintf(stderr, "Couldn't find root for our ref\n");
6813                 return -EINVAL;
6814         }
6815
6816         /*
6817          * The backref points to the original offset of the extent if it was
6818          * split, so we need to search down to the offset we have and then walk
6819          * forward until we find the backref we're looking for.
6820          */
6821         key.objectid = dback->owner;
6822         key.type = BTRFS_EXTENT_DATA_KEY;
6823         key.offset = dback->offset;
6824         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6825         if (ret < 0) {
6826                 fprintf(stderr, "Error looking up ref %d\n", ret);
6827                 return ret;
6828         }
6829
6830         while (1) {
6831                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
6832                         ret = btrfs_next_leaf(root, path);
6833                         if (ret) {
6834                                 fprintf(stderr, "Couldn't find our ref, next\n");
6835                                 return -EINVAL;
6836                         }
6837                 }
6838                 leaf = path->nodes[0];
6839                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6840                 if (key.objectid != dback->owner ||
6841                     key.type != BTRFS_EXTENT_DATA_KEY) {
6842                         fprintf(stderr, "Couldn't find our ref, search\n");
6843                         return -EINVAL;
6844                 }
6845                 fi = btrfs_item_ptr(leaf, path->slots[0],
6846                                     struct btrfs_file_extent_item);
6847                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6848                 bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6849
6850                 if (bytenr == dback->disk_bytenr && bytes == dback->bytes)
6851                         break;
6852                 path->slots[0]++;
6853         }
6854
6855         btrfs_release_path(path);
6856
6857         trans = btrfs_start_transaction(root, 1);
6858         if (IS_ERR(trans))
6859                 return PTR_ERR(trans);
6860
6861         /*
6862          * Ok we have the key of the file extent we want to fix, now we can cow
6863          * down to the thing and fix it.
6864          */
6865         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6866         if (ret < 0) {
6867                 fprintf(stderr, "Error cowing down to ref [%Lu, %u, %Lu]: %d\n",
6868                         key.objectid, key.type, key.offset, ret);
6869                 goto out;
6870         }
6871         if (ret > 0) {
6872                 fprintf(stderr, "Well that's odd, we just found this key "
6873                         "[%Lu, %u, %Lu]\n", key.objectid, key.type,
6874                         key.offset);
6875                 ret = -EINVAL;
6876                 goto out;
6877         }
6878         leaf = path->nodes[0];
6879         fi = btrfs_item_ptr(leaf, path->slots[0],
6880                             struct btrfs_file_extent_item);
6881
6882         if (btrfs_file_extent_compression(leaf, fi) &&
6883             dback->disk_bytenr != entry->bytenr) {
6884                 fprintf(stderr, "Ref doesn't match the record start and is "
6885                         "compressed, please take a btrfs-image of this file "
6886                         "system and send it to a btrfs developer so they can "
6887                         "complete this functionality for bytenr %Lu\n",
6888                         dback->disk_bytenr);
6889                 ret = -EINVAL;
6890                 goto out;
6891         }
6892
6893         if (dback->node.broken && dback->disk_bytenr != entry->bytenr) {
6894                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
6895         } else if (dback->disk_bytenr > entry->bytenr) {
6896                 u64 off_diff, offset;
6897
6898                 off_diff = dback->disk_bytenr - entry->bytenr;
6899                 offset = btrfs_file_extent_offset(leaf, fi);
6900                 if (dback->disk_bytenr + offset +
6901                     btrfs_file_extent_num_bytes(leaf, fi) >
6902                     entry->bytenr + entry->bytes) {
6903                         fprintf(stderr, "Ref is past the entry end, please "
6904                                 "take a btrfs-image of this file system and "
6905                                 "send it to a btrfs developer, ref %Lu\n",
6906                                 dback->disk_bytenr);
6907                         ret = -EINVAL;
6908                         goto out;
6909                 }
6910                 offset += off_diff;
6911                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
6912                 btrfs_set_file_extent_offset(leaf, fi, offset);
6913         } else if (dback->disk_bytenr < entry->bytenr) {
6914                 u64 offset;
6915
6916                 offset = btrfs_file_extent_offset(leaf, fi);
6917                 if (dback->disk_bytenr + offset < entry->bytenr) {
6918                         fprintf(stderr, "Ref is before the entry start, please"
6919                                 " take a btrfs-image of this file system and "
6920                                 "send it to a btrfs developer, ref %Lu\n",
6921                                 dback->disk_bytenr);
6922                         ret = -EINVAL;
6923                         goto out;
6924                 }
6925
6926                 offset += dback->disk_bytenr;
6927                 offset -= entry->bytenr;
6928                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
6929                 btrfs_set_file_extent_offset(leaf, fi, offset);
6930         }
6931
6932         btrfs_set_file_extent_disk_num_bytes(leaf, fi, entry->bytes);
6933
6934         /*
6935          * Chances are if disk_num_bytes were wrong then so is ram_bytes, but
6936          * only do this if we aren't using compression, otherwise it's a
6937          * trickier case.
6938          */
6939         if (!btrfs_file_extent_compression(leaf, fi))
6940                 btrfs_set_file_extent_ram_bytes(leaf, fi, entry->bytes);
6941         else
6942                 printf("ram bytes may be wrong?\n");
6943         btrfs_mark_buffer_dirty(leaf);
6944 out:
6945         err = btrfs_commit_transaction(trans, root);
6946         btrfs_release_path(path);
6947         return ret ? ret : err;
6948 }
6949
6950 static int verify_backrefs(struct btrfs_fs_info *info, struct btrfs_path *path,
6951                            struct extent_record *rec)
6952 {
6953         struct extent_backref *back, *tmp;
6954         struct data_backref *dback;
6955         struct extent_entry *entry, *best = NULL;
6956         LIST_HEAD(entries);
6957         int nr_entries = 0;
6958         int broken_entries = 0;
6959         int ret = 0;
6960         short mismatch = 0;
6961
6962         /*
6963          * Metadata is easy and the backrefs should always agree on bytenr and
6964          * size, if not we've got bigger issues.
6965          */
6966         if (rec->metadata)
6967                 return 0;
6968
6969         rbtree_postorder_for_each_entry_safe(back, tmp,
6970                                              &rec->backref_tree, node) {
6971                 if (back->full_backref || !back->is_data)
6972                         continue;
6973
6974                 dback = to_data_backref(back);
6975
6976                 /*
6977                  * We only pay attention to backrefs that we found a real
6978                  * backref for.
6979                  */
6980                 if (dback->found_ref == 0)
6981                         continue;
6982
6983                 /*
6984                  * For now we only catch when the bytes don't match, not the
6985                  * bytenr.  We can easily do this at the same time, but I want
6986                  * to have a fs image to test on before we just add repair
6987                  * functionality willy-nilly so we know we won't screw up the
6988                  * repair.
6989                  */
6990
6991                 entry = find_entry(&entries, dback->disk_bytenr,
6992                                    dback->bytes);
6993                 if (!entry) {
6994                         entry = malloc(sizeof(struct extent_entry));
6995                         if (!entry) {
6996                                 ret = -ENOMEM;
6997                                 goto out;
6998                         }
6999                         memset(entry, 0, sizeof(*entry));
7000                         entry->bytenr = dback->disk_bytenr;
7001                         entry->bytes = dback->bytes;
7002                         list_add_tail(&entry->list, &entries);
7003                         nr_entries++;
7004                 }
7005
7006                 /*
7007                  * If we only have on entry we may think the entries agree when
7008                  * in reality they don't so we have to do some extra checking.
7009                  */
7010                 if (dback->disk_bytenr != rec->start ||
7011                     dback->bytes != rec->nr || back->broken)
7012                         mismatch = 1;
7013
7014                 if (back->broken) {
7015                         entry->broken++;
7016                         broken_entries++;
7017                 }
7018
7019                 entry->count++;
7020         }
7021
7022         /* Yay all the backrefs agree, carry on good sir */
7023         if (nr_entries <= 1 && !mismatch)
7024                 goto out;
7025
7026         fprintf(stderr, "attempting to repair backref discrepency for bytenr "
7027                 "%Lu\n", rec->start);
7028
7029         /*
7030          * First we want to see if the backrefs can agree amongst themselves who
7031          * is right, so figure out which one of the entries has the highest
7032          * count.
7033          */
7034         best = find_most_right_entry(&entries);
7035
7036         /*
7037          * Ok so we may have an even split between what the backrefs think, so
7038          * this is where we use the extent ref to see what it thinks.
7039          */
7040         if (!best) {
7041                 entry = find_entry(&entries, rec->start, rec->nr);
7042                 if (!entry && (!broken_entries || !rec->found_rec)) {
7043                         fprintf(stderr, "Backrefs don't agree with each other "
7044                                 "and extent record doesn't agree with anybody,"
7045                                 " so we can't fix bytenr %Lu bytes %Lu\n",
7046                                 rec->start, rec->nr);
7047                         ret = -EINVAL;
7048                         goto out;
7049                 } else if (!entry) {
7050                         /*
7051                          * Ok our backrefs were broken, we'll assume this is the
7052                          * correct value and add an entry for this range.
7053                          */
7054                         entry = malloc(sizeof(struct extent_entry));
7055                         if (!entry) {
7056                                 ret = -ENOMEM;
7057                                 goto out;
7058                         }
7059                         memset(entry, 0, sizeof(*entry));
7060                         entry->bytenr = rec->start;
7061                         entry->bytes = rec->nr;
7062                         list_add_tail(&entry->list, &entries);
7063                         nr_entries++;
7064                 }
7065                 entry->count++;
7066                 best = find_most_right_entry(&entries);
7067                 if (!best) {
7068                         fprintf(stderr, "Backrefs and extent record evenly "
7069                                 "split on who is right, this is going to "
7070                                 "require user input to fix bytenr %Lu bytes "
7071                                 "%Lu\n", rec->start, rec->nr);
7072                         ret = -EINVAL;
7073                         goto out;
7074                 }
7075         }
7076
7077         /*
7078          * I don't think this can happen currently as we'll abort() if we catch
7079          * this case higher up, but in case somebody removes that we still can't
7080          * deal with it properly here yet, so just bail out of that's the case.
7081          */
7082         if (best->bytenr != rec->start) {
7083                 fprintf(stderr, "Extent start and backref starts don't match, "
7084                         "please use btrfs-image on this file system and send "
7085                         "it to a btrfs developer so they can make fsck fix "
7086                         "this particular case.  bytenr is %Lu, bytes is %Lu\n",
7087                         rec->start, rec->nr);
7088                 ret = -EINVAL;
7089                 goto out;
7090         }
7091
7092         /*
7093          * Ok great we all agreed on an extent record, let's go find the real
7094          * references and fix up the ones that don't match.
7095          */
7096         rbtree_postorder_for_each_entry_safe(back, tmp,
7097                                              &rec->backref_tree, node) {
7098                 if (back->full_backref || !back->is_data)
7099                         continue;
7100
7101                 dback = to_data_backref(back);
7102
7103                 /*
7104                  * Still ignoring backrefs that don't have a real ref attached
7105                  * to them.
7106                  */
7107                 if (dback->found_ref == 0)
7108                         continue;
7109
7110                 if (dback->bytes == best->bytes &&
7111                     dback->disk_bytenr == best->bytenr)
7112                         continue;
7113
7114                 ret = repair_ref(info, path, dback, best);
7115                 if (ret)
7116                         goto out;
7117         }
7118
7119         /*
7120          * Ok we messed with the actual refs, which means we need to drop our
7121          * entire cache and go back and rescan.  I know this is a huge pain and
7122          * adds a lot of extra work, but it's the only way to be safe.  Once all
7123          * the backrefs agree we may not need to do anything to the extent
7124          * record itself.
7125          */
7126         ret = -EAGAIN;
7127 out:
7128         while (!list_empty(&entries)) {
7129                 entry = list_entry(entries.next, struct extent_entry, list);
7130                 list_del_init(&entry->list);
7131                 free(entry);
7132         }
7133         return ret;
7134 }
7135
7136 static int process_duplicates(struct btrfs_root *root,
7137                               struct cache_tree *extent_cache,
7138                               struct extent_record *rec)
7139 {
7140         struct extent_record *good, *tmp;
7141         struct cache_extent *cache;
7142         int ret;
7143
7144         /*
7145          * If we found a extent record for this extent then return, or if we
7146          * have more than one duplicate we are likely going to need to delete
7147          * something.
7148          */
7149         if (rec->found_rec || rec->num_duplicates > 1)
7150                 return 0;
7151
7152         /* Shouldn't happen but just in case */
7153         BUG_ON(!rec->num_duplicates);
7154
7155         /*
7156          * So this happens if we end up with a backref that doesn't match the
7157          * actual extent entry.  So either the backref is bad or the extent
7158          * entry is bad.  Either way we want to have the extent_record actually
7159          * reflect what we found in the extent_tree, so we need to take the
7160          * duplicate out and use that as the extent_record since the only way we
7161          * get a duplicate is if we find a real life BTRFS_EXTENT_ITEM_KEY.
7162          */
7163         remove_cache_extent(extent_cache, &rec->cache);
7164
7165         good = to_extent_record(rec->dups.next);
7166         list_del_init(&good->list);
7167         INIT_LIST_HEAD(&good->backrefs);
7168         INIT_LIST_HEAD(&good->dups);
7169         good->cache.start = good->start;
7170         good->cache.size = good->nr;
7171         good->content_checked = 0;
7172         good->owner_ref_checked = 0;
7173         good->num_duplicates = 0;
7174         good->refs = rec->refs;
7175         list_splice_init(&rec->backrefs, &good->backrefs);
7176         while (1) {
7177                 cache = lookup_cache_extent(extent_cache, good->start,
7178                                             good->nr);
7179                 if (!cache)
7180                         break;
7181                 tmp = container_of(cache, struct extent_record, cache);
7182
7183                 /*
7184                  * If we find another overlapping extent and it's found_rec is
7185                  * set then it's a duplicate and we need to try and delete
7186                  * something.
7187                  */
7188                 if (tmp->found_rec || tmp->num_duplicates > 0) {
7189                         if (list_empty(&good->list))
7190                                 list_add_tail(&good->list,
7191                                               &duplicate_extents);
7192                         good->num_duplicates += tmp->num_duplicates + 1;
7193                         list_splice_init(&tmp->dups, &good->dups);
7194                         list_del_init(&tmp->list);
7195                         list_add_tail(&tmp->list, &good->dups);
7196                         remove_cache_extent(extent_cache, &tmp->cache);
7197                         continue;
7198                 }
7199
7200                 /*
7201                  * Ok we have another non extent item backed extent rec, so lets
7202                  * just add it to this extent and carry on like we did above.
7203                  */
7204                 good->refs += tmp->refs;
7205                 list_splice_init(&tmp->backrefs, &good->backrefs);
7206                 remove_cache_extent(extent_cache, &tmp->cache);
7207                 free(tmp);
7208         }
7209         ret = insert_cache_extent(extent_cache, &good->cache);
7210         BUG_ON(ret);
7211         free(rec);
7212         return good->num_duplicates ? 0 : 1;
7213 }
7214
7215 static int delete_duplicate_records(struct btrfs_root *root,
7216                                     struct extent_record *rec)
7217 {
7218         struct btrfs_trans_handle *trans;
7219         LIST_HEAD(delete_list);
7220         struct btrfs_path *path;
7221         struct extent_record *tmp, *good, *n;
7222         int nr_del = 0;
7223         int ret = 0, err;
7224         struct btrfs_key key;
7225
7226         path = btrfs_alloc_path();
7227         if (!path) {
7228                 ret = -ENOMEM;
7229                 goto out;
7230         }
7231
7232         good = rec;
7233         /* Find the record that covers all of the duplicates. */
7234         list_for_each_entry(tmp, &rec->dups, list) {
7235                 if (good->start < tmp->start)
7236                         continue;
7237                 if (good->nr > tmp->nr)
7238                         continue;
7239
7240                 if (tmp->start + tmp->nr < good->start + good->nr) {
7241                         fprintf(stderr, "Ok we have overlapping extents that "
7242                                 "aren't completely covered by each other, this "
7243                                 "is going to require more careful thought.  "
7244                                 "The extents are [%Lu-%Lu] and [%Lu-%Lu]\n",
7245                                 tmp->start, tmp->nr, good->start, good->nr);
7246                         abort();
7247                 }
7248                 good = tmp;
7249         }
7250
7251         if (good != rec)
7252                 list_add_tail(&rec->list, &delete_list);
7253
7254         list_for_each_entry_safe(tmp, n, &rec->dups, list) {
7255                 if (tmp == good)
7256                         continue;
7257                 list_move_tail(&tmp->list, &delete_list);
7258         }
7259
7260         root = root->fs_info->extent_root;
7261         trans = btrfs_start_transaction(root, 1);
7262         if (IS_ERR(trans)) {
7263                 ret = PTR_ERR(trans);
7264                 goto out;
7265         }
7266
7267         list_for_each_entry(tmp, &delete_list, list) {
7268                 if (tmp->found_rec == 0)
7269                         continue;
7270                 key.objectid = tmp->start;
7271                 key.type = BTRFS_EXTENT_ITEM_KEY;
7272                 key.offset = tmp->nr;
7273
7274                 /* Shouldn't happen but just in case */
7275                 if (tmp->metadata) {
7276                         fprintf(stderr, "Well this shouldn't happen, extent "
7277                                 "record overlaps but is metadata? "
7278                                 "[%Lu, %Lu]\n", tmp->start, tmp->nr);
7279                         abort();
7280                 }
7281
7282                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7283                 if (ret) {
7284                         if (ret > 0)
7285                                 ret = -EINVAL;
7286                         break;
7287                 }
7288                 ret = btrfs_del_item(trans, root, path);
7289                 if (ret)
7290                         break;
7291                 btrfs_release_path(path);
7292                 nr_del++;
7293         }
7294         err = btrfs_commit_transaction(trans, root);
7295         if (err && !ret)
7296                 ret = err;
7297 out:
7298         while (!list_empty(&delete_list)) {
7299                 tmp = to_extent_record(delete_list.next);
7300                 list_del_init(&tmp->list);
7301                 if (tmp == rec)
7302                         continue;
7303                 free(tmp);
7304         }
7305
7306         while (!list_empty(&rec->dups)) {
7307                 tmp = to_extent_record(rec->dups.next);
7308                 list_del_init(&tmp->list);
7309                 free(tmp);
7310         }
7311
7312         btrfs_free_path(path);
7313
7314         if (!ret && !nr_del)
7315                 rec->num_duplicates = 0;
7316
7317         return ret ? ret : nr_del;
7318 }
7319
7320 static int find_possible_backrefs(struct btrfs_fs_info *info,
7321                                   struct btrfs_path *path,
7322                                   struct cache_tree *extent_cache,
7323                                   struct extent_record *rec)
7324 {
7325         struct btrfs_root *root;
7326         struct extent_backref *back, *tmp;
7327         struct data_backref *dback;
7328         struct cache_extent *cache;
7329         struct btrfs_file_extent_item *fi;
7330         struct btrfs_key key;
7331         u64 bytenr, bytes;
7332         int ret;
7333
7334         rbtree_postorder_for_each_entry_safe(back, tmp,
7335                                              &rec->backref_tree, node) {
7336                 /* Don't care about full backrefs (poor unloved backrefs) */
7337                 if (back->full_backref || !back->is_data)
7338                         continue;
7339
7340                 dback = to_data_backref(back);
7341
7342                 /* We found this one, we don't need to do a lookup */
7343                 if (dback->found_ref)
7344                         continue;
7345
7346                 key.objectid = dback->root;
7347                 key.type = BTRFS_ROOT_ITEM_KEY;
7348                 key.offset = (u64)-1;
7349
7350                 root = btrfs_read_fs_root(info, &key);
7351
7352                 /* No root, definitely a bad ref, skip */
7353                 if (IS_ERR(root) && PTR_ERR(root) == -ENOENT)
7354                         continue;
7355                 /* Other err, exit */
7356                 if (IS_ERR(root))
7357                         return PTR_ERR(root);
7358
7359                 key.objectid = dback->owner;
7360                 key.type = BTRFS_EXTENT_DATA_KEY;
7361                 key.offset = dback->offset;
7362                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7363                 if (ret) {
7364                         btrfs_release_path(path);
7365                         if (ret < 0)
7366                                 return ret;
7367                         /* Didn't find it, we can carry on */
7368                         ret = 0;
7369                         continue;
7370                 }
7371
7372                 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
7373                                     struct btrfs_file_extent_item);
7374                 bytenr = btrfs_file_extent_disk_bytenr(path->nodes[0], fi);
7375                 bytes = btrfs_file_extent_disk_num_bytes(path->nodes[0], fi);
7376                 btrfs_release_path(path);
7377                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
7378                 if (cache) {
7379                         struct extent_record *tmp;
7380                         tmp = container_of(cache, struct extent_record, cache);
7381
7382                         /*
7383                          * If we found an extent record for the bytenr for this
7384                          * particular backref then we can't add it to our
7385                          * current extent record.  We only want to add backrefs
7386                          * that don't have a corresponding extent item in the
7387                          * extent tree since they likely belong to this record
7388                          * and we need to fix it if it doesn't match bytenrs.
7389                          */
7390                         if  (tmp->found_rec)
7391                                 continue;
7392                 }
7393
7394                 dback->found_ref += 1;
7395                 dback->disk_bytenr = bytenr;
7396                 dback->bytes = bytes;
7397
7398                 /*
7399                  * Set this so the verify backref code knows not to trust the
7400                  * values in this backref.
7401                  */
7402                 back->broken = 1;
7403         }
7404
7405         return 0;
7406 }
7407
7408 /*
7409  * Record orphan data ref into corresponding root.
7410  *
7411  * Return 0 if the extent item contains data ref and recorded.
7412  * Return 1 if the extent item contains no useful data ref
7413  *   On that case, it may contains only shared_dataref or metadata backref
7414  *   or the file extent exists(this should be handled by the extent bytenr
7415  *   recovery routine)
7416  * Return <0 if something goes wrong.
7417  */
7418 static int record_orphan_data_extents(struct btrfs_fs_info *fs_info,
7419                                       struct extent_record *rec)
7420 {
7421         struct btrfs_key key;
7422         struct btrfs_root *dest_root;
7423         struct extent_backref *back, *tmp;
7424         struct data_backref *dback;
7425         struct orphan_data_extent *orphan;
7426         struct btrfs_path *path;
7427         int recorded_data_ref = 0;
7428         int ret = 0;
7429
7430         if (rec->metadata)
7431                 return 1;
7432         path = btrfs_alloc_path();
7433         if (!path)
7434                 return -ENOMEM;
7435         rbtree_postorder_for_each_entry_safe(back, tmp,
7436                                              &rec->backref_tree, node) {
7437                 if (back->full_backref || !back->is_data ||
7438                     !back->found_extent_tree)
7439                         continue;
7440                 dback = to_data_backref(back);
7441                 if (dback->found_ref)
7442                         continue;
7443                 key.objectid = dback->root;
7444                 key.type = BTRFS_ROOT_ITEM_KEY;
7445                 key.offset = (u64)-1;
7446
7447                 dest_root = btrfs_read_fs_root(fs_info, &key);
7448
7449                 /* For non-exist root we just skip it */
7450                 if (IS_ERR(dest_root) || !dest_root)
7451                         continue;
7452
7453                 key.objectid = dback->owner;
7454                 key.type = BTRFS_EXTENT_DATA_KEY;
7455                 key.offset = dback->offset;
7456
7457                 ret = btrfs_search_slot(NULL, dest_root, &key, path, 0, 0);
7458                 /*
7459                  * For ret < 0, it's OK since the fs-tree may be corrupted,
7460                  * we need to record it for inode/file extent rebuild.
7461                  * For ret > 0, we record it only for file extent rebuild.
7462                  * For ret == 0, the file extent exists but only bytenr
7463                  * mismatch, let the original bytenr fix routine to handle,
7464                  * don't record it.
7465                  */
7466                 if (ret == 0)
7467                         continue;
7468                 ret = 0;
7469                 orphan = malloc(sizeof(*orphan));
7470                 if (!orphan) {
7471                         ret = -ENOMEM;
7472                         goto out;
7473                 }
7474                 INIT_LIST_HEAD(&orphan->list);
7475                 orphan->root = dback->root;
7476                 orphan->objectid = dback->owner;
7477                 orphan->offset = dback->offset;
7478                 orphan->disk_bytenr = rec->cache.start;
7479                 orphan->disk_len = rec->cache.size;
7480                 list_add(&dest_root->orphan_data_extents, &orphan->list);
7481                 recorded_data_ref = 1;
7482         }
7483 out:
7484         btrfs_free_path(path);
7485         if (!ret)
7486                 return !recorded_data_ref;
7487         else
7488                 return ret;
7489 }
7490
7491 /*
7492  * when an incorrect extent item is found, this will delete
7493  * all of the existing entries for it and recreate them
7494  * based on what the tree scan found.
7495  */
7496 static int fixup_extent_refs(struct btrfs_fs_info *info,
7497                              struct cache_tree *extent_cache,
7498                              struct extent_record *rec)
7499 {
7500         struct btrfs_trans_handle *trans = NULL;
7501         int ret;
7502         struct btrfs_path *path;
7503         struct cache_extent *cache;
7504         struct extent_backref *back, *tmp;
7505         int allocated = 0;
7506         u64 flags = 0;
7507
7508         if (rec->flag_block_full_backref)
7509                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7510
7511         path = btrfs_alloc_path();
7512         if (!path)
7513                 return -ENOMEM;
7514
7515         if (rec->refs != rec->extent_item_refs && !rec->metadata) {
7516                 /*
7517                  * Sometimes the backrefs themselves are so broken they don't
7518                  * get attached to any meaningful rec, so first go back and
7519                  * check any of our backrefs that we couldn't find and throw
7520                  * them into the list if we find the backref so that
7521                  * verify_backrefs can figure out what to do.
7522                  */
7523                 ret = find_possible_backrefs(info, path, extent_cache, rec);
7524                 if (ret < 0)
7525                         goto out;
7526         }
7527
7528         /* step one, make sure all of the backrefs agree */
7529         ret = verify_backrefs(info, path, rec);
7530         if (ret < 0)
7531                 goto out;
7532
7533         trans = btrfs_start_transaction(info->extent_root, 1);
7534         if (IS_ERR(trans)) {
7535                 ret = PTR_ERR(trans);
7536                 goto out;
7537         }
7538
7539         /* step two, delete all the existing records */
7540         ret = delete_extent_records(trans, info->extent_root, path,
7541                                     rec->start, rec->max_size);
7542
7543         if (ret < 0)
7544                 goto out;
7545
7546         /* was this block corrupt?  If so, don't add references to it */
7547         cache = lookup_cache_extent(info->corrupt_blocks,
7548                                     rec->start, rec->max_size);
7549         if (cache) {
7550                 ret = 0;
7551                 goto out;
7552         }
7553
7554         /* step three, recreate all the refs we did find */
7555         rbtree_postorder_for_each_entry_safe(back, tmp,
7556                                              &rec->backref_tree, node) {
7557                 /*
7558                  * if we didn't find any references, don't create a
7559                  * new extent record
7560                  */
7561                 if (!back->found_ref)
7562                         continue;
7563
7564                 rec->bad_full_backref = 0;
7565                 ret = record_extent(trans, info, path, rec, back, allocated, flags);
7566                 allocated = 1;
7567
7568                 if (ret)
7569                         goto out;
7570         }
7571 out:
7572         if (trans) {
7573                 int err = btrfs_commit_transaction(trans, info->extent_root);
7574                 if (!ret)
7575                         ret = err;
7576         }
7577
7578         btrfs_free_path(path);
7579         return ret;
7580 }
7581
7582 static int fixup_extent_flags(struct btrfs_fs_info *fs_info,
7583                               struct extent_record *rec)
7584 {
7585         struct btrfs_trans_handle *trans;
7586         struct btrfs_root *root = fs_info->extent_root;
7587         struct btrfs_path *path;
7588         struct btrfs_extent_item *ei;
7589         struct btrfs_key key;
7590         u64 flags;
7591         int ret = 0;
7592
7593         key.objectid = rec->start;
7594         if (rec->metadata) {
7595                 key.type = BTRFS_METADATA_ITEM_KEY;
7596                 key.offset = rec->info_level;
7597         } else {
7598                 key.type = BTRFS_EXTENT_ITEM_KEY;
7599                 key.offset = rec->max_size;
7600         }
7601
7602         path = btrfs_alloc_path();
7603         if (!path)
7604                 return -ENOMEM;
7605
7606         trans = btrfs_start_transaction(root, 0);
7607         if (IS_ERR(trans)) {
7608                 btrfs_free_path(path);
7609                 return PTR_ERR(trans);
7610         }
7611
7612         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
7613         if (ret < 0) {
7614                 btrfs_free_path(path);
7615                 btrfs_commit_transaction(trans, root);
7616                 return ret;
7617         } else if (ret) {
7618                 fprintf(stderr, "Didn't find extent for %llu\n",
7619                         (unsigned long long)rec->start);
7620                 btrfs_free_path(path);
7621                 btrfs_commit_transaction(trans, root);
7622                 return -ENOENT;
7623         }
7624
7625         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
7626                             struct btrfs_extent_item);
7627         flags = btrfs_extent_flags(path->nodes[0], ei);
7628         if (rec->flag_block_full_backref) {
7629                 fprintf(stderr, "setting full backref on %llu\n",
7630                         (unsigned long long)key.objectid);
7631                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7632         } else {
7633                 fprintf(stderr, "clearing full backref on %llu\n",
7634                         (unsigned long long)key.objectid);
7635                 flags &= ~BTRFS_BLOCK_FLAG_FULL_BACKREF;
7636         }
7637         btrfs_set_extent_flags(path->nodes[0], ei, flags);
7638         btrfs_mark_buffer_dirty(path->nodes[0]);
7639         btrfs_free_path(path);
7640         return btrfs_commit_transaction(trans, root);
7641 }
7642
7643 /* right now we only prune from the extent allocation tree */
7644 static int prune_one_block(struct btrfs_trans_handle *trans,
7645                            struct btrfs_fs_info *info,
7646                            struct btrfs_corrupt_block *corrupt)
7647 {
7648         int ret;
7649         struct btrfs_path path;
7650         struct extent_buffer *eb;
7651         u64 found;
7652         int slot;
7653         int nritems;
7654         int level = corrupt->level + 1;
7655
7656         btrfs_init_path(&path);
7657 again:
7658         /* we want to stop at the parent to our busted block */
7659         path.lowest_level = level;
7660
7661         ret = btrfs_search_slot(trans, info->extent_root,
7662                                 &corrupt->key, &path, -1, 1);
7663
7664         if (ret < 0)
7665                 goto out;
7666
7667         eb = path.nodes[level];
7668         if (!eb) {
7669                 ret = -ENOENT;
7670                 goto out;
7671         }
7672
7673         /*
7674          * hopefully the search gave us the block we want to prune,
7675          * lets try that first
7676          */
7677         slot = path.slots[level];
7678         found =  btrfs_node_blockptr(eb, slot);
7679         if (found == corrupt->cache.start)
7680                 goto del_ptr;
7681
7682         nritems = btrfs_header_nritems(eb);
7683
7684         /* the search failed, lets scan this node and hope we find it */
7685         for (slot = 0; slot < nritems; slot++) {
7686                 found =  btrfs_node_blockptr(eb, slot);
7687                 if (found == corrupt->cache.start)
7688                         goto del_ptr;
7689         }
7690         /*
7691          * we couldn't find the bad block.  TODO, search all the nodes for pointers
7692          * to this block
7693          */
7694         if (eb == info->extent_root->node) {
7695                 ret = -ENOENT;
7696                 goto out;
7697         } else {
7698                 level++;
7699                 btrfs_release_path(&path);
7700                 goto again;
7701         }
7702
7703 del_ptr:
7704         printk("deleting pointer to block %Lu\n", corrupt->cache.start);
7705         ret = btrfs_del_ptr(trans, info->extent_root, &path, level, slot);
7706
7707 out:
7708         btrfs_release_path(&path);
7709         return ret;
7710 }
7711
7712 static int prune_corrupt_blocks(struct btrfs_fs_info *info)
7713 {
7714         struct btrfs_trans_handle *trans = NULL;
7715         struct cache_extent *cache;
7716         struct btrfs_corrupt_block *corrupt;
7717
7718         while (1) {
7719                 cache = search_cache_extent(info->corrupt_blocks, 0);
7720                 if (!cache)
7721                         break;
7722                 if (!trans) {
7723                         trans = btrfs_start_transaction(info->extent_root, 1);
7724                         if (IS_ERR(trans))
7725                                 return PTR_ERR(trans);
7726                 }
7727                 corrupt = container_of(cache, struct btrfs_corrupt_block, cache);
7728                 prune_one_block(trans, info, corrupt);
7729                 remove_cache_extent(info->corrupt_blocks, cache);
7730         }
7731         if (trans)
7732                 return btrfs_commit_transaction(trans, info->extent_root);
7733         return 0;
7734 }
7735
7736 static void reset_cached_block_groups(struct btrfs_fs_info *fs_info)
7737 {
7738         struct btrfs_block_group_cache *cache;
7739         u64 start, end;
7740         int ret;
7741
7742         while (1) {
7743                 ret = find_first_extent_bit(&fs_info->free_space_cache, 0,
7744                                             &start, &end, EXTENT_DIRTY);
7745                 if (ret)
7746                         break;
7747                 clear_extent_dirty(&fs_info->free_space_cache, start, end,
7748                                    GFP_NOFS);
7749         }
7750
7751         start = 0;
7752         while (1) {
7753                 cache = btrfs_lookup_first_block_group(fs_info, start);
7754                 if (!cache)
7755                         break;
7756                 if (cache->cached)
7757                         cache->cached = 0;
7758                 start = cache->key.objectid + cache->key.offset;
7759         }
7760 }
7761
7762 static int check_extent_refs(struct btrfs_root *root,
7763                              struct cache_tree *extent_cache)
7764 {
7765         struct extent_record *rec;
7766         struct cache_extent *cache;
7767         int err = 0;
7768         int ret = 0;
7769         int fixed = 0;
7770         int had_dups = 0;
7771         int recorded = 0;
7772
7773         if (repair) {
7774                 /*
7775                  * if we're doing a repair, we have to make sure
7776                  * we don't allocate from the problem extents.
7777                  * In the worst case, this will be all the
7778                  * extents in the FS
7779                  */
7780                 cache = search_cache_extent(extent_cache, 0);
7781                 while(cache) {
7782                         rec = container_of(cache, struct extent_record, cache);
7783                         set_extent_dirty(root->fs_info->excluded_extents,
7784                                          rec->start,
7785                                          rec->start + rec->max_size - 1,
7786                                          GFP_NOFS);
7787                         cache = next_cache_extent(cache);
7788                 }
7789
7790                 /* pin down all the corrupted blocks too */
7791                 cache = search_cache_extent(root->fs_info->corrupt_blocks, 0);
7792                 while(cache) {
7793                         set_extent_dirty(root->fs_info->excluded_extents,
7794                                          cache->start,
7795                                          cache->start + cache->size - 1,
7796                                          GFP_NOFS);
7797                         cache = next_cache_extent(cache);
7798                 }
7799                 prune_corrupt_blocks(root->fs_info);
7800                 reset_cached_block_groups(root->fs_info);
7801         }
7802
7803         reset_cached_block_groups(root->fs_info);
7804
7805         /*
7806          * We need to delete any duplicate entries we find first otherwise we
7807          * could mess up the extent tree when we have backrefs that actually
7808          * belong to a different extent item and not the weird duplicate one.
7809          */
7810         while (repair && !list_empty(&duplicate_extents)) {
7811                 rec = to_extent_record(duplicate_extents.next);
7812                 list_del_init(&rec->list);
7813
7814                 /* Sometimes we can find a backref before we find an actual
7815                  * extent, so we need to process it a little bit to see if there
7816                  * truly are multiple EXTENT_ITEM_KEY's for the same range, or
7817                  * if this is a backref screwup.  If we need to delete stuff
7818                  * process_duplicates() will return 0, otherwise it will return
7819                  * 1 and we
7820                  */
7821                 if (process_duplicates(root, extent_cache, rec))
7822                         continue;
7823                 ret = delete_duplicate_records(root, rec);
7824                 if (ret < 0)
7825                         return ret;
7826                 /*
7827                  * delete_duplicate_records will return the number of entries
7828                  * deleted, so if it's greater than 0 then we know we actually
7829                  * did something and we need to remove.
7830                  */
7831                 if (ret)
7832                         had_dups = 1;
7833         }
7834
7835         if (had_dups)
7836                 return -EAGAIN;
7837
7838         while(1) {
7839                 int cur_err = 0;
7840
7841                 fixed = 0;
7842                 recorded = 0;
7843                 cache = search_cache_extent(extent_cache, 0);
7844                 if (!cache)
7845                         break;
7846                 rec = container_of(cache, struct extent_record, cache);
7847                 if (rec->num_duplicates) {
7848                         fprintf(stderr, "extent item %llu has multiple extent "
7849                                 "items\n", (unsigned long long)rec->start);
7850                         err = 1;
7851                         cur_err = 1;
7852                 }
7853
7854                 if (rec->refs != rec->extent_item_refs) {
7855                         fprintf(stderr, "ref mismatch on [%llu %llu] ",
7856                                 (unsigned long long)rec->start,
7857                                 (unsigned long long)rec->nr);
7858                         fprintf(stderr, "extent item %llu, found %llu\n",
7859                                 (unsigned long long)rec->extent_item_refs,
7860                                 (unsigned long long)rec->refs);
7861                         ret = record_orphan_data_extents(root->fs_info, rec);
7862                         if (ret < 0)
7863                                 goto repair_abort;
7864                         if (ret == 0) {
7865                                 recorded = 1;
7866                         } else {
7867                                 /*
7868                                  * we can't use the extent to repair file
7869                                  * extent, let the fallback method handle it.
7870                                  */
7871                                 if (!fixed && repair) {
7872                                         ret = fixup_extent_refs(
7873                                                         root->fs_info,
7874                                                         extent_cache, rec);
7875                                         if (ret)
7876                                                 goto repair_abort;
7877                                         fixed = 1;
7878                                 }
7879                         }
7880                         err = 1;
7881                         cur_err = 1;
7882                 }
7883                 if (all_backpointers_checked(rec, 1)) {
7884                         fprintf(stderr, "backpointer mismatch on [%llu %llu]\n",
7885                                 (unsigned long long)rec->start,
7886                                 (unsigned long long)rec->nr);
7887
7888                         if (!fixed && !recorded && repair) {
7889                                 ret = fixup_extent_refs(root->fs_info,
7890                                                         extent_cache, rec);
7891                                 if (ret)
7892                                         goto repair_abort;
7893                                 fixed = 1;
7894                         }
7895                         cur_err = 1;
7896                         err = 1;
7897                 }
7898                 if (!rec->owner_ref_checked) {
7899                         fprintf(stderr, "owner ref check failed [%llu %llu]\n",
7900                                 (unsigned long long)rec->start,
7901                                 (unsigned long long)rec->nr);
7902                         if (!fixed && !recorded && repair) {
7903                                 ret = fixup_extent_refs(root->fs_info,
7904                                                         extent_cache, rec);
7905                                 if (ret)
7906                                         goto repair_abort;
7907                                 fixed = 1;
7908                         }
7909                         err = 1;
7910                         cur_err = 1;
7911                 }
7912                 if (rec->bad_full_backref) {
7913                         fprintf(stderr, "bad full backref, on [%llu]\n",
7914                                 (unsigned long long)rec->start);
7915                         if (repair) {
7916                                 ret = fixup_extent_flags(root->fs_info, rec);
7917                                 if (ret)
7918                                         goto repair_abort;
7919                                 fixed = 1;
7920                         }
7921                         err = 1;
7922                         cur_err = 1;
7923                 }
7924                 /*
7925                  * Although it's not a extent ref's problem, we reuse this
7926                  * routine for error reporting.
7927                  * No repair function yet.
7928                  */
7929                 if (rec->crossing_stripes) {
7930                         fprintf(stderr,
7931                                 "bad metadata [%llu, %llu) crossing stripe boundary\n",
7932                                 rec->start, rec->start + rec->max_size);
7933                         err = 1;
7934                         cur_err = 1;
7935                 }
7936
7937                 if (rec->wrong_chunk_type) {
7938                         fprintf(stderr,
7939                                 "bad extent [%llu, %llu), type mismatch with chunk\n",
7940                                 rec->start, rec->start + rec->max_size);
7941                         err = 1;
7942                         cur_err = 1;
7943                 }
7944
7945                 remove_cache_extent(extent_cache, cache);
7946                 free_all_extent_backrefs(rec);
7947                 if (!init_extent_tree && repair && (!cur_err || fixed))
7948                         clear_extent_dirty(root->fs_info->excluded_extents,
7949                                            rec->start,
7950                                            rec->start + rec->max_size - 1,
7951                                            GFP_NOFS);
7952                 free(rec);
7953         }
7954 repair_abort:
7955         if (repair) {
7956                 if (ret && ret != -EAGAIN) {
7957                         fprintf(stderr, "failed to repair damaged filesystem, aborting\n");
7958                         exit(1);
7959                 } else if (!ret) {
7960                         struct btrfs_trans_handle *trans;
7961
7962                         root = root->fs_info->extent_root;
7963                         trans = btrfs_start_transaction(root, 1);
7964                         if (IS_ERR(trans)) {
7965                                 ret = PTR_ERR(trans);
7966                                 goto repair_abort;
7967                         }
7968
7969                         btrfs_fix_block_accounting(trans, root);
7970                         ret = btrfs_commit_transaction(trans, root);
7971                         if (ret)
7972                                 goto repair_abort;
7973                 }
7974                 if (err)
7975                         fprintf(stderr, "repaired damaged extent references\n");
7976                 return ret;
7977         }
7978         return err;
7979 }
7980
7981 u64 calc_stripe_length(u64 type, u64 length, int num_stripes)
7982 {
7983         u64 stripe_size;
7984
7985         if (type & BTRFS_BLOCK_GROUP_RAID0) {
7986                 stripe_size = length;
7987                 stripe_size /= num_stripes;
7988         } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
7989                 stripe_size = length * 2;
7990                 stripe_size /= num_stripes;
7991         } else if (type & BTRFS_BLOCK_GROUP_RAID5) {
7992                 stripe_size = length;
7993                 stripe_size /= (num_stripes - 1);
7994         } else if (type & BTRFS_BLOCK_GROUP_RAID6) {
7995                 stripe_size = length;
7996                 stripe_size /= (num_stripes - 2);
7997         } else {
7998                 stripe_size = length;
7999         }
8000         return stripe_size;
8001 }
8002
8003 /*
8004  * Check the chunk with its block group/dev list ref:
8005  * Return 0 if all refs seems valid.
8006  * Return 1 if part of refs seems valid, need later check for rebuild ref
8007  * like missing block group and needs to search extent tree to rebuild them.
8008  * Return -1 if essential refs are missing and unable to rebuild.
8009  */
8010 static int check_chunk_refs(struct chunk_record *chunk_rec,
8011                             struct block_group_tree *block_group_cache,
8012                             struct device_extent_tree *dev_extent_cache,
8013                             int silent)
8014 {
8015         struct cache_extent *block_group_item;
8016         struct block_group_record *block_group_rec;
8017         struct cache_extent *dev_extent_item;
8018         struct device_extent_record *dev_extent_rec;
8019         u64 devid;
8020         u64 offset;
8021         u64 length;
8022         int metadump_v2 = 0;
8023         int i;
8024         int ret = 0;
8025
8026         block_group_item = lookup_cache_extent(&block_group_cache->tree,
8027                                                chunk_rec->offset,
8028                                                chunk_rec->length);
8029         if (block_group_item) {
8030                 block_group_rec = container_of(block_group_item,
8031                                                struct block_group_record,
8032                                                cache);
8033                 if (chunk_rec->length != block_group_rec->offset ||
8034                     chunk_rec->offset != block_group_rec->objectid ||
8035                     (!metadump_v2 &&
8036                      chunk_rec->type_flags != block_group_rec->flags)) {
8037                         if (!silent)
8038                                 fprintf(stderr,
8039                                         "Chunk[%llu, %u, %llu]: length(%llu), offset(%llu), type(%llu) mismatch with block group[%llu, %u, %llu]: offset(%llu), objectid(%llu), flags(%llu)\n",
8040                                         chunk_rec->objectid,
8041                                         chunk_rec->type,
8042                                         chunk_rec->offset,
8043                                         chunk_rec->length,
8044                                         chunk_rec->offset,
8045                                         chunk_rec->type_flags,
8046                                         block_group_rec->objectid,
8047                                         block_group_rec->type,
8048                                         block_group_rec->offset,
8049                                         block_group_rec->offset,
8050                                         block_group_rec->objectid,
8051                                         block_group_rec->flags);
8052                         ret = -1;
8053                 } else {
8054                         list_del_init(&block_group_rec->list);
8055                         chunk_rec->bg_rec = block_group_rec;
8056                 }
8057         } else {
8058                 if (!silent)
8059                         fprintf(stderr,
8060                                 "Chunk[%llu, %u, %llu]: length(%llu), offset(%llu), type(%llu) is not found in block group\n",
8061                                 chunk_rec->objectid,
8062                                 chunk_rec->type,
8063                                 chunk_rec->offset,
8064                                 chunk_rec->length,
8065                                 chunk_rec->offset,
8066                                 chunk_rec->type_flags);
8067                 ret = 1;
8068         }
8069
8070         if (metadump_v2)
8071                 return ret;
8072
8073         length = calc_stripe_length(chunk_rec->type_flags, chunk_rec->length,
8074                                     chunk_rec->num_stripes);
8075         for (i = 0; i < chunk_rec->num_stripes; ++i) {
8076                 devid = chunk_rec->stripes[i].devid;
8077                 offset = chunk_rec->stripes[i].offset;
8078                 dev_extent_item = lookup_cache_extent2(&dev_extent_cache->tree,
8079                                                        devid, offset, length);
8080                 if (dev_extent_item) {
8081                         dev_extent_rec = container_of(dev_extent_item,
8082                                                 struct device_extent_record,
8083                                                 cache);
8084                         if (dev_extent_rec->objectid != devid ||
8085                             dev_extent_rec->offset != offset ||
8086                             dev_extent_rec->chunk_offset != chunk_rec->offset ||
8087                             dev_extent_rec->length != length) {
8088                                 if (!silent)
8089                                         fprintf(stderr,
8090                                                 "Chunk[%llu, %u, %llu] stripe[%llu, %llu] dismatch dev extent[%llu, %llu, %llu]\n",
8091                                                 chunk_rec->objectid,
8092                                                 chunk_rec->type,
8093                                                 chunk_rec->offset,
8094                                                 chunk_rec->stripes[i].devid,
8095                                                 chunk_rec->stripes[i].offset,
8096                                                 dev_extent_rec->objectid,
8097                                                 dev_extent_rec->offset,
8098                                                 dev_extent_rec->length);
8099                                 ret = -1;
8100                         } else {
8101                                 list_move(&dev_extent_rec->chunk_list,
8102                                           &chunk_rec->dextents);
8103                         }
8104                 } else {
8105                         if (!silent)
8106                                 fprintf(stderr,
8107                                         "Chunk[%llu, %u, %llu] stripe[%llu, %llu] is not found in dev extent\n",
8108                                         chunk_rec->objectid,
8109                                         chunk_rec->type,
8110                                         chunk_rec->offset,
8111                                         chunk_rec->stripes[i].devid,
8112                                         chunk_rec->stripes[i].offset);
8113                         ret = -1;
8114                 }
8115         }
8116         return ret;
8117 }
8118
8119 /* check btrfs_chunk -> btrfs_dev_extent / btrfs_block_group_item */
8120 int check_chunks(struct cache_tree *chunk_cache,
8121                  struct block_group_tree *block_group_cache,
8122                  struct device_extent_tree *dev_extent_cache,
8123                  struct list_head *good, struct list_head *bad,
8124                  struct list_head *rebuild, int silent)
8125 {
8126         struct cache_extent *chunk_item;
8127         struct chunk_record *chunk_rec;
8128         struct block_group_record *bg_rec;
8129         struct device_extent_record *dext_rec;
8130         int err;
8131         int ret = 0;
8132
8133         chunk_item = first_cache_extent(chunk_cache);
8134         while (chunk_item) {
8135                 chunk_rec = container_of(chunk_item, struct chunk_record,
8136                                          cache);
8137                 err = check_chunk_refs(chunk_rec, block_group_cache,
8138                                        dev_extent_cache, silent);
8139                 if (err < 0)
8140                         ret = err;
8141                 if (err == 0 && good)
8142                         list_add_tail(&chunk_rec->list, good);
8143                 if (err > 0 && rebuild)
8144                         list_add_tail(&chunk_rec->list, rebuild);
8145                 if (err < 0 && bad)
8146                         list_add_tail(&chunk_rec->list, bad);
8147                 chunk_item = next_cache_extent(chunk_item);
8148         }
8149
8150         list_for_each_entry(bg_rec, &block_group_cache->block_groups, list) {
8151                 if (!silent)
8152                         fprintf(stderr,
8153                                 "Block group[%llu, %llu] (flags = %llu) didn't find the relative chunk.\n",
8154                                 bg_rec->objectid,
8155                                 bg_rec->offset,
8156                                 bg_rec->flags);
8157                 if (!ret)
8158                         ret = 1;
8159         }
8160
8161         list_for_each_entry(dext_rec, &dev_extent_cache->no_chunk_orphans,
8162                             chunk_list) {
8163                 if (!silent)
8164                         fprintf(stderr,
8165                                 "Device extent[%llu, %llu, %llu] didn't find the relative chunk.\n",
8166                                 dext_rec->objectid,
8167                                 dext_rec->offset,
8168                                 dext_rec->length);
8169                 if (!ret)
8170                         ret = 1;
8171         }
8172         return ret;
8173 }
8174
8175
8176 static int check_device_used(struct device_record *dev_rec,
8177                              struct device_extent_tree *dext_cache)
8178 {
8179         struct cache_extent *cache;
8180         struct device_extent_record *dev_extent_rec;
8181         u64 total_byte = 0;
8182
8183         cache = search_cache_extent2(&dext_cache->tree, dev_rec->devid, 0);
8184         while (cache) {
8185                 dev_extent_rec = container_of(cache,
8186                                               struct device_extent_record,
8187                                               cache);
8188                 if (dev_extent_rec->objectid != dev_rec->devid)
8189                         break;
8190
8191                 list_del_init(&dev_extent_rec->device_list);
8192                 total_byte += dev_extent_rec->length;
8193                 cache = next_cache_extent(cache);
8194         }
8195
8196         if (total_byte != dev_rec->byte_used) {
8197                 fprintf(stderr,
8198                         "Dev extent's total-byte(%llu) is not equal to byte-used(%llu) in dev[%llu, %u, %llu]\n",
8199                         total_byte, dev_rec->byte_used, dev_rec->objectid,
8200                         dev_rec->type, dev_rec->offset);
8201                 return -1;
8202         } else {
8203                 return 0;
8204         }
8205 }
8206
8207 /* check btrfs_dev_item -> btrfs_dev_extent */
8208 static int check_devices(struct rb_root *dev_cache,
8209                          struct device_extent_tree *dev_extent_cache)
8210 {
8211         struct rb_node *dev_node;
8212         struct device_record *dev_rec;
8213         struct device_extent_record *dext_rec;
8214         int err;
8215         int ret = 0;
8216
8217         dev_node = rb_first(dev_cache);
8218         while (dev_node) {
8219                 dev_rec = container_of(dev_node, struct device_record, node);
8220                 err = check_device_used(dev_rec, dev_extent_cache);
8221                 if (err)
8222                         ret = err;
8223
8224                 dev_node = rb_next(dev_node);
8225         }
8226         list_for_each_entry(dext_rec, &dev_extent_cache->no_device_orphans,
8227                             device_list) {
8228                 fprintf(stderr,
8229                         "Device extent[%llu, %llu, %llu] didn't find its device.\n",
8230                         dext_rec->objectid, dext_rec->offset, dext_rec->length);
8231                 if (!ret)
8232                         ret = 1;
8233         }
8234         return ret;
8235 }
8236
8237 static int add_root_item_to_list(struct list_head *head,
8238                                   u64 objectid, u64 bytenr, u64 last_snapshot,
8239                                   u8 level, u8 drop_level,
8240                                   int level_size, struct btrfs_key *drop_key)
8241 {
8242
8243         struct root_item_record *ri_rec;
8244         ri_rec = malloc(sizeof(*ri_rec));
8245         if (!ri_rec)
8246                 return -ENOMEM;
8247         ri_rec->bytenr = bytenr;
8248         ri_rec->objectid = objectid;
8249         ri_rec->level = level;
8250         ri_rec->level_size = level_size;
8251         ri_rec->drop_level = drop_level;
8252         ri_rec->last_snapshot = last_snapshot;
8253         if (drop_key)
8254                 memcpy(&ri_rec->drop_key, drop_key, sizeof(*drop_key));
8255         list_add_tail(&ri_rec->list, head);
8256
8257         return 0;
8258 }
8259
8260 static void free_root_item_list(struct list_head *list)
8261 {
8262         struct root_item_record *ri_rec;
8263
8264         while (!list_empty(list)) {
8265                 ri_rec = list_first_entry(list, struct root_item_record,
8266                                           list);
8267                 list_del_init(&ri_rec->list);
8268                 free(ri_rec);
8269         }
8270 }
8271
8272 static int deal_root_from_list(struct list_head *list,
8273                                struct btrfs_root *root,
8274                                struct block_info *bits,
8275                                int bits_nr,
8276                                struct cache_tree *pending,
8277                                struct cache_tree *seen,
8278                                struct cache_tree *reada,
8279                                struct cache_tree *nodes,
8280                                struct cache_tree *extent_cache,
8281                                struct cache_tree *chunk_cache,
8282                                struct rb_root *dev_cache,
8283                                struct block_group_tree *block_group_cache,
8284                                struct device_extent_tree *dev_extent_cache)
8285 {
8286         int ret = 0;
8287         u64 last;
8288
8289         while (!list_empty(list)) {
8290                 struct root_item_record *rec;
8291                 struct extent_buffer *buf;
8292                 rec = list_entry(list->next,
8293                                  struct root_item_record, list);
8294                 last = 0;
8295                 buf = read_tree_block(root->fs_info->tree_root,
8296                                       rec->bytenr, rec->level_size, 0);
8297                 if (!extent_buffer_uptodate(buf)) {
8298                         free_extent_buffer(buf);
8299                         ret = -EIO;
8300                         break;
8301                 }
8302                 add_root_to_pending(buf, extent_cache, pending,
8303                                     seen, nodes, rec->objectid);
8304                 /*
8305                  * To rebuild extent tree, we need deal with snapshot
8306                  * one by one, otherwise we deal with node firstly which
8307                  * can maximize readahead.
8308                  */
8309                 while (1) {
8310                         ret = run_next_block(root, bits, bits_nr, &last,
8311                                              pending, seen, reada, nodes,
8312                                              extent_cache, chunk_cache,
8313                                              dev_cache, block_group_cache,
8314                                              dev_extent_cache, rec);
8315                         if (ret != 0)
8316                                 break;
8317                 }
8318                 free_extent_buffer(buf);
8319                 list_del(&rec->list);
8320                 free(rec);
8321                 if (ret < 0)
8322                         break;
8323         }
8324         while (ret >= 0) {
8325                 ret = run_next_block(root, bits, bits_nr, &last, pending, seen,
8326                                      reada, nodes, extent_cache, chunk_cache,
8327                                      dev_cache, block_group_cache,
8328                                      dev_extent_cache, NULL);
8329                 if (ret != 0) {
8330                         if (ret > 0)
8331                                 ret = 0;
8332                         break;
8333                 }
8334         }
8335         return ret;
8336 }
8337
8338 static int check_chunks_and_extents(struct btrfs_root *root)
8339 {
8340         struct rb_root dev_cache;
8341         struct cache_tree chunk_cache;
8342         struct block_group_tree block_group_cache;
8343         struct device_extent_tree dev_extent_cache;
8344         struct cache_tree extent_cache;
8345         struct cache_tree seen;
8346         struct cache_tree pending;
8347         struct cache_tree reada;
8348         struct cache_tree nodes;
8349         struct extent_io_tree excluded_extents;
8350         struct cache_tree corrupt_blocks;
8351         struct btrfs_path path;
8352         struct btrfs_key key;
8353         struct btrfs_key found_key;
8354         int ret, err = 0;
8355         struct block_info *bits;
8356         int bits_nr;
8357         struct extent_buffer *leaf;
8358         int slot;
8359         struct btrfs_root_item ri;
8360         struct list_head dropping_trees;
8361         struct list_head normal_trees;
8362         struct btrfs_root *root1;
8363         u64 objectid;
8364         u32 level_size;
8365         u8 level;
8366
8367         dev_cache = RB_ROOT;
8368         cache_tree_init(&chunk_cache);
8369         block_group_tree_init(&block_group_cache);
8370         device_extent_tree_init(&dev_extent_cache);
8371
8372         cache_tree_init(&extent_cache);
8373         cache_tree_init(&seen);
8374         cache_tree_init(&pending);
8375         cache_tree_init(&nodes);
8376         cache_tree_init(&reada);
8377         cache_tree_init(&corrupt_blocks);
8378         extent_io_tree_init(&excluded_extents);
8379         INIT_LIST_HEAD(&dropping_trees);
8380         INIT_LIST_HEAD(&normal_trees);
8381
8382         if (repair) {
8383                 root->fs_info->excluded_extents = &excluded_extents;
8384                 root->fs_info->fsck_extent_cache = &extent_cache;
8385                 root->fs_info->free_extent_hook = free_extent_hook;
8386                 root->fs_info->corrupt_blocks = &corrupt_blocks;
8387         }
8388
8389         bits_nr = 1024;
8390         bits = malloc(bits_nr * sizeof(struct block_info));
8391         if (!bits) {
8392                 perror("malloc");
8393                 exit(1);
8394         }
8395
8396         if (ctx.progress_enabled) {
8397                 ctx.tp = TASK_EXTENTS;
8398                 task_start(ctx.info);
8399         }
8400
8401 again:
8402         root1 = root->fs_info->tree_root;
8403         level = btrfs_header_level(root1->node);
8404         ret = add_root_item_to_list(&normal_trees, root1->root_key.objectid,
8405                                     root1->node->start, 0, level, 0,
8406                                     root1->nodesize, NULL);
8407         if (ret < 0)
8408                 goto out;
8409         root1 = root->fs_info->chunk_root;
8410         level = btrfs_header_level(root1->node);
8411         ret = add_root_item_to_list(&normal_trees, root1->root_key.objectid,
8412                                     root1->node->start, 0, level, 0,
8413                                     root1->nodesize, NULL);
8414         if (ret < 0)
8415                 goto out;
8416         btrfs_init_path(&path);
8417         key.offset = 0;
8418         key.objectid = 0;
8419         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
8420         ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
8421                                         &key, &path, 0, 0);
8422         if (ret < 0)
8423                 goto out;
8424         while(1) {
8425                 leaf = path.nodes[0];
8426                 slot = path.slots[0];
8427                 if (slot >= btrfs_header_nritems(path.nodes[0])) {
8428                         ret = btrfs_next_leaf(root, &path);
8429                         if (ret != 0)
8430                                 break;
8431                         leaf = path.nodes[0];
8432                         slot = path.slots[0];
8433                 }
8434                 btrfs_item_key_to_cpu(leaf, &found_key, path.slots[0]);
8435                 if (btrfs_key_type(&found_key) == BTRFS_ROOT_ITEM_KEY) {
8436                         unsigned long offset;
8437                         u64 last_snapshot;
8438
8439                         offset = btrfs_item_ptr_offset(leaf, path.slots[0]);
8440                         read_extent_buffer(leaf, &ri, offset, sizeof(ri));
8441                         last_snapshot = btrfs_root_last_snapshot(&ri);
8442                         if (btrfs_disk_key_objectid(&ri.drop_progress) == 0) {
8443                                 level = btrfs_root_level(&ri);
8444                                 level_size = root->nodesize;
8445                                 ret = add_root_item_to_list(&normal_trees,
8446                                                 found_key.objectid,
8447                                                 btrfs_root_bytenr(&ri),
8448                                                 last_snapshot, level,
8449                                                 0, level_size, NULL);
8450                                 if (ret < 0)
8451                                         goto out;
8452                         } else {
8453                                 level = btrfs_root_level(&ri);
8454                                 level_size = root->nodesize;
8455                                 objectid = found_key.objectid;
8456                                 btrfs_disk_key_to_cpu(&found_key,
8457                                                       &ri.drop_progress);
8458                                 ret = add_root_item_to_list(&dropping_trees,
8459                                                 objectid,
8460                                                 btrfs_root_bytenr(&ri),
8461                                                 last_snapshot, level,
8462                                                 ri.drop_level,
8463                                                 level_size, &found_key);
8464                                 if (ret < 0)
8465                                         goto out;
8466                         }
8467                 }
8468                 path.slots[0]++;
8469         }
8470         btrfs_release_path(&path);
8471
8472         /*
8473          * check_block can return -EAGAIN if it fixes something, please keep
8474          * this in mind when dealing with return values from these functions, if
8475          * we get -EAGAIN we want to fall through and restart the loop.
8476          */
8477         ret = deal_root_from_list(&normal_trees, root, bits, bits_nr, &pending,
8478                                   &seen, &reada, &nodes, &extent_cache,
8479                                   &chunk_cache, &dev_cache, &block_group_cache,
8480                                   &dev_extent_cache);
8481         if (ret < 0) {
8482                 if (ret == -EAGAIN)
8483                         goto loop;
8484                 goto out;
8485         }
8486         ret = deal_root_from_list(&dropping_trees, root, bits, bits_nr,
8487                                   &pending, &seen, &reada, &nodes,
8488                                   &extent_cache, &chunk_cache, &dev_cache,
8489                                   &block_group_cache, &dev_extent_cache);
8490         if (ret < 0) {
8491                 if (ret == -EAGAIN)
8492                         goto loop;
8493                 goto out;
8494         }
8495
8496         ret = check_chunks(&chunk_cache, &block_group_cache,
8497                            &dev_extent_cache, NULL, NULL, NULL, 0);
8498         if (ret) {
8499                 if (ret == -EAGAIN)
8500                         goto loop;
8501                 err = ret;
8502         }
8503
8504         ret = check_extent_refs(root, &extent_cache);
8505         if (ret < 0) {
8506                 if (ret == -EAGAIN)
8507                         goto loop;
8508                 goto out;
8509         }
8510
8511         ret = check_devices(&dev_cache, &dev_extent_cache);
8512         if (ret && err)
8513                 ret = err;
8514
8515 out:
8516         task_stop(ctx.info);
8517         if (repair) {
8518                 free_corrupt_blocks_tree(root->fs_info->corrupt_blocks);
8519                 extent_io_tree_cleanup(&excluded_extents);
8520                 root->fs_info->fsck_extent_cache = NULL;
8521                 root->fs_info->free_extent_hook = NULL;
8522                 root->fs_info->corrupt_blocks = NULL;
8523                 root->fs_info->excluded_extents = NULL;
8524         }
8525         free(bits);
8526         free_chunk_cache_tree(&chunk_cache);
8527         free_device_cache_tree(&dev_cache);
8528         free_block_group_tree(&block_group_cache);
8529         free_device_extent_tree(&dev_extent_cache);
8530         free_extent_cache_tree(&seen);
8531         free_extent_cache_tree(&pending);
8532         free_extent_cache_tree(&reada);
8533         free_extent_cache_tree(&nodes);
8534         return ret;
8535 loop:
8536         free_corrupt_blocks_tree(root->fs_info->corrupt_blocks);
8537         free_extent_cache_tree(&seen);
8538         free_extent_cache_tree(&pending);
8539         free_extent_cache_tree(&reada);
8540         free_extent_cache_tree(&nodes);
8541         free_chunk_cache_tree(&chunk_cache);
8542         free_block_group_tree(&block_group_cache);
8543         free_device_cache_tree(&dev_cache);
8544         free_device_extent_tree(&dev_extent_cache);
8545         free_extent_record_cache(root->fs_info, &extent_cache);
8546         free_root_item_list(&normal_trees);
8547         free_root_item_list(&dropping_trees);
8548         extent_io_tree_cleanup(&excluded_extents);
8549         goto again;
8550 }
8551
8552 /*
8553  * Check backrefs of a tree block given by @bytenr or @eb.
8554  *
8555  * @root:       the root containing the @bytenr or @eb
8556  * @eb:         tree block extent buffer, can be NULL
8557  * @bytenr:     bytenr of the tree block to search
8558  * @level:      tree level of the tree block
8559  * @owner:      owner of the tree block
8560  *
8561  * Return >0 for any error found and output error message
8562  * Return 0 for no error found
8563  */
8564 static int check_tree_block_ref(struct btrfs_root *root,
8565                                 struct extent_buffer *eb, u64 bytenr,
8566                                 int level, u64 owner)
8567 {
8568         struct btrfs_key key;
8569         struct btrfs_root *extent_root = root->fs_info->extent_root;
8570         struct btrfs_path path;
8571         struct btrfs_extent_item *ei;
8572         struct btrfs_extent_inline_ref *iref;
8573         struct extent_buffer *leaf;
8574         unsigned long end;
8575         unsigned long ptr;
8576         int slot;
8577         int skinny_level;
8578         int type;
8579         u32 nodesize = root->nodesize;
8580         u32 item_size;
8581         u64 offset;
8582         int found_ref = 0;
8583         int err = 0;
8584         int ret;
8585
8586         btrfs_init_path(&path);
8587         key.objectid = bytenr;
8588         if (btrfs_fs_incompat(root->fs_info,
8589                               BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA))
8590                 key.type = BTRFS_METADATA_ITEM_KEY;
8591         else
8592                 key.type = BTRFS_EXTENT_ITEM_KEY;
8593         key.offset = (u64)-1;
8594
8595         /* Search for the backref in extent tree */
8596         ret = btrfs_search_slot(NULL, extent_root, &key, &path, 0, 0);
8597         if (ret < 0) {
8598                 err |= BACKREF_MISSING;
8599                 goto out;
8600         }
8601         ret = btrfs_previous_extent_item(extent_root, &path, bytenr);
8602         if (ret) {
8603                 err |= BACKREF_MISSING;
8604                 goto out;
8605         }
8606
8607         leaf = path.nodes[0];
8608         slot = path.slots[0];
8609         btrfs_item_key_to_cpu(leaf, &key, slot);
8610
8611         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
8612
8613         if (key.type == BTRFS_METADATA_ITEM_KEY) {
8614                 skinny_level = (int)key.offset;
8615                 iref = (struct btrfs_extent_inline_ref *)(ei + 1);
8616         } else {
8617                 struct btrfs_tree_block_info *info;
8618
8619                 info = (struct btrfs_tree_block_info *)(ei + 1);
8620                 skinny_level = btrfs_tree_block_level(leaf, info);
8621                 iref = (struct btrfs_extent_inline_ref *)(info + 1);
8622         }
8623
8624         if (eb) {
8625                 u64 header_gen;
8626                 u64 extent_gen;
8627
8628                 if (!(btrfs_extent_flags(leaf, ei) &
8629                       BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
8630                         error(
8631                 "extent[%llu %u] backref type mismatch, missing bit: %llx",
8632                                 key.objectid, nodesize,
8633                                 BTRFS_EXTENT_FLAG_TREE_BLOCK);
8634                         err = BACKREF_MISMATCH;
8635                 }
8636                 header_gen = btrfs_header_generation(eb);
8637                 extent_gen = btrfs_extent_generation(leaf, ei);
8638                 if (header_gen != extent_gen) {
8639                         error(
8640         "extent[%llu %u] backref generation mismatch, wanted: %llu, have: %llu",
8641                                 key.objectid, nodesize, header_gen,
8642                                 extent_gen);
8643                         err = BACKREF_MISMATCH;
8644                 }
8645                 if (level != skinny_level) {
8646                         error(
8647                         "extent[%llu %u] level mismatch, wanted: %u, have: %u",
8648                                 key.objectid, nodesize, level, skinny_level);
8649                         err = BACKREF_MISMATCH;
8650                 }
8651                 if (!is_fstree(owner) && btrfs_extent_refs(leaf, ei) != 1) {
8652                         error(
8653                         "extent[%llu %u] is referred by other roots than %llu",
8654                                 key.objectid, nodesize, root->objectid);
8655                         err = BACKREF_MISMATCH;
8656                 }
8657         }
8658
8659         /*
8660          * Iterate the extent/metadata item to find the exact backref
8661          */
8662         item_size = btrfs_item_size_nr(leaf, slot);
8663         ptr = (unsigned long)iref;
8664         end = (unsigned long)ei + item_size;
8665         while (ptr < end) {
8666                 iref = (struct btrfs_extent_inline_ref *)ptr;
8667                 type = btrfs_extent_inline_ref_type(leaf, iref);
8668                 offset = btrfs_extent_inline_ref_offset(leaf, iref);
8669
8670                 if (type == BTRFS_TREE_BLOCK_REF_KEY &&
8671                         (offset == root->objectid || offset == owner)) {
8672                         found_ref = 1;
8673                 } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
8674                         /* Check if the backref points to valid referencer */
8675                         found_ref = !check_tree_block_ref(root, NULL, offset,
8676                                                           level + 1, owner);
8677                 }
8678
8679                 if (found_ref)
8680                         break;
8681                 ptr += btrfs_extent_inline_ref_size(type);
8682         }
8683
8684         /*
8685          * Inlined extent item doesn't have what we need, check
8686          * TREE_BLOCK_REF_KEY
8687          */
8688         if (!found_ref) {
8689                 btrfs_release_path(&path);
8690                 key.objectid = bytenr;
8691                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
8692                 key.offset = root->objectid;
8693
8694                 ret = btrfs_search_slot(NULL, extent_root, &key, &path, 0, 0);
8695                 if (!ret)
8696                         found_ref = 1;
8697         }
8698         if (!found_ref)
8699                 err |= BACKREF_MISSING;
8700 out:
8701         btrfs_release_path(&path);
8702         if (eb && (err & BACKREF_MISSING))
8703                 error("extent[%llu %u] backref lost (owner: %llu, level: %u)",
8704                         bytenr, nodesize, owner, level);
8705         return err;
8706 }
8707
8708 /*
8709  * Check EXTENT_DATA item, mainly for its dbackref in extent tree
8710  *
8711  * Return >0 any error found and output error message
8712  * Return 0 for no error found
8713  */
8714 static int check_extent_data_item(struct btrfs_root *root,
8715                                   struct extent_buffer *eb, int slot)
8716 {
8717         struct btrfs_file_extent_item *fi;
8718         struct btrfs_path path;
8719         struct btrfs_root *extent_root = root->fs_info->extent_root;
8720         struct btrfs_key fi_key;
8721         struct btrfs_key dbref_key;
8722         struct extent_buffer *leaf;
8723         struct btrfs_extent_item *ei;
8724         struct btrfs_extent_inline_ref *iref;
8725         struct btrfs_extent_data_ref *dref;
8726         u64 owner;
8727         u64 file_extent_gen;
8728         u64 disk_bytenr;
8729         u64 disk_num_bytes;
8730         u64 extent_num_bytes;
8731         u64 extent_flags;
8732         u64 extent_gen;
8733         u32 item_size;
8734         unsigned long end;
8735         unsigned long ptr;
8736         int type;
8737         u64 ref_root;
8738         int found_dbackref = 0;
8739         int err = 0;
8740         int ret;
8741
8742         btrfs_item_key_to_cpu(eb, &fi_key, slot);
8743         fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
8744         file_extent_gen = btrfs_file_extent_generation(eb, fi);
8745
8746         /* Nothing to check for hole and inline data extents */
8747         if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE ||
8748             btrfs_file_extent_disk_bytenr(eb, fi) == 0)
8749                 return 0;
8750
8751         disk_bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
8752         disk_num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8753         extent_num_bytes = btrfs_file_extent_num_bytes(eb, fi);
8754
8755         /* Check unaligned disk_num_bytes and num_bytes */
8756         if (!IS_ALIGNED(disk_num_bytes, root->sectorsize)) {
8757                 error(
8758 "file extent [%llu, %llu] has unaligned disk num bytes: %llu, should be aligned to %u",
8759                         fi_key.objectid, fi_key.offset, disk_num_bytes,
8760                         root->sectorsize);
8761                 err |= BYTES_UNALIGNED;
8762         } else {
8763                 data_bytes_allocated += disk_num_bytes;
8764         }
8765         if (!IS_ALIGNED(extent_num_bytes, root->sectorsize)) {
8766                 error(
8767 "file extent [%llu, %llu] has unaligned num bytes: %llu, should be aligned to %u",
8768                         fi_key.objectid, fi_key.offset, extent_num_bytes,
8769                         root->sectorsize);
8770                 err |= BYTES_UNALIGNED;
8771         } else {
8772                 data_bytes_referenced += extent_num_bytes;
8773         }
8774         owner = btrfs_header_owner(eb);
8775
8776         /* Check the extent item of the file extent in extent tree */
8777         btrfs_init_path(&path);
8778         dbref_key.objectid = btrfs_file_extent_disk_bytenr(eb, fi);
8779         dbref_key.type = BTRFS_EXTENT_ITEM_KEY;
8780         dbref_key.offset = btrfs_file_extent_disk_num_bytes(eb, fi);
8781
8782         ret = btrfs_search_slot(NULL, extent_root, &dbref_key, &path, 0, 0);
8783         if (ret) {
8784                 err |= BACKREF_MISSING;
8785                 goto error;
8786         }
8787
8788         leaf = path.nodes[0];
8789         slot = path.slots[0];
8790         ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
8791
8792         extent_flags = btrfs_extent_flags(leaf, ei);
8793         extent_gen = btrfs_extent_generation(leaf, ei);
8794
8795         if (!(extent_flags & BTRFS_EXTENT_FLAG_DATA)) {
8796                 error(
8797                     "extent[%llu %llu] backref type mismatch, wanted bit: %llx",
8798                     disk_bytenr, disk_num_bytes,
8799                     BTRFS_EXTENT_FLAG_DATA);
8800                 err |= BACKREF_MISMATCH;
8801         }
8802
8803         if (file_extent_gen < extent_gen) {
8804                 error(
8805 "extent[%llu %llu] backref generation mismatch, wanted: <=%llu, have: %llu",
8806                         disk_bytenr, disk_num_bytes, file_extent_gen,
8807                         extent_gen);
8808                 err |= BACKREF_MISMATCH;
8809         }
8810
8811         /* Check data backref inside that extent item */
8812         item_size = btrfs_item_size_nr(leaf, path.slots[0]);
8813         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
8814         ptr = (unsigned long)iref;
8815         end = (unsigned long)ei + item_size;
8816         while (ptr < end) {
8817                 iref = (struct btrfs_extent_inline_ref *)ptr;
8818                 type = btrfs_extent_inline_ref_type(leaf, iref);
8819                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
8820
8821                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
8822                         ref_root = btrfs_extent_data_ref_root(leaf, dref);
8823                         if (ref_root == owner || ref_root == root->objectid)
8824                                 found_dbackref = 1;
8825                 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
8826                         found_dbackref = !check_tree_block_ref(root, NULL,
8827                                 btrfs_extent_inline_ref_offset(leaf, iref),
8828                                 0, owner);
8829                 }
8830
8831                 if (found_dbackref)
8832                         break;
8833                 ptr += btrfs_extent_inline_ref_size(type);
8834         }
8835
8836         /* Didn't found inlined data backref, try EXTENT_DATA_REF_KEY */
8837         if (!found_dbackref) {
8838                 btrfs_release_path(&path);
8839
8840                 btrfs_init_path(&path);
8841                 dbref_key.objectid = btrfs_file_extent_disk_bytenr(eb, fi);
8842                 dbref_key.type = BTRFS_EXTENT_DATA_REF_KEY;
8843                 dbref_key.offset = hash_extent_data_ref(root->objectid,
8844                                 fi_key.objectid, fi_key.offset);
8845
8846                 ret = btrfs_search_slot(NULL, root->fs_info->extent_root,
8847                                         &dbref_key, &path, 0, 0);
8848                 if (!ret)
8849                         found_dbackref = 1;
8850         }
8851
8852         if (!found_dbackref)
8853                 err |= BACKREF_MISSING;
8854 error:
8855         btrfs_release_path(&path);
8856         if (err & BACKREF_MISSING) {
8857                 error("data extent[%llu %llu] backref lost",
8858                       disk_bytenr, disk_num_bytes);
8859         }
8860         return err;
8861 }
8862
8863 /*
8864  * Get real tree block level for the case like shared block
8865  * Return >= 0 as tree level
8866  * Return <0 for error
8867  */
8868 static int query_tree_block_level(struct btrfs_fs_info *fs_info, u64 bytenr)
8869 {
8870         struct extent_buffer *eb;
8871         struct btrfs_path path;
8872         struct btrfs_key key;
8873         struct btrfs_extent_item *ei;
8874         u64 flags;
8875         u64 transid;
8876         u32 nodesize = btrfs_super_nodesize(fs_info->super_copy);
8877         u8 backref_level;
8878         u8 header_level;
8879         int ret;
8880
8881         /* Search extent tree for extent generation and level */
8882         key.objectid = bytenr;
8883         key.type = BTRFS_METADATA_ITEM_KEY;
8884         key.offset = (u64)-1;
8885
8886         btrfs_init_path(&path);
8887         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, &path, 0, 0);
8888         if (ret < 0)
8889                 goto release_out;
8890         ret = btrfs_previous_extent_item(fs_info->extent_root, &path, bytenr);
8891         if (ret < 0)
8892                 goto release_out;
8893         if (ret > 0) {
8894                 ret = -ENOENT;
8895                 goto release_out;
8896         }
8897
8898         btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
8899         ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
8900                             struct btrfs_extent_item);
8901         flags = btrfs_extent_flags(path.nodes[0], ei);
8902         if (!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
8903                 ret = -ENOENT;
8904                 goto release_out;
8905         }
8906
8907         /* Get transid for later read_tree_block() check */
8908         transid = btrfs_extent_generation(path.nodes[0], ei);
8909
8910         /* Get backref level as one source */
8911         if (key.type == BTRFS_METADATA_ITEM_KEY) {
8912                 backref_level = key.offset;
8913         } else {
8914                 struct btrfs_tree_block_info *info;
8915
8916                 info = (struct btrfs_tree_block_info *)(ei + 1);
8917                 backref_level = btrfs_tree_block_level(path.nodes[0], info);
8918         }
8919         btrfs_release_path(&path);
8920
8921         /* Get level from tree block as an alternative source */
8922         eb = read_tree_block_fs_info(fs_info, bytenr, nodesize, transid);
8923         if (!extent_buffer_uptodate(eb)) {
8924                 free_extent_buffer(eb);
8925                 return -EIO;
8926         }
8927         header_level = btrfs_header_level(eb);
8928         free_extent_buffer(eb);
8929
8930         if (header_level != backref_level)
8931                 return -EIO;
8932         return header_level;
8933
8934 release_out:
8935         btrfs_release_path(&path);
8936         return ret;
8937 }
8938
8939 static int btrfs_fsck_reinit_root(struct btrfs_trans_handle *trans,
8940                            struct btrfs_root *root, int overwrite)
8941 {
8942         struct extent_buffer *c;
8943         struct extent_buffer *old = root->node;
8944         int level;
8945         int ret;
8946         struct btrfs_disk_key disk_key = {0,0,0};
8947
8948         level = 0;
8949
8950         if (overwrite) {
8951                 c = old;
8952                 extent_buffer_get(c);
8953                 goto init;
8954         }
8955         c = btrfs_alloc_free_block(trans, root,
8956                                    root->nodesize,
8957                                    root->root_key.objectid,
8958                                    &disk_key, level, 0, 0);
8959         if (IS_ERR(c)) {
8960                 c = old;
8961                 extent_buffer_get(c);
8962                 overwrite = 1;
8963         }
8964 init:
8965         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
8966         btrfs_set_header_level(c, level);
8967         btrfs_set_header_bytenr(c, c->start);
8968         btrfs_set_header_generation(c, trans->transid);
8969         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
8970         btrfs_set_header_owner(c, root->root_key.objectid);
8971
8972         write_extent_buffer(c, root->fs_info->fsid,
8973                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
8974
8975         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
8976                             btrfs_header_chunk_tree_uuid(c),
8977                             BTRFS_UUID_SIZE);
8978
8979         btrfs_mark_buffer_dirty(c);
8980         /*
8981          * this case can happen in the following case:
8982          *
8983          * 1.overwrite previous root.
8984          *
8985          * 2.reinit reloc data root, this is because we skip pin
8986          * down reloc data tree before which means we can allocate
8987          * same block bytenr here.
8988          */
8989         if (old->start == c->start) {
8990                 btrfs_set_root_generation(&root->root_item,
8991                                           trans->transid);
8992                 root->root_item.level = btrfs_header_level(root->node);
8993                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
8994                                         &root->root_key, &root->root_item);
8995                 if (ret) {
8996                         free_extent_buffer(c);
8997                         return ret;
8998                 }
8999         }
9000         free_extent_buffer(old);
9001         root->node = c;
9002         add_root_to_dirty_list(root);
9003         return 0;
9004 }
9005
9006 static int pin_down_tree_blocks(struct btrfs_fs_info *fs_info,
9007                                 struct extent_buffer *eb, int tree_root)
9008 {
9009         struct extent_buffer *tmp;
9010         struct btrfs_root_item *ri;
9011         struct btrfs_key key;
9012         u64 bytenr;
9013         u32 nodesize;
9014         int level = btrfs_header_level(eb);
9015         int nritems;
9016         int ret;
9017         int i;
9018
9019         /*
9020          * If we have pinned this block before, don't pin it again.
9021          * This can not only avoid forever loop with broken filesystem
9022          * but also give us some speedups.
9023          */
9024         if (test_range_bit(&fs_info->pinned_extents, eb->start,
9025                            eb->start + eb->len - 1, EXTENT_DIRTY, 0))
9026                 return 0;
9027
9028         btrfs_pin_extent(fs_info, eb->start, eb->len);
9029
9030         nodesize = btrfs_super_nodesize(fs_info->super_copy);
9031         nritems = btrfs_header_nritems(eb);
9032         for (i = 0; i < nritems; i++) {
9033                 if (level == 0) {
9034                         btrfs_item_key_to_cpu(eb, &key, i);
9035                         if (key.type != BTRFS_ROOT_ITEM_KEY)
9036                                 continue;
9037                         /* Skip the extent root and reloc roots */
9038                         if (key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
9039                             key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
9040                             key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
9041                                 continue;
9042                         ri = btrfs_item_ptr(eb, i, struct btrfs_root_item);
9043                         bytenr = btrfs_disk_root_bytenr(eb, ri);
9044
9045                         /*
9046                          * If at any point we start needing the real root we
9047                          * will have to build a stump root for the root we are
9048                          * in, but for now this doesn't actually use the root so
9049                          * just pass in extent_root.
9050                          */
9051                         tmp = read_tree_block(fs_info->extent_root, bytenr,
9052                                               nodesize, 0);
9053                         if (!extent_buffer_uptodate(tmp)) {
9054                                 fprintf(stderr, "Error reading root block\n");
9055                                 return -EIO;
9056                         }
9057                         ret = pin_down_tree_blocks(fs_info, tmp, 0);
9058                         free_extent_buffer(tmp);
9059                         if (ret)
9060                                 return ret;
9061                 } else {
9062                         bytenr = btrfs_node_blockptr(eb, i);
9063
9064                         /* If we aren't the tree root don't read the block */
9065                         if (level == 1 && !tree_root) {
9066                                 btrfs_pin_extent(fs_info, bytenr, nodesize);
9067                                 continue;
9068                         }
9069
9070                         tmp = read_tree_block(fs_info->extent_root, bytenr,
9071                                               nodesize, 0);
9072                         if (!extent_buffer_uptodate(tmp)) {
9073                                 fprintf(stderr, "Error reading tree block\n");
9074                                 return -EIO;
9075                         }
9076                         ret = pin_down_tree_blocks(fs_info, tmp, tree_root);
9077                         free_extent_buffer(tmp);
9078                         if (ret)
9079                                 return ret;
9080                 }
9081         }
9082
9083         return 0;
9084 }
9085
9086 static int pin_metadata_blocks(struct btrfs_fs_info *fs_info)
9087 {
9088         int ret;
9089
9090         ret = pin_down_tree_blocks(fs_info, fs_info->chunk_root->node, 0);
9091         if (ret)
9092                 return ret;
9093
9094         return pin_down_tree_blocks(fs_info, fs_info->tree_root->node, 1);
9095 }
9096
9097 static int reset_block_groups(struct btrfs_fs_info *fs_info)
9098 {
9099         struct btrfs_block_group_cache *cache;
9100         struct btrfs_path *path;
9101         struct extent_buffer *leaf;
9102         struct btrfs_chunk *chunk;
9103         struct btrfs_key key;
9104         int ret;
9105         u64 start;
9106
9107         path = btrfs_alloc_path();
9108         if (!path)
9109                 return -ENOMEM;
9110
9111         key.objectid = 0;
9112         key.type = BTRFS_CHUNK_ITEM_KEY;
9113         key.offset = 0;
9114
9115         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
9116         if (ret < 0) {
9117                 btrfs_free_path(path);
9118                 return ret;
9119         }
9120
9121         /*
9122          * We do this in case the block groups were screwed up and had alloc
9123          * bits that aren't actually set on the chunks.  This happens with
9124          * restored images every time and could happen in real life I guess.
9125          */
9126         fs_info->avail_data_alloc_bits = 0;
9127         fs_info->avail_metadata_alloc_bits = 0;
9128         fs_info->avail_system_alloc_bits = 0;
9129
9130         /* First we need to create the in-memory block groups */
9131         while (1) {
9132                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
9133                         ret = btrfs_next_leaf(fs_info->chunk_root, path);
9134                         if (ret < 0) {
9135                                 btrfs_free_path(path);
9136                                 return ret;
9137                         }
9138                         if (ret) {
9139                                 ret = 0;
9140                                 break;
9141                         }
9142                 }
9143                 leaf = path->nodes[0];
9144                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
9145                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
9146                         path->slots[0]++;
9147                         continue;
9148                 }
9149
9150                 chunk = btrfs_item_ptr(leaf, path->slots[0],
9151                                        struct btrfs_chunk);
9152                 btrfs_add_block_group(fs_info, 0,
9153                                       btrfs_chunk_type(leaf, chunk),
9154                                       key.objectid, key.offset,
9155                                       btrfs_chunk_length(leaf, chunk));
9156                 set_extent_dirty(&fs_info->free_space_cache, key.offset,
9157                                  key.offset + btrfs_chunk_length(leaf, chunk),
9158                                  GFP_NOFS);
9159                 path->slots[0]++;
9160         }
9161         start = 0;
9162         while (1) {
9163                 cache = btrfs_lookup_first_block_group(fs_info, start);
9164                 if (!cache)
9165                         break;
9166                 cache->cached = 1;
9167                 start = cache->key.objectid + cache->key.offset;
9168         }
9169
9170         btrfs_free_path(path);
9171         return 0;
9172 }
9173
9174 static int reset_balance(struct btrfs_trans_handle *trans,
9175                          struct btrfs_fs_info *fs_info)
9176 {
9177         struct btrfs_root *root = fs_info->tree_root;
9178         struct btrfs_path *path;
9179         struct extent_buffer *leaf;
9180         struct btrfs_key key;
9181         int del_slot, del_nr = 0;
9182         int ret;
9183         int found = 0;
9184
9185         path = btrfs_alloc_path();
9186         if (!path)
9187                 return -ENOMEM;
9188
9189         key.objectid = BTRFS_BALANCE_OBJECTID;
9190         key.type = BTRFS_BALANCE_ITEM_KEY;
9191         key.offset = 0;
9192
9193         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9194         if (ret) {
9195                 if (ret > 0)
9196                         ret = 0;
9197                 if (!ret)
9198                         goto reinit_data_reloc;
9199                 else
9200                         goto out;
9201         }
9202
9203         ret = btrfs_del_item(trans, root, path);
9204         if (ret)
9205                 goto out;
9206         btrfs_release_path(path);
9207
9208         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
9209         key.type = BTRFS_ROOT_ITEM_KEY;
9210         key.offset = 0;
9211
9212         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9213         if (ret < 0)
9214                 goto out;
9215         while (1) {
9216                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
9217                         if (!found)
9218                                 break;
9219
9220                         if (del_nr) {
9221                                 ret = btrfs_del_items(trans, root, path,
9222                                                       del_slot, del_nr);
9223                                 del_nr = 0;
9224                                 if (ret)
9225                                         goto out;
9226                         }
9227                         key.offset++;
9228                         btrfs_release_path(path);
9229
9230                         found = 0;
9231                         ret = btrfs_search_slot(trans, root, &key, path,
9232                                                 -1, 1);
9233                         if (ret < 0)
9234                                 goto out;
9235                         continue;
9236                 }
9237                 found = 1;
9238                 leaf = path->nodes[0];
9239                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
9240                 if (key.objectid > BTRFS_TREE_RELOC_OBJECTID)
9241                         break;
9242                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9243                         path->slots[0]++;
9244                         continue;
9245                 }
9246                 if (!del_nr) {
9247                         del_slot = path->slots[0];
9248                         del_nr = 1;
9249                 } else {
9250                         del_nr++;
9251                 }
9252                 path->slots[0]++;
9253         }
9254
9255         if (del_nr) {
9256                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
9257                 if (ret)
9258                         goto out;
9259         }
9260         btrfs_release_path(path);
9261
9262 reinit_data_reloc:
9263         key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
9264         key.type = BTRFS_ROOT_ITEM_KEY;
9265         key.offset = (u64)-1;
9266         root = btrfs_read_fs_root(fs_info, &key);
9267         if (IS_ERR(root)) {
9268                 fprintf(stderr, "Error reading data reloc tree\n");
9269                 ret = PTR_ERR(root);
9270                 goto out;
9271         }
9272         record_root_in_trans(trans, root);
9273         ret = btrfs_fsck_reinit_root(trans, root, 0);
9274         if (ret)
9275                 goto out;
9276         ret = btrfs_make_root_dir(trans, root, BTRFS_FIRST_FREE_OBJECTID);
9277 out:
9278         btrfs_free_path(path);
9279         return ret;
9280 }
9281
9282 static int reinit_extent_tree(struct btrfs_trans_handle *trans,
9283                               struct btrfs_fs_info *fs_info)
9284 {
9285         u64 start = 0;
9286         int ret;
9287
9288         /*
9289          * The only reason we don't do this is because right now we're just
9290          * walking the trees we find and pinning down their bytes, we don't look
9291          * at any of the leaves.  In order to do mixed groups we'd have to check
9292          * the leaves of any fs roots and pin down the bytes for any file
9293          * extents we find.  Not hard but why do it if we don't have to?
9294          */
9295         if (btrfs_fs_incompat(fs_info, BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)) {
9296                 fprintf(stderr, "We don't support re-initing the extent tree "
9297                         "for mixed block groups yet, please notify a btrfs "
9298                         "developer you want to do this so they can add this "
9299                         "functionality.\n");
9300                 return -EINVAL;
9301         }
9302
9303         /*
9304          * first we need to walk all of the trees except the extent tree and pin
9305          * down the bytes that are in use so we don't overwrite any existing
9306          * metadata.
9307          */
9308         ret = pin_metadata_blocks(fs_info);
9309         if (ret) {
9310                 fprintf(stderr, "error pinning down used bytes\n");
9311                 return ret;
9312         }
9313
9314         /*
9315          * Need to drop all the block groups since we're going to recreate all
9316          * of them again.
9317          */
9318         btrfs_free_block_groups(fs_info);
9319         ret = reset_block_groups(fs_info);
9320         if (ret) {
9321                 fprintf(stderr, "error resetting the block groups\n");
9322                 return ret;
9323         }
9324
9325         /* Ok we can allocate now, reinit the extent root */
9326         ret = btrfs_fsck_reinit_root(trans, fs_info->extent_root, 0);
9327         if (ret) {
9328                 fprintf(stderr, "extent root initialization failed\n");
9329                 /*
9330                  * When the transaction code is updated we should end the
9331                  * transaction, but for now progs only knows about commit so
9332                  * just return an error.
9333                  */
9334                 return ret;
9335         }
9336
9337         /*
9338          * Now we have all the in-memory block groups setup so we can make
9339          * allocations properly, and the metadata we care about is safe since we
9340          * pinned all of it above.
9341          */
9342         while (1) {
9343                 struct btrfs_block_group_cache *cache;
9344
9345                 cache = btrfs_lookup_first_block_group(fs_info, start);
9346                 if (!cache)
9347                         break;
9348                 start = cache->key.objectid + cache->key.offset;
9349                 ret = btrfs_insert_item(trans, fs_info->extent_root,
9350                                         &cache->key, &cache->item,
9351                                         sizeof(cache->item));
9352                 if (ret) {
9353                         fprintf(stderr, "Error adding block group\n");
9354                         return ret;
9355                 }
9356                 btrfs_extent_post_op(trans, fs_info->extent_root);
9357         }
9358
9359         ret = reset_balance(trans, fs_info);
9360         if (ret)
9361                 fprintf(stderr, "error resetting the pending balance\n");
9362
9363         return ret;
9364 }
9365
9366 static int recow_extent_buffer(struct btrfs_root *root, struct extent_buffer *eb)
9367 {
9368         struct btrfs_path *path;
9369         struct btrfs_trans_handle *trans;
9370         struct btrfs_key key;
9371         int ret;
9372
9373         printf("Recowing metadata block %llu\n", eb->start);
9374         key.objectid = btrfs_header_owner(eb);
9375         key.type = BTRFS_ROOT_ITEM_KEY;
9376         key.offset = (u64)-1;
9377
9378         root = btrfs_read_fs_root(root->fs_info, &key);
9379         if (IS_ERR(root)) {
9380                 fprintf(stderr, "Couldn't find owner root %llu\n",
9381                         key.objectid);
9382                 return PTR_ERR(root);
9383         }
9384
9385         path = btrfs_alloc_path();
9386         if (!path)
9387                 return -ENOMEM;
9388
9389         trans = btrfs_start_transaction(root, 1);
9390         if (IS_ERR(trans)) {
9391                 btrfs_free_path(path);
9392                 return PTR_ERR(trans);
9393         }
9394
9395         path->lowest_level = btrfs_header_level(eb);
9396         if (path->lowest_level)
9397                 btrfs_node_key_to_cpu(eb, &key, 0);
9398         else
9399                 btrfs_item_key_to_cpu(eb, &key, 0);
9400
9401         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
9402         btrfs_commit_transaction(trans, root);
9403         btrfs_free_path(path);
9404         return ret;
9405 }
9406
9407 static int delete_bad_item(struct btrfs_root *root, struct bad_item *bad)
9408 {
9409         struct btrfs_path *path;
9410         struct btrfs_trans_handle *trans;
9411         struct btrfs_key key;
9412         int ret;
9413
9414         printf("Deleting bad item [%llu,%u,%llu]\n", bad->key.objectid,
9415                bad->key.type, bad->key.offset);
9416         key.objectid = bad->root_id;
9417         key.type = BTRFS_ROOT_ITEM_KEY;
9418         key.offset = (u64)-1;
9419
9420         root = btrfs_read_fs_root(root->fs_info, &key);
9421         if (IS_ERR(root)) {
9422                 fprintf(stderr, "Couldn't find owner root %llu\n",
9423                         key.objectid);
9424                 return PTR_ERR(root);
9425         }
9426
9427         path = btrfs_alloc_path();
9428         if (!path)
9429                 return -ENOMEM;
9430
9431         trans = btrfs_start_transaction(root, 1);
9432         if (IS_ERR(trans)) {
9433                 btrfs_free_path(path);
9434                 return PTR_ERR(trans);
9435         }
9436
9437         ret = btrfs_search_slot(trans, root, &bad->key, path, -1, 1);
9438         if (ret) {
9439                 if (ret > 0)
9440                         ret = 0;
9441                 goto out;
9442         }
9443         ret = btrfs_del_item(trans, root, path);
9444 out:
9445         btrfs_commit_transaction(trans, root);
9446         btrfs_free_path(path);
9447         return ret;
9448 }
9449
9450 static int zero_log_tree(struct btrfs_root *root)
9451 {
9452         struct btrfs_trans_handle *trans;
9453         int ret;
9454
9455         trans = btrfs_start_transaction(root, 1);
9456         if (IS_ERR(trans)) {
9457                 ret = PTR_ERR(trans);
9458                 return ret;
9459         }
9460         btrfs_set_super_log_root(root->fs_info->super_copy, 0);
9461         btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
9462         ret = btrfs_commit_transaction(trans, root);
9463         return ret;
9464 }
9465
9466 static int populate_csum(struct btrfs_trans_handle *trans,
9467                          struct btrfs_root *csum_root, char *buf, u64 start,
9468                          u64 len)
9469 {
9470         u64 offset = 0;
9471         u64 sectorsize;
9472         int ret = 0;
9473
9474         while (offset < len) {
9475                 sectorsize = csum_root->sectorsize;
9476                 ret = read_extent_data(csum_root, buf, start + offset,
9477                                        &sectorsize, 0);
9478                 if (ret)
9479                         break;
9480                 ret = btrfs_csum_file_block(trans, csum_root, start + len,
9481                                             start + offset, buf, sectorsize);
9482                 if (ret)
9483                         break;
9484                 offset += sectorsize;
9485         }
9486         return ret;
9487 }
9488
9489 static int fill_csum_tree_from_one_fs_root(struct btrfs_trans_handle *trans,
9490                                       struct btrfs_root *csum_root,
9491                                       struct btrfs_root *cur_root)
9492 {
9493         struct btrfs_path *path;
9494         struct btrfs_key key;
9495         struct extent_buffer *node;
9496         struct btrfs_file_extent_item *fi;
9497         char *buf = NULL;
9498         u64 start = 0;
9499         u64 len = 0;
9500         int slot = 0;
9501         int ret = 0;
9502
9503         path = btrfs_alloc_path();
9504         if (!path)
9505                 return -ENOMEM;
9506         buf = malloc(cur_root->fs_info->csum_root->sectorsize);
9507         if (!buf) {
9508                 ret = -ENOMEM;
9509                 goto out;
9510         }
9511
9512         key.objectid = 0;
9513         key.offset = 0;
9514         key.type = 0;
9515
9516         ret = btrfs_search_slot(NULL, cur_root, &key, path, 0, 0);
9517         if (ret < 0)
9518                 goto out;
9519         /* Iterate all regular file extents and fill its csum */
9520         while (1) {
9521                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
9522
9523                 if (key.type != BTRFS_EXTENT_DATA_KEY)
9524                         goto next;
9525                 node = path->nodes[0];
9526                 slot = path->slots[0];
9527                 fi = btrfs_item_ptr(node, slot, struct btrfs_file_extent_item);
9528                 if (btrfs_file_extent_type(node, fi) != BTRFS_FILE_EXTENT_REG)
9529                         goto next;
9530                 start = btrfs_file_extent_disk_bytenr(node, fi);
9531                 len = btrfs_file_extent_disk_num_bytes(node, fi);
9532
9533                 ret = populate_csum(trans, csum_root, buf, start, len);
9534                 if (ret == -EEXIST)
9535                         ret = 0;
9536                 if (ret < 0)
9537                         goto out;
9538 next:
9539                 /*
9540                  * TODO: if next leaf is corrupted, jump to nearest next valid
9541                  * leaf.
9542                  */
9543                 ret = btrfs_next_item(cur_root, path);
9544                 if (ret < 0)
9545                         goto out;
9546                 if (ret > 0) {
9547                         ret = 0;
9548                         goto out;
9549                 }
9550         }
9551
9552 out:
9553         btrfs_free_path(path);
9554         free(buf);
9555         return ret;
9556 }
9557
9558 static int fill_csum_tree_from_fs(struct btrfs_trans_handle *trans,
9559                                   struct btrfs_root *csum_root)
9560 {
9561         struct btrfs_fs_info *fs_info = csum_root->fs_info;
9562         struct btrfs_path *path;
9563         struct btrfs_root *tree_root = fs_info->tree_root;
9564         struct btrfs_root *cur_root;
9565         struct extent_buffer *node;
9566         struct btrfs_key key;
9567         int slot = 0;
9568         int ret = 0;
9569
9570         path = btrfs_alloc_path();
9571         if (!path)
9572                 return -ENOMEM;
9573
9574         key.objectid = BTRFS_FS_TREE_OBJECTID;
9575         key.offset = 0;
9576         key.type = BTRFS_ROOT_ITEM_KEY;
9577
9578         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
9579         if (ret < 0)
9580                 goto out;
9581         if (ret > 0) {
9582                 ret = -ENOENT;
9583                 goto out;
9584         }
9585
9586         while (1) {
9587                 node = path->nodes[0];
9588                 slot = path->slots[0];
9589                 btrfs_item_key_to_cpu(node, &key, slot);
9590                 if (key.objectid > BTRFS_LAST_FREE_OBJECTID)
9591                         goto out;
9592                 if (key.type != BTRFS_ROOT_ITEM_KEY)
9593                         goto next;
9594                 if (!is_fstree(key.objectid))
9595                         goto next;
9596                 key.offset = (u64)-1;
9597
9598                 cur_root = btrfs_read_fs_root(fs_info, &key);
9599                 if (IS_ERR(cur_root) || !cur_root) {
9600                         fprintf(stderr, "Fail to read fs/subvol tree: %lld\n",
9601                                 key.objectid);
9602                         goto out;
9603                 }
9604                 ret = fill_csum_tree_from_one_fs_root(trans, csum_root,
9605                                 cur_root);
9606                 if (ret < 0)
9607                         goto out;
9608 next:
9609                 ret = btrfs_next_item(tree_root, path);
9610                 if (ret > 0) {
9611                         ret = 0;
9612                         goto out;
9613                 }
9614                 if (ret < 0)
9615                         goto out;
9616         }
9617
9618 out:
9619         btrfs_free_path(path);
9620         return ret;
9621 }
9622
9623 static int fill_csum_tree_from_extent(struct btrfs_trans_handle *trans,
9624                                       struct btrfs_root *csum_root)
9625 {
9626         struct btrfs_root *extent_root = csum_root->fs_info->extent_root;
9627         struct btrfs_path *path;
9628         struct btrfs_extent_item *ei;
9629         struct extent_buffer *leaf;
9630         char *buf;
9631         struct btrfs_key key;
9632         int ret;
9633
9634         path = btrfs_alloc_path();
9635         if (!path)
9636                 return -ENOMEM;
9637
9638         key.objectid = 0;
9639         key.type = BTRFS_EXTENT_ITEM_KEY;
9640         key.offset = 0;
9641
9642         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
9643         if (ret < 0) {
9644                 btrfs_free_path(path);
9645                 return ret;
9646         }
9647
9648         buf = malloc(csum_root->sectorsize);
9649         if (!buf) {
9650                 btrfs_free_path(path);
9651                 return -ENOMEM;
9652         }
9653
9654         while (1) {
9655                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
9656                         ret = btrfs_next_leaf(extent_root, path);
9657                         if (ret < 0)
9658                                 break;
9659                         if (ret) {
9660                                 ret = 0;
9661                                 break;
9662                         }
9663                 }
9664                 leaf = path->nodes[0];
9665
9666                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
9667                 if (key.type != BTRFS_EXTENT_ITEM_KEY) {
9668                         path->slots[0]++;
9669                         continue;
9670                 }
9671
9672                 ei = btrfs_item_ptr(leaf, path->slots[0],
9673                                     struct btrfs_extent_item);
9674                 if (!(btrfs_extent_flags(leaf, ei) &
9675                       BTRFS_EXTENT_FLAG_DATA)) {
9676                         path->slots[0]++;
9677                         continue;
9678                 }
9679
9680                 ret = populate_csum(trans, csum_root, buf, key.objectid,
9681                                     key.offset);
9682                 if (ret)
9683                         break;
9684                 path->slots[0]++;
9685         }
9686
9687         btrfs_free_path(path);
9688         free(buf);
9689         return ret;
9690 }
9691
9692 /*
9693  * Recalculate the csum and put it into the csum tree.
9694  *
9695  * Extent tree init will wipe out all the extent info, so in that case, we
9696  * can't depend on extent tree, but use fs tree.  If search_fs_tree is set, we
9697  * will use fs/subvol trees to init the csum tree.
9698  */
9699 static int fill_csum_tree(struct btrfs_trans_handle *trans,
9700                           struct btrfs_root *csum_root,
9701                           int search_fs_tree)
9702 {
9703         if (search_fs_tree)
9704                 return fill_csum_tree_from_fs(trans, csum_root);
9705         else
9706                 return fill_csum_tree_from_extent(trans, csum_root);
9707 }
9708
9709 static void free_roots_info_cache(void)
9710 {
9711         if (!roots_info_cache)
9712                 return;
9713
9714         while (!cache_tree_empty(roots_info_cache)) {
9715                 struct cache_extent *entry;
9716                 struct root_item_info *rii;
9717
9718                 entry = first_cache_extent(roots_info_cache);
9719                 if (!entry)
9720                         break;
9721                 remove_cache_extent(roots_info_cache, entry);
9722                 rii = container_of(entry, struct root_item_info, cache_extent);
9723                 free(rii);
9724         }
9725
9726         free(roots_info_cache);
9727         roots_info_cache = NULL;
9728 }
9729
9730 static int build_roots_info_cache(struct btrfs_fs_info *info)
9731 {
9732         int ret = 0;
9733         struct btrfs_key key;
9734         struct extent_buffer *leaf;
9735         struct btrfs_path *path;
9736
9737         if (!roots_info_cache) {
9738                 roots_info_cache = malloc(sizeof(*roots_info_cache));
9739                 if (!roots_info_cache)
9740                         return -ENOMEM;
9741                 cache_tree_init(roots_info_cache);
9742         }
9743
9744         path = btrfs_alloc_path();
9745         if (!path)
9746                 return -ENOMEM;
9747
9748         key.objectid = 0;
9749         key.type = BTRFS_EXTENT_ITEM_KEY;
9750         key.offset = 0;
9751
9752         ret = btrfs_search_slot(NULL, info->extent_root, &key, path, 0, 0);
9753         if (ret < 0)
9754                 goto out;
9755         leaf = path->nodes[0];
9756
9757         while (1) {
9758                 struct btrfs_key found_key;
9759                 struct btrfs_extent_item *ei;
9760                 struct btrfs_extent_inline_ref *iref;
9761                 int slot = path->slots[0];
9762                 int type;
9763                 u64 flags;
9764                 u64 root_id;
9765                 u8 level;
9766                 struct cache_extent *entry;
9767                 struct root_item_info *rii;
9768
9769                 if (slot >= btrfs_header_nritems(leaf)) {
9770                         ret = btrfs_next_leaf(info->extent_root, path);
9771                         if (ret < 0) {
9772                                 break;
9773                         } else if (ret) {
9774                                 ret = 0;
9775                                 break;
9776                         }
9777                         leaf = path->nodes[0];
9778                         slot = path->slots[0];
9779                 }
9780
9781                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9782
9783                 if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
9784                     found_key.type != BTRFS_METADATA_ITEM_KEY)
9785                         goto next;
9786
9787                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
9788                 flags = btrfs_extent_flags(leaf, ei);
9789
9790                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
9791                     !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
9792                         goto next;
9793
9794                 if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
9795                         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
9796                         level = found_key.offset;
9797                 } else {
9798                         struct btrfs_tree_block_info *binfo;
9799
9800                         binfo = (struct btrfs_tree_block_info *)(ei + 1);
9801                         iref = (struct btrfs_extent_inline_ref *)(binfo + 1);
9802                         level = btrfs_tree_block_level(leaf, binfo);
9803                 }
9804
9805                 /*
9806                  * For a root extent, it must be of the following type and the
9807                  * first (and only one) iref in the item.
9808                  */
9809                 type = btrfs_extent_inline_ref_type(leaf, iref);
9810                 if (type != BTRFS_TREE_BLOCK_REF_KEY)
9811                         goto next;
9812
9813                 root_id = btrfs_extent_inline_ref_offset(leaf, iref);
9814                 entry = lookup_cache_extent(roots_info_cache, root_id, 1);
9815                 if (!entry) {
9816                         rii = malloc(sizeof(struct root_item_info));
9817                         if (!rii) {
9818                                 ret = -ENOMEM;
9819                                 goto out;
9820                         }
9821                         rii->cache_extent.start = root_id;
9822                         rii->cache_extent.size = 1;
9823                         rii->level = (u8)-1;
9824                         entry = &rii->cache_extent;
9825                         ret = insert_cache_extent(roots_info_cache, entry);
9826                         ASSERT(ret == 0);
9827                 } else {
9828                         rii = container_of(entry, struct root_item_info,
9829                                            cache_extent);
9830                 }
9831
9832                 ASSERT(rii->cache_extent.start == root_id);
9833                 ASSERT(rii->cache_extent.size == 1);
9834
9835                 if (level > rii->level || rii->level == (u8)-1) {
9836                         rii->level = level;
9837                         rii->bytenr = found_key.objectid;
9838                         rii->gen = btrfs_extent_generation(leaf, ei);
9839                         rii->node_count = 1;
9840                 } else if (level == rii->level) {
9841                         rii->node_count++;
9842                 }
9843 next:
9844                 path->slots[0]++;
9845         }
9846
9847 out:
9848         btrfs_free_path(path);
9849
9850         return ret;
9851 }
9852
9853 static int maybe_repair_root_item(struct btrfs_fs_info *info,
9854                                   struct btrfs_path *path,
9855                                   const struct btrfs_key *root_key,
9856                                   const int read_only_mode)
9857 {
9858         const u64 root_id = root_key->objectid;
9859         struct cache_extent *entry;
9860         struct root_item_info *rii;
9861         struct btrfs_root_item ri;
9862         unsigned long offset;
9863
9864         entry = lookup_cache_extent(roots_info_cache, root_id, 1);
9865         if (!entry) {
9866                 fprintf(stderr,
9867                         "Error: could not find extent items for root %llu\n",
9868                         root_key->objectid);
9869                 return -ENOENT;
9870         }
9871
9872         rii = container_of(entry, struct root_item_info, cache_extent);
9873         ASSERT(rii->cache_extent.start == root_id);
9874         ASSERT(rii->cache_extent.size == 1);
9875
9876         if (rii->node_count != 1) {
9877                 fprintf(stderr,
9878                         "Error: could not find btree root extent for root %llu\n",
9879                         root_id);
9880                 return -ENOENT;
9881         }
9882
9883         offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
9884         read_extent_buffer(path->nodes[0], &ri, offset, sizeof(ri));
9885
9886         if (btrfs_root_bytenr(&ri) != rii->bytenr ||
9887             btrfs_root_level(&ri) != rii->level ||
9888             btrfs_root_generation(&ri) != rii->gen) {
9889
9890                 /*
9891                  * If we're in repair mode but our caller told us to not update
9892                  * the root item, i.e. just check if it needs to be updated, don't
9893                  * print this message, since the caller will call us again shortly
9894                  * for the same root item without read only mode (the caller will
9895                  * open a transaction first).
9896                  */
9897                 if (!(read_only_mode && repair))
9898                         fprintf(stderr,
9899                                 "%sroot item for root %llu,"
9900                                 " current bytenr %llu, current gen %llu, current level %u,"
9901                                 " new bytenr %llu, new gen %llu, new level %u\n",
9902                                 (read_only_mode ? "" : "fixing "),
9903                                 root_id,
9904                                 btrfs_root_bytenr(&ri), btrfs_root_generation(&ri),
9905                                 btrfs_root_level(&ri),
9906                                 rii->bytenr, rii->gen, rii->level);
9907
9908                 if (btrfs_root_generation(&ri) > rii->gen) {
9909                         fprintf(stderr,
9910                                 "root %llu has a root item with a more recent gen (%llu) compared to the found root node (%llu)\n",
9911                                 root_id, btrfs_root_generation(&ri), rii->gen);
9912                         return -EINVAL;
9913                 }
9914
9915                 if (!read_only_mode) {
9916                         btrfs_set_root_bytenr(&ri, rii->bytenr);
9917                         btrfs_set_root_level(&ri, rii->level);
9918                         btrfs_set_root_generation(&ri, rii->gen);
9919                         write_extent_buffer(path->nodes[0], &ri,
9920                                             offset, sizeof(ri));
9921                 }
9922
9923                 return 1;
9924         }
9925
9926         return 0;
9927 }
9928
9929 /*
9930  * A regression introduced in the 3.17 kernel (more specifically in 3.17-rc2),
9931  * caused read-only snapshots to be corrupted if they were created at a moment
9932  * when the source subvolume/snapshot had orphan items. The issue was that the
9933  * on-disk root items became incorrect, referring to the pre orphan cleanup root
9934  * node instead of the post orphan cleanup root node.
9935  * So this function, and its callees, just detects and fixes those cases. Even
9936  * though the regression was for read-only snapshots, this function applies to
9937  * any snapshot/subvolume root.
9938  * This must be run before any other repair code - not doing it so, makes other
9939  * repair code delete or modify backrefs in the extent tree for example, which
9940  * will result in an inconsistent fs after repairing the root items.
9941  */
9942 static int repair_root_items(struct btrfs_fs_info *info)
9943 {
9944         struct btrfs_path *path = NULL;
9945         struct btrfs_key key;
9946         struct extent_buffer *leaf;
9947         struct btrfs_trans_handle *trans = NULL;
9948         int ret = 0;
9949         int bad_roots = 0;
9950         int need_trans = 0;
9951
9952         ret = build_roots_info_cache(info);
9953         if (ret)
9954                 goto out;
9955
9956         path = btrfs_alloc_path();
9957         if (!path) {
9958                 ret = -ENOMEM;
9959                 goto out;
9960         }
9961
9962         key.objectid = BTRFS_FIRST_FREE_OBJECTID;
9963         key.type = BTRFS_ROOT_ITEM_KEY;
9964         key.offset = 0;
9965
9966 again:
9967         /*
9968          * Avoid opening and committing transactions if a leaf doesn't have
9969          * any root items that need to be fixed, so that we avoid rotating
9970          * backup roots unnecessarily.
9971          */
9972         if (need_trans) {
9973                 trans = btrfs_start_transaction(info->tree_root, 1);
9974                 if (IS_ERR(trans)) {
9975                         ret = PTR_ERR(trans);
9976                         goto out;
9977                 }
9978         }
9979
9980         ret = btrfs_search_slot(trans, info->tree_root, &key, path,
9981                                 0, trans ? 1 : 0);
9982         if (ret < 0)
9983                 goto out;
9984         leaf = path->nodes[0];
9985
9986         while (1) {
9987                 struct btrfs_key found_key;
9988
9989                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
9990                         int no_more_keys = find_next_key(path, &key);
9991
9992                         btrfs_release_path(path);
9993                         if (trans) {
9994                                 ret = btrfs_commit_transaction(trans,
9995                                                                info->tree_root);
9996                                 trans = NULL;
9997                                 if (ret < 0)
9998                                         goto out;
9999                         }
10000                         need_trans = 0;
10001                         if (no_more_keys)
10002                                 break;
10003                         goto again;
10004                 }
10005
10006                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10007
10008                 if (found_key.type != BTRFS_ROOT_ITEM_KEY)
10009                         goto next;
10010                 if (found_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
10011                         goto next;
10012
10013                 ret = maybe_repair_root_item(info, path, &found_key,
10014                                              trans ? 0 : 1);
10015                 if (ret < 0)
10016                         goto out;
10017                 if (ret) {
10018                         if (!trans && repair) {
10019                                 need_trans = 1;
10020                                 key = found_key;
10021                                 btrfs_release_path(path);
10022                                 goto again;
10023                         }
10024                         bad_roots++;
10025                 }
10026 next:
10027                 path->slots[0]++;
10028         }
10029         ret = 0;
10030 out:
10031         free_roots_info_cache();
10032         btrfs_free_path(path);
10033         if (trans)
10034                 btrfs_commit_transaction(trans, info->tree_root);
10035         if (ret < 0)
10036                 return ret;
10037
10038         return bad_roots;
10039 }
10040
10041 const char * const cmd_check_usage[] = {
10042         "btrfs check [options] <device>",
10043         "Check structural integrity of a filesystem (unmounted).",
10044         "Check structural integrity of an unmounted filesystem. Verify internal",
10045         "trees' consistency and item connectivity. In the repair mode try to",
10046         "fix the problems found.",
10047         "WARNING: the repair mode is considered dangerous",
10048         "",
10049         "-s|--super <superblock>     use this superblock copy",
10050         "-b|--backup                 use the first valid backup root copy",
10051         "--repair                    try to repair the filesystem",
10052         "--readonly                  run in read-only mode (default)",
10053         "--init-csum-tree            create a new CRC tree",
10054         "--init-extent-tree          create a new extent tree",
10055         "--check-data-csum           verify checksums of data blocks",
10056         "-Q|--qgroup-report           print a report on qgroup consistency",
10057         "-E|--subvol-extents <subvolid>",
10058         "                            print subvolume extents and sharing state",
10059         "-r|--tree-root <bytenr>     use the given bytenr for the tree root",
10060         "--chunk-root <bytenr>       use the given bytenr for the chunk tree root",
10061         "-p|--progress               indicate progress",
10062         NULL
10063 };
10064
10065 int cmd_check(int argc, char **argv)
10066 {
10067         struct cache_tree root_cache;
10068         struct btrfs_root *root;
10069         struct btrfs_fs_info *info;
10070         u64 bytenr = 0;
10071         u64 subvolid = 0;
10072         u64 tree_root_bytenr = 0;
10073         u64 chunk_root_bytenr = 0;
10074         char uuidbuf[BTRFS_UUID_UNPARSED_SIZE];
10075         int ret;
10076         u64 num;
10077         int init_csum_tree = 0;
10078         int readonly = 0;
10079         int qgroup_report = 0;
10080         int qgroups_repaired = 0;
10081         enum btrfs_open_ctree_flags ctree_flags = OPEN_CTREE_EXCLUSIVE;
10082
10083         while(1) {
10084                 int c;
10085                 enum { GETOPT_VAL_REPAIR = 257, GETOPT_VAL_INIT_CSUM,
10086                         GETOPT_VAL_INIT_EXTENT, GETOPT_VAL_CHECK_CSUM,
10087                         GETOPT_VAL_READONLY, GETOPT_VAL_CHUNK_TREE };
10088                 static const struct option long_options[] = {
10089                         { "super", required_argument, NULL, 's' },
10090                         { "repair", no_argument, NULL, GETOPT_VAL_REPAIR },
10091                         { "readonly", no_argument, NULL, GETOPT_VAL_READONLY },
10092                         { "init-csum-tree", no_argument, NULL,
10093                                 GETOPT_VAL_INIT_CSUM },
10094                         { "init-extent-tree", no_argument, NULL,
10095                                 GETOPT_VAL_INIT_EXTENT },
10096                         { "check-data-csum", no_argument, NULL,
10097                                 GETOPT_VAL_CHECK_CSUM },
10098                         { "backup", no_argument, NULL, 'b' },
10099                         { "subvol-extents", required_argument, NULL, 'E' },
10100                         { "qgroup-report", no_argument, NULL, 'Q' },
10101                         { "tree-root", required_argument, NULL, 'r' },
10102                         { "chunk-root", required_argument, NULL,
10103                                 GETOPT_VAL_CHUNK_TREE },
10104                         { "progress", no_argument, NULL, 'p' },
10105                         { NULL, 0, NULL, 0}
10106                 };
10107
10108                 c = getopt_long(argc, argv, "as:br:p", long_options, NULL);
10109                 if (c < 0)
10110                         break;
10111                 switch(c) {
10112                         case 'a': /* ignored */ break;
10113                         case 'b':
10114                                 ctree_flags |= OPEN_CTREE_BACKUP_ROOT;
10115                                 break;
10116                         case 's':
10117                                 num = arg_strtou64(optarg);
10118                                 if (num >= BTRFS_SUPER_MIRROR_MAX) {
10119                                         fprintf(stderr,
10120                                                 "ERROR: super mirror should be less than: %d\n",
10121                                                 BTRFS_SUPER_MIRROR_MAX);
10122                                         exit(1);
10123                                 }
10124                                 bytenr = btrfs_sb_offset(((int)num));
10125                                 printf("using SB copy %llu, bytenr %llu\n", num,
10126                                        (unsigned long long)bytenr);
10127                                 break;
10128                         case 'Q':
10129                                 qgroup_report = 1;
10130                                 break;
10131                         case 'E':
10132                                 subvolid = arg_strtou64(optarg);
10133                                 break;
10134                         case 'r':
10135                                 tree_root_bytenr = arg_strtou64(optarg);
10136                                 break;
10137                         case GETOPT_VAL_CHUNK_TREE:
10138                                 chunk_root_bytenr = arg_strtou64(optarg);
10139                                 break;
10140                         case 'p':
10141                                 ctx.progress_enabled = true;
10142                                 break;
10143                         case '?':
10144                         case 'h':
10145                                 usage(cmd_check_usage);
10146                         case GETOPT_VAL_REPAIR:
10147                                 printf("enabling repair mode\n");
10148                                 repair = 1;
10149                                 ctree_flags |= OPEN_CTREE_WRITES;
10150                                 break;
10151                         case GETOPT_VAL_READONLY:
10152                                 readonly = 1;
10153                                 break;
10154                         case GETOPT_VAL_INIT_CSUM:
10155                                 printf("Creating a new CRC tree\n");
10156                                 init_csum_tree = 1;
10157                                 repair = 1;
10158                                 ctree_flags |= OPEN_CTREE_WRITES;
10159                                 break;
10160                         case GETOPT_VAL_INIT_EXTENT:
10161                                 init_extent_tree = 1;
10162                                 ctree_flags |= (OPEN_CTREE_WRITES |
10163                                                 OPEN_CTREE_NO_BLOCK_GROUPS);
10164                                 repair = 1;
10165                                 break;
10166                         case GETOPT_VAL_CHECK_CSUM:
10167                                 check_data_csum = 1;
10168                                 break;
10169                 }
10170         }
10171
10172         if (check_argc_exact(argc - optind, 1))
10173                 usage(cmd_check_usage);
10174
10175         if (ctx.progress_enabled) {
10176                 ctx.tp = TASK_NOTHING;
10177                 ctx.info = task_init(print_status_check, print_status_return, &ctx);
10178         }
10179
10180         /* This check is the only reason for --readonly to exist */
10181         if (readonly && repair) {
10182                 fprintf(stderr, "Repair options are not compatible with --readonly\n");
10183                 exit(1);
10184         }
10185
10186         radix_tree_init();
10187         cache_tree_init(&root_cache);
10188
10189         if((ret = check_mounted(argv[optind])) < 0) {
10190                 fprintf(stderr, "Could not check mount status: %s\n", strerror(-ret));
10191                 goto err_out;
10192         } else if(ret) {
10193                 fprintf(stderr, "%s is currently mounted. Aborting.\n", argv[optind]);
10194                 ret = -EBUSY;
10195                 goto err_out;
10196         }
10197
10198         /* only allow partial opening under repair mode */
10199         if (repair)
10200                 ctree_flags |= OPEN_CTREE_PARTIAL;
10201
10202         info = open_ctree_fs_info(argv[optind], bytenr, tree_root_bytenr,
10203                                   chunk_root_bytenr, ctree_flags);
10204         if (!info) {
10205                 fprintf(stderr, "Couldn't open file system\n");
10206                 ret = -EIO;
10207                 goto err_out;
10208         }
10209
10210         global_info = info;
10211         root = info->fs_root;
10212
10213         /*
10214          * repair mode will force us to commit transaction which
10215          * will make us fail to load log tree when mounting.
10216          */
10217         if (repair && btrfs_super_log_root(info->super_copy)) {
10218                 ret = ask_user("repair mode will force to clear out log tree, Are you sure?");
10219                 if (!ret) {
10220                         ret = 1;
10221                         goto close_out;
10222                 }
10223                 ret = zero_log_tree(root);
10224                 if (ret) {
10225                         fprintf(stderr, "fail to zero log tree\n");
10226                         goto close_out;
10227                 }
10228         }
10229
10230         uuid_unparse(info->super_copy->fsid, uuidbuf);
10231         if (qgroup_report) {
10232                 printf("Print quota groups for %s\nUUID: %s\n", argv[optind],
10233                        uuidbuf);
10234                 ret = qgroup_verify_all(info);
10235                 if (ret == 0)
10236                         report_qgroups(1);
10237                 goto close_out;
10238         }
10239         if (subvolid) {
10240                 printf("Print extent state for subvolume %llu on %s\nUUID: %s\n",
10241                        subvolid, argv[optind], uuidbuf);
10242                 ret = print_extent_state(info, subvolid);
10243                 goto close_out;
10244         }
10245         printf("Checking filesystem on %s\nUUID: %s\n", argv[optind], uuidbuf);
10246
10247         if (!extent_buffer_uptodate(info->tree_root->node) ||
10248             !extent_buffer_uptodate(info->dev_root->node) ||
10249             !extent_buffer_uptodate(info->chunk_root->node)) {
10250                 fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
10251                 ret = -EIO;
10252                 goto close_out;
10253         }
10254
10255         if (init_extent_tree || init_csum_tree) {
10256                 struct btrfs_trans_handle *trans;
10257
10258                 trans = btrfs_start_transaction(info->extent_root, 0);
10259                 if (IS_ERR(trans)) {
10260                         fprintf(stderr, "Error starting transaction\n");
10261                         ret = PTR_ERR(trans);
10262                         goto close_out;
10263                 }
10264
10265                 if (init_extent_tree) {
10266                         printf("Creating a new extent tree\n");
10267                         ret = reinit_extent_tree(trans, info);
10268                         if (ret)
10269                                 goto close_out;
10270                 }
10271
10272                 if (init_csum_tree) {
10273                         fprintf(stderr, "Reinit crc root\n");
10274                         ret = btrfs_fsck_reinit_root(trans, info->csum_root, 0);
10275                         if (ret) {
10276                                 fprintf(stderr, "crc root initialization failed\n");
10277                                 ret = -EIO;
10278                                 goto close_out;
10279                         }
10280
10281                         ret = fill_csum_tree(trans, info->csum_root,
10282                                              init_extent_tree);
10283                         if (ret) {
10284                                 fprintf(stderr, "crc refilling failed\n");
10285                                 return -EIO;
10286                         }
10287                 }
10288                 /*
10289                  * Ok now we commit and run the normal fsck, which will add
10290                  * extent entries for all of the items it finds.
10291                  */
10292                 ret = btrfs_commit_transaction(trans, info->extent_root);
10293                 if (ret)
10294                         goto close_out;
10295         }
10296         if (!extent_buffer_uptodate(info->extent_root->node)) {
10297                 fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
10298                 ret = -EIO;
10299                 goto close_out;
10300         }
10301         if (!extent_buffer_uptodate(info->csum_root->node)) {
10302                 fprintf(stderr, "Checksum root corrupted, rerun with --init-csum-tree option\n");
10303                 ret = -EIO;
10304                 goto close_out;
10305         }
10306
10307         if (!ctx.progress_enabled)
10308                 fprintf(stderr, "checking extents\n");
10309         ret = check_chunks_and_extents(root);
10310         if (ret)
10311                 fprintf(stderr, "Errors found in extent allocation tree or chunk allocation\n");
10312
10313         ret = repair_root_items(info);
10314         if (ret < 0)
10315                 goto close_out;
10316         if (repair) {
10317                 fprintf(stderr, "Fixed %d roots.\n", ret);
10318                 ret = 0;
10319         } else if (ret > 0) {
10320                 fprintf(stderr,
10321                        "Found %d roots with an outdated root item.\n",
10322                        ret);
10323                 fprintf(stderr,
10324                         "Please run a filesystem check with the option --repair to fix them.\n");
10325                 ret = 1;
10326                 goto close_out;
10327         }
10328
10329         if (!ctx.progress_enabled) {
10330                 if (btrfs_fs_compat_ro(info, BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE))
10331                         fprintf(stderr, "checking free space tree\n");
10332                 else
10333                         fprintf(stderr, "checking free space cache\n");
10334         }
10335         ret = check_space_cache(root);
10336         if (ret)
10337                 goto out;
10338
10339         /*
10340          * We used to have to have these hole extents in between our real
10341          * extents so if we don't have this flag set we need to make sure there
10342          * are no gaps in the file extents for inodes, otherwise we can just
10343          * ignore it when this happens.
10344          */
10345         no_holes = btrfs_fs_incompat(root->fs_info,
10346                                      BTRFS_FEATURE_INCOMPAT_NO_HOLES);
10347         if (!ctx.progress_enabled)
10348                 fprintf(stderr, "checking fs roots\n");
10349         ret = check_fs_roots(root, &root_cache);
10350         if (ret)
10351                 goto out;
10352
10353         fprintf(stderr, "checking csums\n");
10354         ret = check_csums(root);
10355         if (ret)
10356                 goto out;
10357
10358         fprintf(stderr, "checking root refs\n");
10359         ret = check_root_refs(root, &root_cache);
10360         if (ret)
10361                 goto out;
10362
10363         while (repair && !list_empty(&root->fs_info->recow_ebs)) {
10364                 struct extent_buffer *eb;
10365
10366                 eb = list_first_entry(&root->fs_info->recow_ebs,
10367                                       struct extent_buffer, recow);
10368                 list_del_init(&eb->recow);
10369                 ret = recow_extent_buffer(root, eb);
10370                 if (ret)
10371                         break;
10372         }
10373
10374         while (!list_empty(&delete_items)) {
10375                 struct bad_item *bad;
10376
10377                 bad = list_first_entry(&delete_items, struct bad_item, list);
10378                 list_del_init(&bad->list);
10379                 if (repair)
10380                         ret = delete_bad_item(root, bad);
10381                 free(bad);
10382         }
10383
10384         if (info->quota_enabled) {
10385                 int err;
10386                 fprintf(stderr, "checking quota groups\n");
10387                 err = qgroup_verify_all(info);
10388                 if (err)
10389                         goto out;
10390                 report_qgroups(0);
10391                 err = repair_qgroups(info, &qgroups_repaired);
10392                 if (err)
10393                         goto out;
10394         }
10395
10396         if (!list_empty(&root->fs_info->recow_ebs)) {
10397                 fprintf(stderr, "Transid errors in file system\n");
10398                 ret = 1;
10399         }
10400 out:
10401         /* Don't override original ret */
10402         if (!ret && qgroups_repaired)
10403                 ret = qgroups_repaired;
10404
10405         if (found_old_backref) { /*
10406                  * there was a disk format change when mixed
10407                  * backref was in testing tree. The old format
10408                  * existed about one week.
10409                  */
10410                 printf("\n * Found old mixed backref format. "
10411                        "The old format is not supported! *"
10412                        "\n * Please mount the FS in readonly mode, "
10413                        "backup data and re-format the FS. *\n\n");
10414                 ret = 1;
10415         }
10416         printf("found %llu bytes used err is %d\n",
10417                (unsigned long long)bytes_used, ret);
10418         printf("total csum bytes: %llu\n",(unsigned long long)total_csum_bytes);
10419         printf("total tree bytes: %llu\n",
10420                (unsigned long long)total_btree_bytes);
10421         printf("total fs tree bytes: %llu\n",
10422                (unsigned long long)total_fs_tree_bytes);
10423         printf("total extent tree bytes: %llu\n",
10424                (unsigned long long)total_extent_tree_bytes);
10425         printf("btree space waste bytes: %llu\n",
10426                (unsigned long long)btree_space_waste);
10427         printf("file data blocks allocated: %llu\n referenced %llu\n",
10428                 (unsigned long long)data_bytes_allocated,
10429                 (unsigned long long)data_bytes_referenced);
10430
10431         free_qgroup_counts();
10432         free_root_recs_tree(&root_cache);
10433 close_out:
10434         close_ctree(root);
10435 err_out:
10436         if (ctx.progress_enabled)
10437                 task_deinit(ctx.info);
10438
10439         return ret;
10440 }