31ed58945c7e23ab5c0dcbdff3120a06c0a88b41
[platform/upstream/btrfs-progs.git] / cmds-check.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <unistd.h>
22 #include <fcntl.h>
23 #include <sys/types.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
26 #include <getopt.h>
27 #include <uuid/uuid.h>
28 #include "ctree.h"
29 #include "volumes.h"
30 #include "repair.h"
31 #include "disk-io.h"
32 #include "print-tree.h"
33 #include "transaction.h"
34 #include "utils.h"
35 #include "commands.h"
36 #include "free-space-cache.h"
37 #include "btrfsck.h"
38 #include "qgroup-verify.h"
39 #include "rbtree-utils.h"
40 #include "backref.h"
41 #include "ulist.h"
42
/*
 * File-scope counters, work lists and option flags.
 *
 * NOTE(review): within the part of the file reviewed here only no_holes is
 * read (maybe_free_inode_rec() skips the file-extent-discount check when it
 * is set).  Where the other variables are written and consumed should be
 * confirmed against the rest of the file.
 */
static u64 bytes_used = 0;
static u64 total_csum_bytes = 0;
static u64 total_btree_bytes = 0;
static u64 total_fs_tree_bytes = 0;
static u64 total_extent_tree_bytes = 0;
static u64 btree_space_waste = 0;
static u64 data_bytes_allocated = 0;
static u64 data_bytes_referenced = 0;
static int found_old_backref = 0;
static LIST_HEAD(duplicate_extents);
static LIST_HEAD(delete_items);
static int repair = 0;
static int no_holes = 0;
static int init_extent_tree = 0;
static int check_data_csum = 0;
58
/*
 * Common header of a back reference record.  Both data_backref and
 * tree_backref embed it as their first member ("node").
 *
 * NOTE(review): the one-bit flags are not set or read in the part of the
 * file reviewed here; their meaning should be confirmed against the code
 * that uses them.
 */
struct extent_backref {
        struct list_head list;
        unsigned int is_data:1;
        unsigned int found_extent_tree:1;
        unsigned int full_backref:1;
        unsigned int found_ref:1;
        unsigned int broken:1;
};
67
/*
 * Back reference record for a data extent, built on the common
 * extent_backref header.
 *
 * "parent" and "root" share storage through the anonymous union, so only
 * one of them is meaningful for a given record.
 * NOTE(review): presumably node.full_backref tells which one - confirm.
 */
struct data_backref {
        struct extent_backref node;
        union {
                u64 parent;
                u64 root;
        };
        u64 owner;
        u64 offset;
        u64 disk_bytenr;
        u64 bytes;
        u64 ram_bytes;
        u32 num_refs;
        u32 found_ref;
};
82
/*
 * A trimmed-down counterpart of data_backref: the members that cannot be
 * determined are dropped and the record is linked through a plain list_head.
 * During the extent scan it is stored in root->orphan_data_extent.
 * During the fs tree scan it is then moved to the owning inode record
 * (the orphan_extents list of struct inode_record).
 */
struct orphan_data_extent {
        struct list_head list;
        u64 root;
        u64 objectid;
        u64 offset;
        u64 disk_bytenr;
        u64 disk_len;
};
97
/*
 * Back reference record for a tree block, built on the common
 * extent_backref header.
 *
 * "parent" and "root" share storage through the anonymous union, so only
 * one of them is meaningful for a given record.
 * NOTE(review): presumably node.full_backref tells which one - confirm.
 */
struct tree_backref {
        struct extent_backref node;
        union {
                u64 parent;
                u64 root;
        };
};
105
/*
 * Bookkeeping record for one extent.
 *
 * It carries three list heads (backrefs, dups, list) and an embedded
 * cache_extent.
 * NOTE(review): none of the members is accessed in the part of the file
 * reviewed here; presumably "backrefs" collects extent_backref entries and
 * "cache" indexes the record by [start, start + nr) - confirm against the
 * extent scanning code.
 */
struct extent_record {
        struct list_head backrefs;
        struct list_head dups;
        struct list_head list;
        struct cache_extent cache;
        struct btrfs_disk_key parent_key;
        u64 start;
        u64 max_size;
        u64 nr;
        u64 refs;
        u64 extent_item_refs;
        u64 generation;
        u64 parent_generation;
        u64 info_objectid;
        u32 num_duplicates;
        u8 info_level;
        int flag_block_full_backref;
        unsigned int found_rec:1;
        unsigned int content_checked:1;
        unsigned int owner_ref_checked:1;
        unsigned int is_root:1;
        unsigned int metadata:1;
        unsigned int bad_full_backref:1;
        unsigned int crossing_stripes:1;
};
131
/*
 * One (directory, name) reference to an inode, linked into
 * inode_record.backrefs.
 *
 * The found_* bits record which kinds of items have been seen for this
 * name; "errors" accumulates REF_ERR_* bits.
 *
 * The name is stored inline after the structure: get_inode_backref()
 * allocates sizeof(struct inode_backref) + namelen + 1 bytes and
 * NUL-terminates name[].
 */
struct inode_backref {
        struct list_head list;
        unsigned int found_dir_item:1;
        unsigned int found_dir_index:1;
        unsigned int found_inode_ref:1;
        unsigned int filetype:8;
        int errors;
        unsigned int ref_type;
        u64 dir;
        u64 index;
        u16 namelen;
        char name[0];
};
145
/*
 * List entry describing one root item.
 *
 * NOTE(review): not used in the part of the file reviewed here; the field
 * names suggest a snapshot of the interesting root item members (bytenr,
 * level, drop progress) - confirm against the code that fills it in.
 */
struct root_item_record {
        struct list_head list;
        u64 objectid;
        u64 bytenr;
        u64 last_snapshot;
        u8 level;
        u8 drop_level;
        int level_size;
        struct btrfs_key drop_key;
};
156
/*
 * Error bits stored in the "errors" member of a backref record and decoded
 * by print_ref_error().
 */
#define REF_ERR_NO_DIR_ITEM             (1 << 0)
#define REF_ERR_NO_DIR_INDEX            (1 << 1)
#define REF_ERR_NO_INODE_REF            (1 << 2)
#define REF_ERR_DUP_DIR_ITEM            (1 << 3)
#define REF_ERR_DUP_DIR_INDEX           (1 << 4)
#define REF_ERR_DUP_INODE_REF           (1 << 5)
#define REF_ERR_INDEX_UNMATCH           (1 << 6)
#define REF_ERR_FILETYPE_UNMATCH        (1 << 7)
#define REF_ERR_NAME_TOO_LONG           (1 << 8) /* 0x100 */
#define REF_ERR_NO_ROOT_REF             (1 << 9)
#define REF_ERR_NO_ROOT_BACKREF         (1 << 10)
#define REF_ERR_DUP_ROOT_REF            (1 << 11)
#define REF_ERR_DUP_ROOT_BACKREF        (1 << 12)
170
/*
 * One hole in a file's extent coverage, covering [start, start + len).
 * Holes live in an rb-tree (inode_record.holes) whose ordering is defined
 * by compare_hole().
 */
struct file_extent_hole {
        struct rb_node node;
        u64 start;
        u64 len;
};
176
177 /* Compatible function to allow reuse of old codes */
178 static u64 first_extent_gap(struct rb_root *holes)
179 {
180         struct file_extent_hole *hole;
181
182         if (RB_EMPTY_ROOT(holes))
183                 return (u64)-1;
184
185         hole = rb_entry(rb_first(holes), struct file_extent_hole, node);
186         return hole->start;
187 }
188
189 int compare_hole(struct rb_node *node1, struct rb_node *node2)
190 {
191         struct file_extent_hole *hole1;
192         struct file_extent_hole *hole2;
193
194         hole1 = rb_entry(node1, struct file_extent_hole, node);
195         hole2 = rb_entry(node2, struct file_extent_hole, node);
196
197         if (hole1->start > hole2->start)
198                 return -1;
199         if (hole1->start < hole2->start)
200                 return 1;
201         /* Now hole1->start == hole2->start */
202         if (hole1->len >= hole2->len)
203                 /*
204                  * Hole 1 will be merge center
205                  * Same hole will be merged later
206                  */
207                 return -1;
208         /* Hole 2 will be merge center */
209         return 1;
210 }
211
212 /*
213  * Add a hole to the record
214  *
215  * This will do hole merge for copy_file_extent_holes(),
216  * which will ensure there won't be continuous holes.
217  */
218 static int add_file_extent_hole(struct rb_root *holes,
219                                 u64 start, u64 len)
220 {
221         struct file_extent_hole *hole;
222         struct file_extent_hole *prev = NULL;
223         struct file_extent_hole *next = NULL;
224
225         hole = malloc(sizeof(*hole));
226         if (!hole)
227                 return -ENOMEM;
228         hole->start = start;
229         hole->len = len;
230         /* Since compare will not return 0, no -EEXIST will happen */
231         rb_insert(holes, &hole->node, compare_hole);
232
233         /* simple merge with previous hole */
234         if (rb_prev(&hole->node))
235                 prev = rb_entry(rb_prev(&hole->node), struct file_extent_hole,
236                                 node);
237         if (prev && prev->start + prev->len >= hole->start) {
238                 hole->len = hole->start + hole->len - prev->start;
239                 hole->start = prev->start;
240                 rb_erase(&prev->node, holes);
241                 free(prev);
242                 prev = NULL;
243         }
244
245         /* iterate merge with next holes */
246         while (1) {
247                 if (!rb_next(&hole->node))
248                         break;
249                 next = rb_entry(rb_next(&hole->node), struct file_extent_hole,
250                                         node);
251                 if (hole->start + hole->len >= next->start) {
252                         if (hole->start + hole->len <= next->start + next->len)
253                                 hole->len = next->start + next->len -
254                                             hole->start;
255                         rb_erase(&next->node, holes);
256                         free(next);
257                         next = NULL;
258                 } else
259                         break;
260         }
261         return 0;
262 }
263
264 static int compare_hole_range(struct rb_node *node, void *data)
265 {
266         struct file_extent_hole *hole;
267         u64 start;
268
269         hole = (struct file_extent_hole *)data;
270         start = hole->start;
271
272         hole = rb_entry(node, struct file_extent_hole, node);
273         if (start < hole->start)
274                 return -1;
275         if (start >= hole->start && start < hole->start + hole->len)
276                 return 0;
277         return 1;
278 }
279
280 /*
281  * Delete a hole in the record
282  *
283  * This will do the hole split and is much restrict than add.
284  */
285 static int del_file_extent_hole(struct rb_root *holes,
286                                 u64 start, u64 len)
287 {
288         struct file_extent_hole *hole;
289         struct file_extent_hole tmp;
290         u64 prev_start = 0;
291         u64 prev_len = 0;
292         u64 next_start = 0;
293         u64 next_len = 0;
294         struct rb_node *node;
295         int have_prev = 0;
296         int have_next = 0;
297         int ret = 0;
298
299         tmp.start = start;
300         tmp.len = len;
301         node = rb_search(holes, &tmp, compare_hole_range, NULL);
302         if (!node)
303                 return -EEXIST;
304         hole = rb_entry(node, struct file_extent_hole, node);
305         if (start + len > hole->start + hole->len)
306                 return -EEXIST;
307
308         /*
309          * Now there will be no overflap, delete the hole and re-add the
310          * split(s) if they exists.
311          */
312         if (start > hole->start) {
313                 prev_start = hole->start;
314                 prev_len = start - hole->start;
315                 have_prev = 1;
316         }
317         if (hole->start + hole->len > start + len) {
318                 next_start = start + len;
319                 next_len = hole->start + hole->len - start - len;
320                 have_next = 1;
321         }
322         rb_erase(node, holes);
323         free(hole);
324         if (have_prev) {
325                 ret = add_file_extent_hole(holes, prev_start, prev_len);
326                 if (ret < 0)
327                         return ret;
328         }
329         if (have_next) {
330                 ret = add_file_extent_hole(holes, next_start, next_len);
331                 if (ret < 0)
332                         return ret;
333         }
334         return 0;
335 }
336
337 static int copy_file_extent_holes(struct rb_root *dst,
338                                   struct rb_root *src)
339 {
340         struct file_extent_hole *hole;
341         struct rb_node *node;
342         int ret = 0;
343
344         node = rb_first(src);
345         while (node) {
346                 hole = rb_entry(node, struct file_extent_hole, node);
347                 ret = add_file_extent_hole(dst, hole->start, hole->len);
348                 if (ret)
349                         break;
350                 node = rb_next(node);
351         }
352         return ret;
353 }
354
355 static void free_file_extent_holes(struct rb_root *holes)
356 {
357         struct rb_node *node;
358         struct file_extent_hole *hole;
359
360         node = rb_first(holes);
361         while (node) {
362                 hole = rb_entry(node, struct file_extent_hole, node);
363                 rb_erase(node, holes);
364                 free(hole);
365                 node = rb_first(holes);
366         }
367 }
368
/*
 * Everything the checker has collected about one inode.
 *
 * Records may be shared: "refs" counts the users.  get_inode_rec() clones a
 * shared record before handing it out for modification and free_inode_rec()
 * releases the memory once the count drops to zero.
 */
struct inode_record {
        /* list of struct inode_backref, one per (dir, name) seen */
        struct list_head backrefs;
        unsigned int checked:1;
        unsigned int merging:1;
        unsigned int found_inode_item:1;
        unsigned int found_dir_item:1;
        unsigned int found_file_extent:1;
        unsigned int found_csum_item:1;
        unsigned int some_csum_missing:1;
        unsigned int nodatasum:1;
        /* I_ERR_* bits */
        int errors;

        /* ino is the cache key; the rest is read from the inode item */
        u64 ino;
        u32 nlink;
        u32 imode;
        u64 isize;
        u64 nbytes;

        /* values found while scanning, compared with the ones above */
        u32 found_link;
        u64 found_size;
        u64 extent_start;
        u64 extent_end;
        /* rb-tree of struct file_extent_hole */
        struct rb_root holes;
        /* list of struct orphan_data_extent */
        struct list_head orphan_extents;

        u32 refs;
};
396
/*
 * Error bits stored in inode_record.errors and decoded by
 * print_inode_error().
 */
#define I_ERR_NO_INODE_ITEM             (1 << 0)
#define I_ERR_NO_ORPHAN_ITEM            (1 << 1)
#define I_ERR_DUP_INODE_ITEM            (1 << 2)
#define I_ERR_DUP_DIR_INDEX             (1 << 3)
#define I_ERR_ODD_DIR_ITEM              (1 << 4)
#define I_ERR_ODD_FILE_EXTENT           (1 << 5)
#define I_ERR_BAD_FILE_EXTENT           (1 << 6)
#define I_ERR_FILE_EXTENT_OVERLAP       (1 << 7)
#define I_ERR_FILE_EXTENT_DISCOUNT      (1 << 8) /* 0x100 */
#define I_ERR_DIR_ISIZE_WRONG           (1 << 9)
#define I_ERR_FILE_NBYTES_WRONG         (1 << 10) /* 0x400 */
#define I_ERR_ODD_CSUM_ITEM             (1 << 11)
#define I_ERR_SOME_CSUM_MISSING         (1 << 12)
#define I_ERR_LINK_COUNT_WRONG          (1 << 13)
#define I_ERR_FILE_EXTENT_ORPHAN        (1 << 14)
412
/*
 * One named reference to a root, kept on a list.
 *
 * NOTE(review): not used in the part of the file reviewed here.  The layout
 * mirrors struct inode_backref (found_* bits, errors, trailing name[]), so
 * presumably "errors" holds REF_ERR_* bits and name[] is allocated inline
 * with namelen bytes plus a terminator - confirm against the code that
 * allocates it.
 */
struct root_backref {
        struct list_head list;
        unsigned int found_dir_item:1;
        unsigned int found_dir_index:1;
        unsigned int found_back_ref:1;
        unsigned int found_forward_ref:1;
        unsigned int reachable:1;
        int errors;
        u64 ref_root;
        u64 dir;
        u64 index;
        u16 namelen;
        char name[0];
};
427
/*
 * Per-root bookkeeping record with an embedded cache_extent.
 *
 * NOTE(review): not used in the part of the file reviewed here; presumably
 * "backrefs" is a list of struct root_backref and "cache" keys the record
 * by objectid - confirm.
 */
struct root_record {
        struct list_head backrefs;
        struct cache_extent cache;
        unsigned int found_root_item:1;
        u64 objectid;
        u32 found_ref;
};
435
/*
 * Cache tree entry that simply carries a pointer.  In an inode cache the
 * entry is keyed by inode number (start = ino, size = 1) and "data" points
 * to the struct inode_record, see get_inode_rec().
 */
struct ptr_node {
        struct cache_extent cache;
        void *data;
};
440
/*
 * Cache tree entry holding its own root cache and inode cache.
 * "current" is the inode record being processed; process_inode_item()
 * expects it to match the key it is given.
 *
 * NOTE(review): how "refs" and "root_cache" are maintained is not visible
 * in the part of the file reviewed here.
 */
struct shared_node {
        struct cache_extent cache;
        struct cache_tree root_cache;
        struct cache_tree inode_cache;
        struct inode_record *current;
        u32 refs;
};
448
/*
 * A (start, size) pair describing one block.
 * NOTE(review): units and users are not visible in the part of the file
 * reviewed here.
 */
struct block_info {
        u64 start;
        u32 size;
};
453
/*
 * State carried through a tree walk: a cache tree of shared nodes plus one
 * shared_node slot per possible tree level.
 *
 * NOTE(review): presumably active_node and root_level are indexes into
 * nodes[] - confirm against the walking code.
 */
struct walk_control {
        struct cache_tree shared;
        struct shared_node *nodes[BTRFS_MAX_LEVEL];
        int active_node;
        int root_level;
};
460
/*
 * List entry identifying one item by its key and the id of the root it
 * belongs to.
 * NOTE(review): presumably queued on the delete_items list - confirm.
 */
struct bad_item {
        struct btrfs_key key;
        u64 root_id;
        struct list_head list;
};
466
/* Forward declaration; being static, the definition lives in this file. */
static void reset_cached_block_groups(struct btrfs_fs_info *fs_info);
468
469 static void record_root_in_trans(struct btrfs_trans_handle *trans,
470                                  struct btrfs_root *root)
471 {
472         if (root->last_trans != trans->transid) {
473                 root->track_dirty = 1;
474                 root->last_trans = trans->transid;
475                 root->commit_root = root->node;
476                 extent_buffer_get(root->node);
477         }
478 }
479
480 static u8 imode_to_type(u32 imode)
481 {
482 #define S_SHIFT 12
483         static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
484                 [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
485                 [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
486                 [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
487                 [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
488                 [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
489                 [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
490                 [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
491         };
492
493         return btrfs_type_by_mode[(imode & S_IFMT) >> S_SHIFT];
494 #undef S_SHIFT
495 }
496
497 static int device_record_compare(struct rb_node *node1, struct rb_node *node2)
498 {
499         struct device_record *rec1;
500         struct device_record *rec2;
501
502         rec1 = rb_entry(node1, struct device_record, node);
503         rec2 = rb_entry(node2, struct device_record, node);
504         if (rec1->devid > rec2->devid)
505                 return -1;
506         else if (rec1->devid < rec2->devid)
507                 return 1;
508         else
509                 return 0;
510 }
511
512 static struct inode_record *clone_inode_rec(struct inode_record *orig_rec)
513 {
514         struct inode_record *rec;
515         struct inode_backref *backref;
516         struct inode_backref *orig;
517         struct orphan_data_extent *src_orphan;
518         struct orphan_data_extent *dst_orphan;
519         size_t size;
520         int ret;
521
522         rec = malloc(sizeof(*rec));
523         memcpy(rec, orig_rec, sizeof(*rec));
524         rec->refs = 1;
525         INIT_LIST_HEAD(&rec->backrefs);
526         INIT_LIST_HEAD(&rec->orphan_extents);
527         rec->holes = RB_ROOT;
528
529         list_for_each_entry(orig, &orig_rec->backrefs, list) {
530                 size = sizeof(*orig) + orig->namelen + 1;
531                 backref = malloc(size);
532                 memcpy(backref, orig, size);
533                 list_add_tail(&backref->list, &rec->backrefs);
534         }
535         list_for_each_entry(src_orphan, &orig_rec->orphan_extents, list) {
536                 dst_orphan = malloc(sizeof(*dst_orphan));
537                 /* TODO: Fix all the HELL of un-catched -ENOMEM case */
538                 BUG_ON(!dst_orphan);
539                 memcpy(dst_orphan, src_orphan, sizeof(*src_orphan));
540                 list_add_tail(&dst_orphan->list, &rec->orphan_extents);
541         }
542         ret = copy_file_extent_holes(&rec->holes, &orig_rec->holes);
543         BUG_ON(ret < 0);
544
545         return rec;
546 }
547
548 static void print_orphan_data_extents(struct list_head *orphan_extents,
549                                       u64 objectid)
550 {
551         struct orphan_data_extent *orphan;
552
553         if (list_empty(orphan_extents))
554                 return;
555         printf("The following data extent is lost in tree %llu:\n",
556                objectid);
557         list_for_each_entry(orphan, orphan_extents, list) {
558                 printf("\tinode: %llu, offset:%llu, disk_bytenr: %llu, disk_len: %llu\n",
559                        orphan->objectid, orphan->offset, orphan->disk_bytenr,
560                        orphan->disk_len);
561         }
562 }
563
564 static void print_inode_error(struct btrfs_root *root, struct inode_record *rec)
565 {
566         u64 root_objectid = root->root_key.objectid;
567         int errors = rec->errors;
568
569         if (!errors)
570                 return;
571         /* reloc root errors, we print its corresponding fs root objectid*/
572         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
573                 root_objectid = root->root_key.offset;
574                 fprintf(stderr, "reloc");
575         }
576         fprintf(stderr, "root %llu inode %llu errors %x",
577                 (unsigned long long) root_objectid,
578                 (unsigned long long) rec->ino, rec->errors);
579
580         if (errors & I_ERR_NO_INODE_ITEM)
581                 fprintf(stderr, ", no inode item");
582         if (errors & I_ERR_NO_ORPHAN_ITEM)
583                 fprintf(stderr, ", no orphan item");
584         if (errors & I_ERR_DUP_INODE_ITEM)
585                 fprintf(stderr, ", dup inode item");
586         if (errors & I_ERR_DUP_DIR_INDEX)
587                 fprintf(stderr, ", dup dir index");
588         if (errors & I_ERR_ODD_DIR_ITEM)
589                 fprintf(stderr, ", odd dir item");
590         if (errors & I_ERR_ODD_FILE_EXTENT)
591                 fprintf(stderr, ", odd file extent");
592         if (errors & I_ERR_BAD_FILE_EXTENT)
593                 fprintf(stderr, ", bad file extent");
594         if (errors & I_ERR_FILE_EXTENT_OVERLAP)
595                 fprintf(stderr, ", file extent overlap");
596         if (errors & I_ERR_FILE_EXTENT_DISCOUNT)
597                 fprintf(stderr, ", file extent discount");
598         if (errors & I_ERR_DIR_ISIZE_WRONG)
599                 fprintf(stderr, ", dir isize wrong");
600         if (errors & I_ERR_FILE_NBYTES_WRONG)
601                 fprintf(stderr, ", nbytes wrong");
602         if (errors & I_ERR_ODD_CSUM_ITEM)
603                 fprintf(stderr, ", odd csum item");
604         if (errors & I_ERR_SOME_CSUM_MISSING)
605                 fprintf(stderr, ", some csum missing");
606         if (errors & I_ERR_LINK_COUNT_WRONG)
607                 fprintf(stderr, ", link count wrong");
608         if (errors & I_ERR_FILE_EXTENT_ORPHAN)
609                 fprintf(stderr, ", orphan file extent");
610         fprintf(stderr, "\n");
611         /* Print the orphan extents if needed */
612         if (errors & I_ERR_FILE_EXTENT_ORPHAN)
613                 print_orphan_data_extents(&rec->orphan_extents, root->objectid);
614
615         /* Print the holes if needed */
616         if (errors & I_ERR_FILE_EXTENT_DISCOUNT) {
617                 struct file_extent_hole *hole;
618                 struct rb_node *node;
619                 int found = 0;
620
621                 node = rb_first(&rec->holes);
622                 fprintf(stderr, "Found file extent holes:\n");
623                 while (node) {
624                         found = 1;
625                         hole = rb_entry(node, struct file_extent_hole, node);
626                         fprintf(stderr, "\tstart: %llu, len: %llu\n",
627                                 hole->start, hole->len);
628                         node = rb_next(node);
629                 }
630                 if (!found)
631                         fprintf(stderr, "\tstart: 0, len: %llu\n",
632                                 round_up(rec->isize, root->sectorsize));
633         }
634 }
635
636 static void print_ref_error(int errors)
637 {
638         if (errors & REF_ERR_NO_DIR_ITEM)
639                 fprintf(stderr, ", no dir item");
640         if (errors & REF_ERR_NO_DIR_INDEX)
641                 fprintf(stderr, ", no dir index");
642         if (errors & REF_ERR_NO_INODE_REF)
643                 fprintf(stderr, ", no inode ref");
644         if (errors & REF_ERR_DUP_DIR_ITEM)
645                 fprintf(stderr, ", dup dir item");
646         if (errors & REF_ERR_DUP_DIR_INDEX)
647                 fprintf(stderr, ", dup dir index");
648         if (errors & REF_ERR_DUP_INODE_REF)
649                 fprintf(stderr, ", dup inode ref");
650         if (errors & REF_ERR_INDEX_UNMATCH)
651                 fprintf(stderr, ", index unmatch");
652         if (errors & REF_ERR_FILETYPE_UNMATCH)
653                 fprintf(stderr, ", filetype unmatch");
654         if (errors & REF_ERR_NAME_TOO_LONG)
655                 fprintf(stderr, ", name too long");
656         if (errors & REF_ERR_NO_ROOT_REF)
657                 fprintf(stderr, ", no root ref");
658         if (errors & REF_ERR_NO_ROOT_BACKREF)
659                 fprintf(stderr, ", no root backref");
660         if (errors & REF_ERR_DUP_ROOT_REF)
661                 fprintf(stderr, ", dup root ref");
662         if (errors & REF_ERR_DUP_ROOT_BACKREF)
663                 fprintf(stderr, ", dup root backref");
664         fprintf(stderr, "\n");
665 }
666
667 static struct inode_record *get_inode_rec(struct cache_tree *inode_cache,
668                                           u64 ino, int mod)
669 {
670         struct ptr_node *node;
671         struct cache_extent *cache;
672         struct inode_record *rec = NULL;
673         int ret;
674
675         cache = lookup_cache_extent(inode_cache, ino, 1);
676         if (cache) {
677                 node = container_of(cache, struct ptr_node, cache);
678                 rec = node->data;
679                 if (mod && rec->refs > 1) {
680                         node->data = clone_inode_rec(rec);
681                         rec->refs--;
682                         rec = node->data;
683                 }
684         } else if (mod) {
685                 rec = calloc(1, sizeof(*rec));
686                 rec->ino = ino;
687                 rec->extent_start = (u64)-1;
688                 rec->refs = 1;
689                 INIT_LIST_HEAD(&rec->backrefs);
690                 INIT_LIST_HEAD(&rec->orphan_extents);
691                 rec->holes = RB_ROOT;
692
693                 node = malloc(sizeof(*node));
694                 node->cache.start = ino;
695                 node->cache.size = 1;
696                 node->data = rec;
697
698                 if (ino == BTRFS_FREE_INO_OBJECTID)
699                         rec->found_link = 1;
700
701                 ret = insert_cache_extent(inode_cache, &node->cache);
702                 BUG_ON(ret);
703         }
704         return rec;
705 }
706
707 static void free_orphan_data_extents(struct list_head *orphan_extents)
708 {
709         struct orphan_data_extent *orphan;
710
711         while (!list_empty(orphan_extents)) {
712                 orphan = list_entry(orphan_extents->next,
713                                     struct orphan_data_extent, list);
714                 list_del(&orphan->list);
715                 free(orphan);
716         }
717 }
718
719 static void free_inode_rec(struct inode_record *rec)
720 {
721         struct inode_backref *backref;
722
723         if (--rec->refs > 0)
724                 return;
725
726         while (!list_empty(&rec->backrefs)) {
727                 backref = list_entry(rec->backrefs.next,
728                                      struct inode_backref, list);
729                 list_del(&backref->list);
730                 free(backref);
731         }
732         free_orphan_data_extents(&rec->orphan_extents);
733         free_file_extent_holes(&rec->holes);
734         free(rec);
735 }
736
737 static int can_free_inode_rec(struct inode_record *rec)
738 {
739         if (!rec->errors && rec->checked && rec->found_inode_item &&
740             rec->nlink == rec->found_link && list_empty(&rec->backrefs))
741                 return 1;
742         return 0;
743 }
744
745 static void maybe_free_inode_rec(struct cache_tree *inode_cache,
746                                  struct inode_record *rec)
747 {
748         struct cache_extent *cache;
749         struct inode_backref *tmp, *backref;
750         struct ptr_node *node;
751         unsigned char filetype;
752
753         if (!rec->found_inode_item)
754                 return;
755
756         filetype = imode_to_type(rec->imode);
757         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
758                 if (backref->found_dir_item && backref->found_dir_index) {
759                         if (backref->filetype != filetype)
760                                 backref->errors |= REF_ERR_FILETYPE_UNMATCH;
761                         if (!backref->errors && backref->found_inode_ref) {
762                                 list_del(&backref->list);
763                                 free(backref);
764                         }
765                 }
766         }
767
768         if (!rec->checked || rec->merging)
769                 return;
770
771         if (S_ISDIR(rec->imode)) {
772                 if (rec->found_size != rec->isize)
773                         rec->errors |= I_ERR_DIR_ISIZE_WRONG;
774                 if (rec->found_file_extent)
775                         rec->errors |= I_ERR_ODD_FILE_EXTENT;
776         } else if (S_ISREG(rec->imode) || S_ISLNK(rec->imode)) {
777                 if (rec->found_dir_item)
778                         rec->errors |= I_ERR_ODD_DIR_ITEM;
779                 if (rec->found_size != rec->nbytes)
780                         rec->errors |= I_ERR_FILE_NBYTES_WRONG;
781                 if (rec->nlink > 0 && !no_holes &&
782                     (rec->extent_end < rec->isize ||
783                      first_extent_gap(&rec->holes) < rec->isize))
784                         rec->errors |= I_ERR_FILE_EXTENT_DISCOUNT;
785         }
786
787         if (S_ISREG(rec->imode) || S_ISLNK(rec->imode)) {
788                 if (rec->found_csum_item && rec->nodatasum)
789                         rec->errors |= I_ERR_ODD_CSUM_ITEM;
790                 if (rec->some_csum_missing && !rec->nodatasum)
791                         rec->errors |= I_ERR_SOME_CSUM_MISSING;
792         }
793
794         BUG_ON(rec->refs != 1);
795         if (can_free_inode_rec(rec)) {
796                 cache = lookup_cache_extent(inode_cache, rec->ino, 1);
797                 node = container_of(cache, struct ptr_node, cache);
798                 BUG_ON(node->data != rec);
799                 remove_cache_extent(inode_cache, &node->cache);
800                 free(node);
801                 free_inode_rec(rec);
802         }
803 }
804
805 static int check_orphan_item(struct btrfs_root *root, u64 ino)
806 {
807         struct btrfs_path path;
808         struct btrfs_key key;
809         int ret;
810
811         key.objectid = BTRFS_ORPHAN_OBJECTID;
812         key.type = BTRFS_ORPHAN_ITEM_KEY;
813         key.offset = ino;
814
815         btrfs_init_path(&path);
816         ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
817         btrfs_release_path(&path);
818         if (ret > 0)
819                 ret = -ENOENT;
820         return ret;
821 }
822
823 static int process_inode_item(struct extent_buffer *eb,
824                               int slot, struct btrfs_key *key,
825                               struct shared_node *active_node)
826 {
827         struct inode_record *rec;
828         struct btrfs_inode_item *item;
829
830         rec = active_node->current;
831         BUG_ON(rec->ino != key->objectid || rec->refs > 1);
832         if (rec->found_inode_item) {
833                 rec->errors |= I_ERR_DUP_INODE_ITEM;
834                 return 1;
835         }
836         item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
837         rec->nlink = btrfs_inode_nlink(eb, item);
838         rec->isize = btrfs_inode_size(eb, item);
839         rec->nbytes = btrfs_inode_nbytes(eb, item);
840         rec->imode = btrfs_inode_mode(eb, item);
841         if (btrfs_inode_flags(eb, item) & BTRFS_INODE_NODATASUM)
842                 rec->nodatasum = 1;
843         rec->found_inode_item = 1;
844         if (rec->nlink == 0)
845                 rec->errors |= I_ERR_NO_ORPHAN_ITEM;
846         maybe_free_inode_rec(&active_node->inode_cache, rec);
847         return 0;
848 }
849
850 static struct inode_backref *get_inode_backref(struct inode_record *rec,
851                                                 const char *name,
852                                                 int namelen, u64 dir)
853 {
854         struct inode_backref *backref;
855
856         list_for_each_entry(backref, &rec->backrefs, list) {
857                 if (rec->ino == BTRFS_MULTIPLE_OBJECTIDS)
858                         break;
859                 if (backref->dir != dir || backref->namelen != namelen)
860                         continue;
861                 if (memcmp(name, backref->name, namelen))
862                         continue;
863                 return backref;
864         }
865
866         backref = malloc(sizeof(*backref) + namelen + 1);
867         memset(backref, 0, sizeof(*backref));
868         backref->dir = dir;
869         backref->namelen = namelen;
870         memcpy(backref->name, name, namelen);
871         backref->name[namelen] = '\0';
872         list_add_tail(&backref->list, &rec->backrefs);
873         return backref;
874 }
875
876 static int add_inode_backref(struct cache_tree *inode_cache,
877                              u64 ino, u64 dir, u64 index,
878                              const char *name, int namelen,
879                              int filetype, int itemtype, int errors)
880 {
881         struct inode_record *rec;
882         struct inode_backref *backref;
883
884         rec = get_inode_rec(inode_cache, ino, 1);
885         backref = get_inode_backref(rec, name, namelen, dir);
886         if (errors)
887                 backref->errors |= errors;
888         if (itemtype == BTRFS_DIR_INDEX_KEY) {
889                 if (backref->found_dir_index)
890                         backref->errors |= REF_ERR_DUP_DIR_INDEX;
891                 if (backref->found_inode_ref && backref->index != index)
892                         backref->errors |= REF_ERR_INDEX_UNMATCH;
893                 if (backref->found_dir_item && backref->filetype != filetype)
894                         backref->errors |= REF_ERR_FILETYPE_UNMATCH;
895
896                 backref->index = index;
897                 backref->filetype = filetype;
898                 backref->found_dir_index = 1;
899         } else if (itemtype == BTRFS_DIR_ITEM_KEY) {
900                 rec->found_link++;
901                 if (backref->found_dir_item)
902                         backref->errors |= REF_ERR_DUP_DIR_ITEM;
903                 if (backref->found_dir_index && backref->filetype != filetype)
904                         backref->errors |= REF_ERR_FILETYPE_UNMATCH;
905
906                 backref->filetype = filetype;
907                 backref->found_dir_item = 1;
908         } else if ((itemtype == BTRFS_INODE_REF_KEY) ||
909                    (itemtype == BTRFS_INODE_EXTREF_KEY)) {
910                 if (backref->found_inode_ref)
911                         backref->errors |= REF_ERR_DUP_INODE_REF;
912                 if (backref->found_dir_index && backref->index != index)
913                         backref->errors |= REF_ERR_INDEX_UNMATCH;
914                 else
915                         backref->index = index;
916
917                 backref->ref_type = itemtype;
918                 backref->found_inode_ref = 1;
919         } else {
920                 BUG_ON(1);
921         }
922
923         maybe_free_inode_rec(inode_cache, rec);
924         return 0;
925 }
926
927 static int merge_inode_recs(struct inode_record *src, struct inode_record *dst,
928                             struct cache_tree *dst_cache)
929 {
930         struct inode_backref *backref;
931         u32 dir_count = 0;
932         int ret = 0;
933
934         dst->merging = 1;
935         list_for_each_entry(backref, &src->backrefs, list) {
936                 if (backref->found_dir_index) {
937                         add_inode_backref(dst_cache, dst->ino, backref->dir,
938                                         backref->index, backref->name,
939                                         backref->namelen, backref->filetype,
940                                         BTRFS_DIR_INDEX_KEY, backref->errors);
941                 }
942                 if (backref->found_dir_item) {
943                         dir_count++;
944                         add_inode_backref(dst_cache, dst->ino,
945                                         backref->dir, 0, backref->name,
946                                         backref->namelen, backref->filetype,
947                                         BTRFS_DIR_ITEM_KEY, backref->errors);
948                 }
949                 if (backref->found_inode_ref) {
950                         add_inode_backref(dst_cache, dst->ino,
951                                         backref->dir, backref->index,
952                                         backref->name, backref->namelen, 0,
953                                         backref->ref_type, backref->errors);
954                 }
955         }
956
957         if (src->found_dir_item)
958                 dst->found_dir_item = 1;
959         if (src->found_file_extent)
960                 dst->found_file_extent = 1;
961         if (src->found_csum_item)
962                 dst->found_csum_item = 1;
963         if (src->some_csum_missing)
964                 dst->some_csum_missing = 1;
965         if (first_extent_gap(&dst->holes) > first_extent_gap(&src->holes)) {
966                 ret = copy_file_extent_holes(&dst->holes, &src->holes);
967                 if (ret < 0)
968                         return ret;
969         }
970
971         BUG_ON(src->found_link < dir_count);
972         dst->found_link += src->found_link - dir_count;
973         dst->found_size += src->found_size;
974         if (src->extent_start != (u64)-1) {
975                 if (dst->extent_start == (u64)-1) {
976                         dst->extent_start = src->extent_start;
977                         dst->extent_end = src->extent_end;
978                 } else {
979                         if (dst->extent_end > src->extent_start)
980                                 dst->errors |= I_ERR_FILE_EXTENT_OVERLAP;
981                         else if (dst->extent_end < src->extent_start) {
982                                 ret = add_file_extent_hole(&dst->holes,
983                                         dst->extent_end,
984                                         src->extent_start - dst->extent_end);
985                         }
986                         if (dst->extent_end < src->extent_end)
987                                 dst->extent_end = src->extent_end;
988                 }
989         }
990
991         dst->errors |= src->errors;
992         if (src->found_inode_item) {
993                 if (!dst->found_inode_item) {
994                         dst->nlink = src->nlink;
995                         dst->isize = src->isize;
996                         dst->nbytes = src->nbytes;
997                         dst->imode = src->imode;
998                         dst->nodatasum = src->nodatasum;
999                         dst->found_inode_item = 1;
1000                 } else {
1001                         dst->errors |= I_ERR_DUP_INODE_ITEM;
1002                 }
1003         }
1004         dst->merging = 0;
1005
1006         return 0;
1007 }
1008
1009 static int splice_shared_node(struct shared_node *src_node,
1010                               struct shared_node *dst_node)
1011 {
1012         struct cache_extent *cache;
1013         struct ptr_node *node, *ins;
1014         struct cache_tree *src, *dst;
1015         struct inode_record *rec, *conflict;
1016         u64 current_ino = 0;
1017         int splice = 0;
1018         int ret;
1019
1020         if (--src_node->refs == 0)
1021                 splice = 1;
1022         if (src_node->current)
1023                 current_ino = src_node->current->ino;
1024
1025         src = &src_node->root_cache;
1026         dst = &dst_node->root_cache;
1027 again:
1028         cache = search_cache_extent(src, 0);
1029         while (cache) {
1030                 node = container_of(cache, struct ptr_node, cache);
1031                 rec = node->data;
1032                 cache = next_cache_extent(cache);
1033
1034                 if (splice) {
1035                         remove_cache_extent(src, &node->cache);
1036                         ins = node;
1037                 } else {
1038                         ins = malloc(sizeof(*ins));
1039                         ins->cache.start = node->cache.start;
1040                         ins->cache.size = node->cache.size;
1041                         ins->data = rec;
1042                         rec->refs++;
1043                 }
1044                 ret = insert_cache_extent(dst, &ins->cache);
1045                 if (ret == -EEXIST) {
1046                         conflict = get_inode_rec(dst, rec->ino, 1);
1047                         merge_inode_recs(rec, conflict, dst);
1048                         if (rec->checked) {
1049                                 conflict->checked = 1;
1050                                 if (dst_node->current == conflict)
1051                                         dst_node->current = NULL;
1052                         }
1053                         maybe_free_inode_rec(dst, conflict);
1054                         free_inode_rec(rec);
1055                         free(ins);
1056                 } else {
1057                         BUG_ON(ret);
1058                 }
1059         }
1060
1061         if (src == &src_node->root_cache) {
1062                 src = &src_node->inode_cache;
1063                 dst = &dst_node->inode_cache;
1064                 goto again;
1065         }
1066
1067         if (current_ino > 0 && (!dst_node->current ||
1068             current_ino > dst_node->current->ino)) {
1069                 if (dst_node->current) {
1070                         dst_node->current->checked = 1;
1071                         maybe_free_inode_rec(dst, dst_node->current);
1072                 }
1073                 dst_node->current = get_inode_rec(dst, current_ino, 1);
1074         }
1075         return 0;
1076 }
1077
1078 static void free_inode_ptr(struct cache_extent *cache)
1079 {
1080         struct ptr_node *node;
1081         struct inode_record *rec;
1082
1083         node = container_of(cache, struct ptr_node, cache);
1084         rec = node->data;
1085         free_inode_rec(rec);
1086         free(node);
1087 }
1088
1089 FREE_EXTENT_CACHE_BASED_TREE(inode_recs, free_inode_ptr);
1090
1091 static struct shared_node *find_shared_node(struct cache_tree *shared,
1092                                             u64 bytenr)
1093 {
1094         struct cache_extent *cache;
1095         struct shared_node *node;
1096
1097         cache = lookup_cache_extent(shared, bytenr, 1);
1098         if (cache) {
1099                 node = container_of(cache, struct shared_node, cache);
1100                 return node;
1101         }
1102         return NULL;
1103 }
1104
1105 static int add_shared_node(struct cache_tree *shared, u64 bytenr, u32 refs)
1106 {
1107         int ret;
1108         struct shared_node *node;
1109
1110         node = calloc(1, sizeof(*node));
1111         node->cache.start = bytenr;
1112         node->cache.size = 1;
1113         cache_tree_init(&node->root_cache);
1114         cache_tree_init(&node->inode_cache);
1115         node->refs = refs;
1116
1117         ret = insert_cache_extent(shared, &node->cache);
1118         BUG_ON(ret);
1119         return 0;
1120 }
1121
1122 static int enter_shared_node(struct btrfs_root *root, u64 bytenr, u32 refs,
1123                              struct walk_control *wc, int level)
1124 {
1125         struct shared_node *node;
1126         struct shared_node *dest;
1127
1128         if (level == wc->active_node)
1129                 return 0;
1130
1131         BUG_ON(wc->active_node <= level);
1132         node = find_shared_node(&wc->shared, bytenr);
1133         if (!node) {
1134                 add_shared_node(&wc->shared, bytenr, refs);
1135                 node = find_shared_node(&wc->shared, bytenr);
1136                 wc->nodes[level] = node;
1137                 wc->active_node = level;
1138                 return 0;
1139         }
1140
1141         if (wc->root_level == wc->active_node &&
1142             btrfs_root_refs(&root->root_item) == 0) {
1143                 if (--node->refs == 0) {
1144                         free_inode_recs_tree(&node->root_cache);
1145                         free_inode_recs_tree(&node->inode_cache);
1146                         remove_cache_extent(&wc->shared, &node->cache);
1147                         free(node);
1148                 }
1149                 return 1;
1150         }
1151
1152         dest = wc->nodes[wc->active_node];
1153         splice_shared_node(node, dest);
1154         if (node->refs == 0) {
1155                 remove_cache_extent(&wc->shared, &node->cache);
1156                 free(node);
1157         }
1158         return 1;
1159 }
1160
1161 static int leave_shared_node(struct btrfs_root *root,
1162                              struct walk_control *wc, int level)
1163 {
1164         struct shared_node *node;
1165         struct shared_node *dest;
1166         int i;
1167
1168         if (level == wc->root_level)
1169                 return 0;
1170
1171         for (i = level + 1; i < BTRFS_MAX_LEVEL; i++) {
1172                 if (wc->nodes[i])
1173                         break;
1174         }
1175         BUG_ON(i >= BTRFS_MAX_LEVEL);
1176
1177         node = wc->nodes[wc->active_node];
1178         wc->nodes[wc->active_node] = NULL;
1179         wc->active_node = i;
1180
1181         dest = wc->nodes[wc->active_node];
1182         if (wc->active_node < wc->root_level ||
1183             btrfs_root_refs(&root->root_item) > 0) {
1184                 BUG_ON(node->refs <= 1);
1185                 splice_shared_node(node, dest);
1186         } else {
1187                 BUG_ON(node->refs < 2);
1188                 node->refs--;
1189         }
1190         return 0;
1191 }
1192
1193 /*
1194  * Returns:
1195  * < 0 - on error
1196  * 1   - if the root with id child_root_id is a child of root parent_root_id
1197  * 0   - if the root child_root_id isn't a child of the root parent_root_id but
1198  *       has other root(s) as parent(s)
1199  * 2   - if the root child_root_id doesn't have any parent roots
1200  */
1201 static int is_child_root(struct btrfs_root *root, u64 parent_root_id,
1202                          u64 child_root_id)
1203 {
1204         struct btrfs_path path;
1205         struct btrfs_key key;
1206         struct extent_buffer *leaf;
1207         int has_parent = 0;
1208         int ret;
1209
1210         btrfs_init_path(&path);
1211
1212         key.objectid = parent_root_id;
1213         key.type = BTRFS_ROOT_REF_KEY;
1214         key.offset = child_root_id;
1215         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
1216                                 0, 0);
1217         if (ret < 0)
1218                 return ret;
1219         btrfs_release_path(&path);
1220         if (!ret)
1221                 return 1;
1222
1223         key.objectid = child_root_id;
1224         key.type = BTRFS_ROOT_BACKREF_KEY;
1225         key.offset = 0;
1226         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
1227                                 0, 0);
1228         if (ret < 0)
1229                 goto out;
1230
1231         while (1) {
1232                 leaf = path.nodes[0];
1233                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
1234                         ret = btrfs_next_leaf(root->fs_info->tree_root, &path);
1235                         if (ret)
1236                                 break;
1237                         leaf = path.nodes[0];
1238                 }
1239
1240                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1241                 if (key.objectid != child_root_id ||
1242                     key.type != BTRFS_ROOT_BACKREF_KEY)
1243                         break;
1244
1245                 has_parent = 1;
1246
1247                 if (key.offset == parent_root_id) {
1248                         btrfs_release_path(&path);
1249                         return 1;
1250                 }
1251
1252                 path.slots[0]++;
1253         }
1254 out:
1255         btrfs_release_path(&path);
1256         if (ret < 0)
1257                 return ret;
1258         return has_parent ? 0 : 2;
1259 }
1260
1261 static int process_dir_item(struct btrfs_root *root,
1262                             struct extent_buffer *eb,
1263                             int slot, struct btrfs_key *key,
1264                             struct shared_node *active_node)
1265 {
1266         u32 total;
1267         u32 cur = 0;
1268         u32 len;
1269         u32 name_len;
1270         u32 data_len;
1271         int error;
1272         int nritems = 0;
1273         int filetype;
1274         struct btrfs_dir_item *di;
1275         struct inode_record *rec;
1276         struct cache_tree *root_cache;
1277         struct cache_tree *inode_cache;
1278         struct btrfs_key location;
1279         char namebuf[BTRFS_NAME_LEN];
1280
1281         root_cache = &active_node->root_cache;
1282         inode_cache = &active_node->inode_cache;
1283         rec = active_node->current;
1284         rec->found_dir_item = 1;
1285
1286         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1287         total = btrfs_item_size_nr(eb, slot);
1288         while (cur < total) {
1289                 nritems++;
1290                 btrfs_dir_item_key_to_cpu(eb, di, &location);
1291                 name_len = btrfs_dir_name_len(eb, di);
1292                 data_len = btrfs_dir_data_len(eb, di);
1293                 filetype = btrfs_dir_type(eb, di);
1294
1295                 rec->found_size += name_len;
1296                 if (name_len <= BTRFS_NAME_LEN) {
1297                         len = name_len;
1298                         error = 0;
1299                 } else {
1300                         len = BTRFS_NAME_LEN;
1301                         error = REF_ERR_NAME_TOO_LONG;
1302                 }
1303                 read_extent_buffer(eb, namebuf, (unsigned long)(di + 1), len);
1304
1305                 if (location.type == BTRFS_INODE_ITEM_KEY) {
1306                         add_inode_backref(inode_cache, location.objectid,
1307                                           key->objectid, key->offset, namebuf,
1308                                           len, filetype, key->type, error);
1309                 } else if (location.type == BTRFS_ROOT_ITEM_KEY) {
1310                         add_inode_backref(root_cache, location.objectid,
1311                                           key->objectid, key->offset,
1312                                           namebuf, len, filetype,
1313                                           key->type, error);
1314                 } else {
1315                         fprintf(stderr, "invalid location in dir item %u\n",
1316                                 location.type);
1317                         add_inode_backref(inode_cache, BTRFS_MULTIPLE_OBJECTIDS,
1318                                           key->objectid, key->offset, namebuf,
1319                                           len, filetype, key->type, error);
1320                 }
1321
1322                 len = sizeof(*di) + name_len + data_len;
1323                 di = (struct btrfs_dir_item *)((char *)di + len);
1324                 cur += len;
1325         }
1326         if (key->type == BTRFS_DIR_INDEX_KEY && nritems > 1)
1327                 rec->errors |= I_ERR_DUP_DIR_INDEX;
1328
1329         return 0;
1330 }
1331
1332 static int process_inode_ref(struct extent_buffer *eb,
1333                              int slot, struct btrfs_key *key,
1334                              struct shared_node *active_node)
1335 {
1336         u32 total;
1337         u32 cur = 0;
1338         u32 len;
1339         u32 name_len;
1340         u64 index;
1341         int error;
1342         struct cache_tree *inode_cache;
1343         struct btrfs_inode_ref *ref;
1344         char namebuf[BTRFS_NAME_LEN];
1345
1346         inode_cache = &active_node->inode_cache;
1347
1348         ref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
1349         total = btrfs_item_size_nr(eb, slot);
1350         while (cur < total) {
1351                 name_len = btrfs_inode_ref_name_len(eb, ref);
1352                 index = btrfs_inode_ref_index(eb, ref);
1353                 if (name_len <= BTRFS_NAME_LEN) {
1354                         len = name_len;
1355                         error = 0;
1356                 } else {
1357                         len = BTRFS_NAME_LEN;
1358                         error = REF_ERR_NAME_TOO_LONG;
1359                 }
1360                 read_extent_buffer(eb, namebuf, (unsigned long)(ref + 1), len);
1361                 add_inode_backref(inode_cache, key->objectid, key->offset,
1362                                   index, namebuf, len, 0, key->type, error);
1363
1364                 len = sizeof(*ref) + name_len;
1365                 ref = (struct btrfs_inode_ref *)((char *)ref + len);
1366                 cur += len;
1367         }
1368         return 0;
1369 }
1370
1371 static int process_inode_extref(struct extent_buffer *eb,
1372                                 int slot, struct btrfs_key *key,
1373                                 struct shared_node *active_node)
1374 {
1375         u32 total;
1376         u32 cur = 0;
1377         u32 len;
1378         u32 name_len;
1379         u64 index;
1380         u64 parent;
1381         int error;
1382         struct cache_tree *inode_cache;
1383         struct btrfs_inode_extref *extref;
1384         char namebuf[BTRFS_NAME_LEN];
1385
1386         inode_cache = &active_node->inode_cache;
1387
1388         extref = btrfs_item_ptr(eb, slot, struct btrfs_inode_extref);
1389         total = btrfs_item_size_nr(eb, slot);
1390         while (cur < total) {
1391                 name_len = btrfs_inode_extref_name_len(eb, extref);
1392                 index = btrfs_inode_extref_index(eb, extref);
1393                 parent = btrfs_inode_extref_parent(eb, extref);
1394                 if (name_len <= BTRFS_NAME_LEN) {
1395                         len = name_len;
1396                         error = 0;
1397                 } else {
1398                         len = BTRFS_NAME_LEN;
1399                         error = REF_ERR_NAME_TOO_LONG;
1400                 }
1401                 read_extent_buffer(eb, namebuf,
1402                                    (unsigned long)(extref + 1), len);
1403                 add_inode_backref(inode_cache, key->objectid, parent,
1404                                   index, namebuf, len, 0, key->type, error);
1405
1406                 len = sizeof(*extref) + name_len;
1407                 extref = (struct btrfs_inode_extref *)((char *)extref + len);
1408                 cur += len;
1409         }
1410         return 0;
1411
1412 }
1413
1414 static int count_csum_range(struct btrfs_root *root, u64 start,
1415                             u64 len, u64 *found)
1416 {
1417         struct btrfs_key key;
1418         struct btrfs_path path;
1419         struct extent_buffer *leaf;
1420         int ret;
1421         size_t size;
1422         *found = 0;
1423         u64 csum_end;
1424         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1425
1426         btrfs_init_path(&path);
1427
1428         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1429         key.offset = start;
1430         key.type = BTRFS_EXTENT_CSUM_KEY;
1431
1432         ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
1433                                 &key, &path, 0, 0);
1434         if (ret < 0)
1435                 goto out;
1436         if (ret > 0 && path.slots[0] > 0) {
1437                 leaf = path.nodes[0];
1438                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0] - 1);
1439                 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
1440                     key.type == BTRFS_EXTENT_CSUM_KEY)
1441                         path.slots[0]--;
1442         }
1443
1444         while (len > 0) {
1445                 leaf = path.nodes[0];
1446                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
1447                         ret = btrfs_next_leaf(root->fs_info->csum_root, &path);
1448                         if (ret > 0)
1449                                 break;
1450                         else if (ret < 0)
1451                                 goto out;
1452                         leaf = path.nodes[0];
1453                 }
1454
1455                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1456                 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
1457                     key.type != BTRFS_EXTENT_CSUM_KEY)
1458                         break;
1459
1460                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1461                 if (key.offset >= start + len)
1462                         break;
1463
1464                 if (key.offset > start)
1465                         start = key.offset;
1466
1467                 size = btrfs_item_size_nr(leaf, path.slots[0]);
1468                 csum_end = key.offset + (size / csum_size) * root->sectorsize;
1469                 if (csum_end > start) {
1470                         size = min(csum_end - start, len);
1471                         len -= size;
1472                         start += size;
1473                         *found += size;
1474                 }
1475
1476                 path.slots[0]++;
1477         }
1478 out:
1479         btrfs_release_path(&path);
1480         if (ret < 0)
1481                 return ret;
1482         return 0;
1483 }
1484
1485 static int process_file_extent(struct btrfs_root *root,
1486                                 struct extent_buffer *eb,
1487                                 int slot, struct btrfs_key *key,
1488                                 struct shared_node *active_node)
1489 {
1490         struct inode_record *rec;
1491         struct btrfs_file_extent_item *fi;
1492         u64 num_bytes = 0;
1493         u64 disk_bytenr = 0;
1494         u64 extent_offset = 0;
1495         u64 mask = root->sectorsize - 1;
1496         int extent_type;
1497         int ret;
1498
1499         rec = active_node->current;
1500         BUG_ON(rec->ino != key->objectid || rec->refs > 1);
1501         rec->found_file_extent = 1;
1502
1503         if (rec->extent_start == (u64)-1) {
1504                 rec->extent_start = key->offset;
1505                 rec->extent_end = key->offset;
1506         }
1507
1508         if (rec->extent_end > key->offset)
1509                 rec->errors |= I_ERR_FILE_EXTENT_OVERLAP;
1510         else if (rec->extent_end < key->offset) {
1511                 ret = add_file_extent_hole(&rec->holes, rec->extent_end,
1512                                            key->offset - rec->extent_end);
1513                 if (ret < 0)
1514                         return ret;
1515         }
1516
1517         fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
1518         extent_type = btrfs_file_extent_type(eb, fi);
1519
1520         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1521                 num_bytes = btrfs_file_extent_inline_len(eb, slot, fi);
1522                 if (num_bytes == 0)
1523                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1524                 rec->found_size += num_bytes;
1525                 num_bytes = (num_bytes + mask) & ~mask;
1526         } else if (extent_type == BTRFS_FILE_EXTENT_REG ||
1527                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1528                 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1529                 disk_bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1530                 extent_offset = btrfs_file_extent_offset(eb, fi);
1531                 if (num_bytes == 0 || (num_bytes & mask))
1532                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1533                 if (num_bytes + extent_offset >
1534                     btrfs_file_extent_ram_bytes(eb, fi))
1535                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1536                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
1537                     (btrfs_file_extent_compression(eb, fi) ||
1538                      btrfs_file_extent_encryption(eb, fi) ||
1539                      btrfs_file_extent_other_encoding(eb, fi)))
1540                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1541                 if (disk_bytenr > 0)
1542                         rec->found_size += num_bytes;
1543         } else {
1544                 rec->errors |= I_ERR_BAD_FILE_EXTENT;
1545         }
1546         rec->extent_end = key->offset + num_bytes;
1547
1548         /*
1549          * The data reloc tree will copy full extents into its inode and then
1550          * copy the corresponding csums.  Because the extent it copied could be
1551          * a preallocated extent that hasn't been written to yet there may be no
1552          * csums to copy, ergo we won't have csums for our file extent.  This is
1553          * ok so just don't bother checking csums if the inode belongs to the
1554          * data reloc tree.
1555          */
1556         if (disk_bytenr > 0 &&
1557             btrfs_header_owner(eb) != BTRFS_DATA_RELOC_TREE_OBJECTID) {
1558                 u64 found;
1559                 if (btrfs_file_extent_compression(eb, fi))
1560                         num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1561                 else
1562                         disk_bytenr += extent_offset;
1563
1564                 ret = count_csum_range(root, disk_bytenr, num_bytes, &found);
1565                 if (ret < 0)
1566                         return ret;
1567                 if (extent_type == BTRFS_FILE_EXTENT_REG) {
1568                         if (found > 0)
1569                                 rec->found_csum_item = 1;
1570                         if (found < num_bytes)
1571                                 rec->some_csum_missing = 1;
1572                 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1573                         if (found > 0)
1574                                 rec->errors |= I_ERR_ODD_CSUM_ITEM;
1575                 }
1576         }
1577         return 0;
1578 }
1579
1580 static int process_one_leaf(struct btrfs_root *root, struct extent_buffer *eb,
1581                             struct walk_control *wc)
1582 {
1583         struct btrfs_key key;
1584         u32 nritems;
1585         int i;
1586         int ret = 0;
1587         struct cache_tree *inode_cache;
1588         struct shared_node *active_node;
1589
1590         if (wc->root_level == wc->active_node &&
1591             btrfs_root_refs(&root->root_item) == 0)
1592                 return 0;
1593
1594         active_node = wc->nodes[wc->active_node];
1595         inode_cache = &active_node->inode_cache;
1596         nritems = btrfs_header_nritems(eb);
1597         for (i = 0; i < nritems; i++) {
1598                 btrfs_item_key_to_cpu(eb, &key, i);
1599
1600                 if (key.objectid == BTRFS_FREE_SPACE_OBJECTID)
1601                         continue;
1602                 if (key.type == BTRFS_ORPHAN_ITEM_KEY)
1603                         continue;
1604
1605                 if (active_node->current == NULL ||
1606                     active_node->current->ino < key.objectid) {
1607                         if (active_node->current) {
1608                                 active_node->current->checked = 1;
1609                                 maybe_free_inode_rec(inode_cache,
1610                                                      active_node->current);
1611                         }
1612                         active_node->current = get_inode_rec(inode_cache,
1613                                                              key.objectid, 1);
1614                 }
1615                 switch (key.type) {
1616                 case BTRFS_DIR_ITEM_KEY:
1617                 case BTRFS_DIR_INDEX_KEY:
1618                         ret = process_dir_item(root, eb, i, &key, active_node);
1619                         break;
1620                 case BTRFS_INODE_REF_KEY:
1621                         ret = process_inode_ref(eb, i, &key, active_node);
1622                         break;
1623                 case BTRFS_INODE_EXTREF_KEY:
1624                         ret = process_inode_extref(eb, i, &key, active_node);
1625                         break;
1626                 case BTRFS_INODE_ITEM_KEY:
1627                         ret = process_inode_item(eb, i, &key, active_node);
1628                         break;
1629                 case BTRFS_EXTENT_DATA_KEY:
1630                         ret = process_file_extent(root, eb, i, &key,
1631                                                   active_node);
1632                         break;
1633                 default:
1634                         break;
1635                 };
1636         }
1637         return ret;
1638 }
1639
1640 static void reada_walk_down(struct btrfs_root *root,
1641                             struct extent_buffer *node, int slot)
1642 {
1643         u64 bytenr;
1644         u64 ptr_gen;
1645         u32 nritems;
1646         u32 blocksize;
1647         int i;
1648         int level;
1649
1650         level = btrfs_header_level(node);
1651         if (level != 1)
1652                 return;
1653
1654         nritems = btrfs_header_nritems(node);
1655         blocksize = btrfs_level_size(root, level - 1);
1656         for (i = slot; i < nritems; i++) {
1657                 bytenr = btrfs_node_blockptr(node, i);
1658                 ptr_gen = btrfs_node_ptr_generation(node, i);
1659                 readahead_tree_block(root, bytenr, blocksize, ptr_gen);
1660         }
1661 }
1662
1663 /*
1664  * Check the child node/leaf by the following condition:
1665  * 1. the first item key of the node/leaf should be the same with the one
1666  *    in parent.
1667  * 2. block in parent node should match the child node/leaf.
1668  * 3. generation of parent node and child's header should be consistent.
1669  *
1670  * Or the child node/leaf pointed by the key in parent is not valid.
1671  *
1672  * We hope to check leaf owner too, but since subvol may share leaves,
1673  * which makes leaf owner check not so strong, key check should be
1674  * sufficient enough for that case.
1675  */
1676 static int check_child_node(struct btrfs_root *root,
1677                             struct extent_buffer *parent, int slot,
1678                             struct extent_buffer *child)
1679 {
1680         struct btrfs_key parent_key;
1681         struct btrfs_key child_key;
1682         int ret = 0;
1683
1684         btrfs_node_key_to_cpu(parent, &parent_key, slot);
1685         if (btrfs_header_level(child) == 0)
1686                 btrfs_item_key_to_cpu(child, &child_key, 0);
1687         else
1688                 btrfs_node_key_to_cpu(child, &child_key, 0);
1689
1690         if (memcmp(&parent_key, &child_key, sizeof(parent_key))) {
1691                 ret = -EINVAL;
1692                 fprintf(stderr,
1693                         "Wrong key of child node/leaf, wanted: (%llu, %u, %llu), have: (%llu, %u, %llu)\n",
1694                         parent_key.objectid, parent_key.type, parent_key.offset,
1695                         child_key.objectid, child_key.type, child_key.offset);
1696         }
1697         if (btrfs_header_bytenr(child) != btrfs_node_blockptr(parent, slot)) {
1698                 ret = -EINVAL;
1699                 fprintf(stderr, "Wrong block of child node/leaf, wanted: %llu, have: %llu\n",
1700                         btrfs_node_blockptr(parent, slot),
1701                         btrfs_header_bytenr(child));
1702         }
1703         if (btrfs_node_ptr_generation(parent, slot) !=
1704             btrfs_header_generation(child)) {
1705                 ret = -EINVAL;
1706                 fprintf(stderr, "Wrong generation of child node/leaf, wanted: %llu, have: %llu\n",
1707                         btrfs_header_generation(child),
1708                         btrfs_node_ptr_generation(parent, slot));
1709         }
1710         return ret;
1711 }
1712
1713 static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
1714                           struct walk_control *wc, int *level)
1715 {
1716         enum btrfs_tree_block_status status;
1717         u64 bytenr;
1718         u64 ptr_gen;
1719         struct extent_buffer *next;
1720         struct extent_buffer *cur;
1721         u32 blocksize;
1722         int ret, err = 0;
1723         u64 refs;
1724
1725         WARN_ON(*level < 0);
1726         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1727         ret = btrfs_lookup_extent_info(NULL, root,
1728                                        path->nodes[*level]->start,
1729                                        *level, 1, &refs, NULL);
1730         if (ret < 0) {
1731                 err = ret;
1732                 goto out;
1733         }
1734
1735         if (refs > 1) {
1736                 ret = enter_shared_node(root, path->nodes[*level]->start,
1737                                         refs, wc, *level);
1738                 if (ret > 0) {
1739                         err = ret;
1740                         goto out;
1741                 }
1742         }
1743
1744         while (*level >= 0) {
1745                 WARN_ON(*level < 0);
1746                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1747                 cur = path->nodes[*level];
1748
1749                 if (btrfs_header_level(cur) != *level)
1750                         WARN_ON(1);
1751
1752                 if (path->slots[*level] >= btrfs_header_nritems(cur))
1753                         break;
1754                 if (*level == 0) {
1755                         ret = process_one_leaf(root, cur, wc);
1756                         if (ret < 0)
1757                                 err = ret;
1758                         break;
1759                 }
1760                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1761                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1762                 blocksize = btrfs_level_size(root, *level - 1);
1763                 ret = btrfs_lookup_extent_info(NULL, root, bytenr, *level - 1,
1764                                                1, &refs, NULL);
1765                 if (ret < 0)
1766                         refs = 0;
1767
1768                 if (refs > 1) {
1769                         ret = enter_shared_node(root, bytenr, refs,
1770                                                 wc, *level - 1);
1771                         if (ret > 0) {
1772                                 path->slots[*level]++;
1773                                 continue;
1774                         }
1775                 }
1776
1777                 next = btrfs_find_tree_block(root, bytenr, blocksize);
1778                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
1779                         free_extent_buffer(next);
1780                         reada_walk_down(root, cur, path->slots[*level]);
1781                         next = read_tree_block(root, bytenr, blocksize,
1782                                                ptr_gen);
1783                         if (!extent_buffer_uptodate(next)) {
1784                                 struct btrfs_key node_key;
1785
1786                                 btrfs_node_key_to_cpu(path->nodes[*level],
1787                                                       &node_key,
1788                                                       path->slots[*level]);
1789                                 btrfs_add_corrupt_extent_record(root->fs_info,
1790                                                 &node_key,
1791                                                 path->nodes[*level]->start,
1792                                                 root->leafsize, *level);
1793                                 err = -EIO;
1794                                 goto out;
1795                         }
1796                 }
1797
1798                 ret = check_child_node(root, cur, path->slots[*level], next);
1799                 if (ret) {
1800                         err = ret;
1801                         goto out;
1802                 }
1803
1804                 if (btrfs_is_leaf(next))
1805                         status = btrfs_check_leaf(root, NULL, next);
1806                 else
1807                         status = btrfs_check_node(root, NULL, next);
1808                 if (status != BTRFS_TREE_BLOCK_CLEAN) {
1809                         free_extent_buffer(next);
1810                         err = -EIO;
1811                         goto out;
1812                 }
1813
1814                 *level = *level - 1;
1815                 free_extent_buffer(path->nodes[*level]);
1816                 path->nodes[*level] = next;
1817                 path->slots[*level] = 0;
1818         }
1819 out:
1820         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1821         return err;
1822 }
1823
1824 static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
1825                         struct walk_control *wc, int *level)
1826 {
1827         int i;
1828         struct extent_buffer *leaf;
1829
1830         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1831                 leaf = path->nodes[i];
1832                 if (path->slots[i] + 1 < btrfs_header_nritems(leaf)) {
1833                         path->slots[i]++;
1834                         *level = i;
1835                         return 0;
1836                 } else {
1837                         free_extent_buffer(path->nodes[*level]);
1838                         path->nodes[*level] = NULL;
1839                         BUG_ON(*level > wc->active_node);
1840                         if (*level == wc->active_node)
1841                                 leave_shared_node(root, wc, *level);
1842                         *level = i + 1;
1843                 }
1844         }
1845         return 1;
1846 }
1847
1848 static int check_root_dir(struct inode_record *rec)
1849 {
1850         struct inode_backref *backref;
1851         int ret = -1;
1852
1853         if (!rec->found_inode_item || rec->errors)
1854                 goto out;
1855         if (rec->nlink != 1 || rec->found_link != 0)
1856                 goto out;
1857         if (list_empty(&rec->backrefs))
1858                 goto out;
1859         backref = list_entry(rec->backrefs.next, struct inode_backref, list);
1860         if (!backref->found_inode_ref)
1861                 goto out;
1862         if (backref->index != 0 || backref->namelen != 2 ||
1863             memcmp(backref->name, "..", 2))
1864                 goto out;
1865         if (backref->found_dir_index || backref->found_dir_item)
1866                 goto out;
1867         ret = 0;
1868 out:
1869         return ret;
1870 }
1871
1872 static int repair_inode_isize(struct btrfs_trans_handle *trans,
1873                               struct btrfs_root *root, struct btrfs_path *path,
1874                               struct inode_record *rec)
1875 {
1876         struct btrfs_inode_item *ei;
1877         struct btrfs_key key;
1878         int ret;
1879
1880         key.objectid = rec->ino;
1881         key.type = BTRFS_INODE_ITEM_KEY;
1882         key.offset = (u64)-1;
1883
1884         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1885         if (ret < 0)
1886                 goto out;
1887         if (ret) {
1888                 if (!path->slots[0]) {
1889                         ret = -ENOENT;
1890                         goto out;
1891                 }
1892                 path->slots[0]--;
1893                 ret = 0;
1894         }
1895         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1896         if (key.objectid != rec->ino) {
1897                 ret = -ENOENT;
1898                 goto out;
1899         }
1900
1901         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1902                             struct btrfs_inode_item);
1903         btrfs_set_inode_size(path->nodes[0], ei, rec->found_size);
1904         btrfs_mark_buffer_dirty(path->nodes[0]);
1905         rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
1906         printf("reset isize for dir %Lu root %Lu\n", rec->ino,
1907                root->root_key.objectid);
1908 out:
1909         btrfs_release_path(path);
1910         return ret;
1911 }
1912
1913 static int repair_inode_orphan_item(struct btrfs_trans_handle *trans,
1914                                     struct btrfs_root *root,
1915                                     struct btrfs_path *path,
1916                                     struct inode_record *rec)
1917 {
1918         int ret;
1919
1920         ret = btrfs_add_orphan_item(trans, root, path, rec->ino);
1921         btrfs_release_path(path);
1922         if (!ret)
1923                 rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
1924         return ret;
1925 }
1926
1927 static int repair_inode_nbytes(struct btrfs_trans_handle *trans,
1928                                struct btrfs_root *root,
1929                                struct btrfs_path *path,
1930                                struct inode_record *rec)
1931 {
1932         struct btrfs_inode_item *ei;
1933         struct btrfs_key key;
1934         int ret = 0;
1935
1936         key.objectid = rec->ino;
1937         key.type = BTRFS_INODE_ITEM_KEY;
1938         key.offset = 0;
1939
1940         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1941         if (ret) {
1942                 if (ret > 0)
1943                         ret = -ENOENT;
1944                 goto out;
1945         }
1946
1947         /* Since ret == 0, no need to check anything */
1948         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1949                             struct btrfs_inode_item);
1950         btrfs_set_inode_nbytes(path->nodes[0], ei, rec->found_size);
1951         btrfs_mark_buffer_dirty(path->nodes[0]);
1952         rec->errors &= ~I_ERR_FILE_NBYTES_WRONG;
1953         printf("reset nbytes for ino %llu root %llu\n",
1954                rec->ino, root->root_key.objectid);
1955 out:
1956         btrfs_release_path(path);
1957         return ret;
1958 }
1959
1960 static int add_missing_dir_index(struct btrfs_root *root,
1961                                  struct cache_tree *inode_cache,
1962                                  struct inode_record *rec,
1963                                  struct inode_backref *backref)
1964 {
1965         struct btrfs_path *path;
1966         struct btrfs_trans_handle *trans;
1967         struct btrfs_dir_item *dir_item;
1968         struct extent_buffer *leaf;
1969         struct btrfs_key key;
1970         struct btrfs_disk_key disk_key;
1971         struct inode_record *dir_rec;
1972         unsigned long name_ptr;
1973         u32 data_size = sizeof(*dir_item) + backref->namelen;
1974         int ret;
1975
1976         path = btrfs_alloc_path();
1977         if (!path)
1978                 return -ENOMEM;
1979
1980         trans = btrfs_start_transaction(root, 1);
1981         if (IS_ERR(trans)) {
1982                 btrfs_free_path(path);
1983                 return PTR_ERR(trans);
1984         }
1985
1986         fprintf(stderr, "repairing missing dir index item for inode %llu\n",
1987                 (unsigned long long)rec->ino);
1988         key.objectid = backref->dir;
1989         key.type = BTRFS_DIR_INDEX_KEY;
1990         key.offset = backref->index;
1991
1992         ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
1993         BUG_ON(ret);
1994
1995         leaf = path->nodes[0];
1996         dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
1997
1998         disk_key.objectid = cpu_to_le64(rec->ino);
1999         disk_key.type = BTRFS_INODE_ITEM_KEY;
2000         disk_key.offset = 0;
2001
2002         btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
2003         btrfs_set_dir_type(leaf, dir_item, imode_to_type(rec->imode));
2004         btrfs_set_dir_data_len(leaf, dir_item, 0);
2005         btrfs_set_dir_name_len(leaf, dir_item, backref->namelen);
2006         name_ptr = (unsigned long)(dir_item + 1);
2007         write_extent_buffer(leaf, backref->name, name_ptr, backref->namelen);
2008         btrfs_mark_buffer_dirty(leaf);
2009         btrfs_free_path(path);
2010         btrfs_commit_transaction(trans, root);
2011
2012         backref->found_dir_index = 1;
2013         dir_rec = get_inode_rec(inode_cache, backref->dir, 0);
2014         if (!dir_rec)
2015                 return 0;
2016         dir_rec->found_size += backref->namelen;
2017         if (dir_rec->found_size == dir_rec->isize &&
2018             (dir_rec->errors & I_ERR_DIR_ISIZE_WRONG))
2019                 dir_rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
2020         if (dir_rec->found_size != dir_rec->isize)
2021                 dir_rec->errors |= I_ERR_DIR_ISIZE_WRONG;
2022
2023         return 0;
2024 }
2025
2026 static int delete_dir_index(struct btrfs_root *root,
2027                             struct cache_tree *inode_cache,
2028                             struct inode_record *rec,
2029                             struct inode_backref *backref)
2030 {
2031         struct btrfs_trans_handle *trans;
2032         struct btrfs_dir_item *di;
2033         struct btrfs_path *path;
2034         int ret = 0;
2035
2036         path = btrfs_alloc_path();
2037         if (!path)
2038                 return -ENOMEM;
2039
2040         trans = btrfs_start_transaction(root, 1);
2041         if (IS_ERR(trans)) {
2042                 btrfs_free_path(path);
2043                 return PTR_ERR(trans);
2044         }
2045
2046
2047         fprintf(stderr, "Deleting bad dir index [%llu,%u,%llu] root %llu\n",
2048                 (unsigned long long)backref->dir,
2049                 BTRFS_DIR_INDEX_KEY, (unsigned long long)backref->index,
2050                 (unsigned long long)root->objectid);
2051
2052         di = btrfs_lookup_dir_index(trans, root, path, backref->dir,
2053                                     backref->name, backref->namelen,
2054                                     backref->index, -1);
2055         if (IS_ERR(di)) {
2056                 ret = PTR_ERR(di);
2057                 btrfs_free_path(path);
2058                 btrfs_commit_transaction(trans, root);
2059                 if (ret == -ENOENT)
2060                         return 0;
2061                 return ret;
2062         }
2063
2064         if (!di)
2065                 ret = btrfs_del_item(trans, root, path);
2066         else
2067                 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2068         BUG_ON(ret);
2069         btrfs_free_path(path);
2070         btrfs_commit_transaction(trans, root);
2071         return ret;
2072 }
2073
2074 static int create_inode_item(struct btrfs_root *root,
2075                              struct inode_record *rec,
2076                              struct inode_backref *backref, int root_dir)
2077 {
2078         struct btrfs_trans_handle *trans;
2079         struct btrfs_inode_item inode_item;
2080         time_t now = time(NULL);
2081         int ret;
2082
2083         trans = btrfs_start_transaction(root, 1);
2084         if (IS_ERR(trans)) {
2085                 ret = PTR_ERR(trans);
2086                 return ret;
2087         }
2088
2089         fprintf(stderr, "root %llu inode %llu recreating inode item, this may "
2090                 "be incomplete, please check permissions and content after "
2091                 "the fsck completes.\n", (unsigned long long)root->objectid,
2092                 (unsigned long long)rec->ino);
2093
2094         memset(&inode_item, 0, sizeof(inode_item));
2095         btrfs_set_stack_inode_generation(&inode_item, trans->transid);
2096         if (root_dir)
2097                 btrfs_set_stack_inode_nlink(&inode_item, 1);
2098         else
2099                 btrfs_set_stack_inode_nlink(&inode_item, rec->found_link);
2100         btrfs_set_stack_inode_nbytes(&inode_item, rec->found_size);
2101         if (rec->found_dir_item) {
2102                 if (rec->found_file_extent)
2103                         fprintf(stderr, "root %llu inode %llu has both a dir "
2104                                 "item and extents, unsure if it is a dir or a "
2105                                 "regular file so setting it as a directory\n",
2106                                 (unsigned long long)root->objectid,
2107                                 (unsigned long long)rec->ino);
2108                 btrfs_set_stack_inode_mode(&inode_item, S_IFDIR | 0755);
2109                 btrfs_set_stack_inode_size(&inode_item, rec->found_size);
2110         } else if (!rec->found_dir_item) {
2111                 btrfs_set_stack_inode_size(&inode_item, rec->extent_end);
2112                 btrfs_set_stack_inode_mode(&inode_item, S_IFREG | 0755);
2113         }
2114         btrfs_set_stack_timespec_sec(&inode_item.atime, now);
2115         btrfs_set_stack_timespec_nsec(&inode_item.atime, 0);
2116         btrfs_set_stack_timespec_sec(&inode_item.ctime, now);
2117         btrfs_set_stack_timespec_nsec(&inode_item.ctime, 0);
2118         btrfs_set_stack_timespec_sec(&inode_item.mtime, now);
2119         btrfs_set_stack_timespec_nsec(&inode_item.mtime, 0);
2120         btrfs_set_stack_timespec_sec(&inode_item.otime, 0);
2121         btrfs_set_stack_timespec_nsec(&inode_item.otime, 0);
2122
2123         ret = btrfs_insert_inode(trans, root, rec->ino, &inode_item);
2124         BUG_ON(ret);
2125         btrfs_commit_transaction(trans, root);
2126         return 0;
2127 }
2128
2129 static int repair_inode_backrefs(struct btrfs_root *root,
2130                                  struct inode_record *rec,
2131                                  struct cache_tree *inode_cache,
2132                                  int delete)
2133 {
2134         struct inode_backref *tmp, *backref;
2135         u64 root_dirid = btrfs_root_dirid(&root->root_item);
2136         int ret = 0;
2137         int repaired = 0;
2138
2139         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
2140                 if (!delete && rec->ino == root_dirid) {
2141                         if (!rec->found_inode_item) {
2142                                 ret = create_inode_item(root, rec, backref, 1);
2143                                 if (ret)
2144                                         break;
2145                                 repaired++;
2146                         }
2147                 }
2148
2149                 /* Index 0 for root dir's are special, don't mess with it */
2150                 if (rec->ino == root_dirid && backref->index == 0)
2151                         continue;
2152
2153                 if (delete &&
2154                     ((backref->found_dir_index && !backref->found_inode_ref) ||
2155                      (backref->found_dir_index && backref->found_inode_ref &&
2156                       (backref->errors & REF_ERR_INDEX_UNMATCH)))) {
2157                         ret = delete_dir_index(root, inode_cache, rec, backref);
2158                         if (ret)
2159                                 break;
2160                         repaired++;
2161                         list_del(&backref->list);
2162                         free(backref);
2163                 }
2164
2165                 if (!delete && !backref->found_dir_index &&
2166                     backref->found_dir_item && backref->found_inode_ref) {
2167                         ret = add_missing_dir_index(root, inode_cache, rec,
2168                                                     backref);
2169                         if (ret)
2170                                 break;
2171                         repaired++;
2172                         if (backref->found_dir_item &&
2173                             backref->found_dir_index &&
2174                             backref->found_dir_index) {
2175                                 if (!backref->errors &&
2176                                     backref->found_inode_ref) {
2177                                         list_del(&backref->list);
2178                                         free(backref);
2179                                 }
2180                         }
2181                 }
2182
2183                 if (!delete && (!backref->found_dir_index &&
2184                                 !backref->found_dir_item &&
2185                                 backref->found_inode_ref)) {
2186                         struct btrfs_trans_handle *trans;
2187                         struct btrfs_key location;
2188
2189                         ret = check_dir_conflict(root, backref->name,
2190                                                  backref->namelen,
2191                                                  backref->dir,
2192                                                  backref->index);
2193                         if (ret) {
2194                                 /*
2195                                  * let nlink fixing routine to handle it,
2196                                  * which can do it better.
2197                                  */
2198                                 ret = 0;
2199                                 break;
2200                         }
2201                         location.objectid = rec->ino;
2202                         location.type = BTRFS_INODE_ITEM_KEY;
2203                         location.offset = 0;
2204
2205                         trans = btrfs_start_transaction(root, 1);
2206                         if (IS_ERR(trans)) {
2207                                 ret = PTR_ERR(trans);
2208                                 break;
2209                         }
2210                         fprintf(stderr, "adding missing dir index/item pair "
2211                                 "for inode %llu\n",
2212                                 (unsigned long long)rec->ino);
2213                         ret = btrfs_insert_dir_item(trans, root, backref->name,
2214                                                     backref->namelen,
2215                                                     backref->dir, &location,
2216                                                     imode_to_type(rec->imode),
2217                                                     backref->index);
2218                         BUG_ON(ret);
2219                         btrfs_commit_transaction(trans, root);
2220                         repaired++;
2221                 }
2222
2223                 if (!delete && (backref->found_inode_ref &&
2224                                 backref->found_dir_index &&
2225                                 backref->found_dir_item &&
2226                                 !(backref->errors & REF_ERR_INDEX_UNMATCH) &&
2227                                 !rec->found_inode_item)) {
2228                         ret = create_inode_item(root, rec, backref, 0);
2229                         if (ret)
2230                                 break;
2231                         repaired++;
2232                 }
2233
2234         }
2235         return ret ? ret : repaired;
2236 }
2237
2238 /*
2239  * To determine the file type for nlink/inode_item repair
2240  *
2241  * Return 0 if file type is found and BTRFS_FT_* is stored into type.
2242  * Return -ENOENT if file type is not found.
2243  */
2244 static int find_file_type(struct inode_record *rec, u8 *type)
2245 {
2246         struct inode_backref *backref;
2247
2248         /* For inode item recovered case */
2249         if (rec->found_inode_item) {
2250                 *type = imode_to_type(rec->imode);
2251                 return 0;
2252         }
2253
2254         list_for_each_entry(backref, &rec->backrefs, list) {
2255                 if (backref->found_dir_index || backref->found_dir_item) {
2256                         *type = backref->filetype;
2257                         return 0;
2258                 }
2259         }
2260         return -ENOENT;
2261 }
2262
2263 /*
2264  * To determine the file name for nlink repair
2265  *
2266  * Return 0 if file name is found, set name and namelen.
2267  * Return -ENOENT if file name is not found.
2268  */
2269 static int find_file_name(struct inode_record *rec,
2270                           char *name, int *namelen)
2271 {
2272         struct inode_backref *backref;
2273
2274         list_for_each_entry(backref, &rec->backrefs, list) {
2275                 if (backref->found_dir_index || backref->found_dir_item ||
2276                     backref->found_inode_ref) {
2277                         memcpy(name, backref->name, backref->namelen);
2278                         *namelen = backref->namelen;
2279                         return 0;
2280                 }
2281         }
2282         return -ENOENT;
2283 }
2284
2285 /* Reset the nlink of the inode to the correct one */
2286 static int reset_nlink(struct btrfs_trans_handle *trans,
2287                        struct btrfs_root *root,
2288                        struct btrfs_path *path,
2289                        struct inode_record *rec)
2290 {
2291         struct inode_backref *backref;
2292         struct inode_backref *tmp;
2293         struct btrfs_key key;
2294         struct btrfs_inode_item *inode_item;
2295         int ret = 0;
2296
2297         /* We don't believe this either, reset it and iterate backref */
2298         rec->found_link = 0;
2299
2300         /* Remove all backref including the valid ones */
2301         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
2302                 ret = btrfs_unlink(trans, root, rec->ino, backref->dir,
2303                                    backref->index, backref->name,
2304                                    backref->namelen, 0);
2305                 if (ret < 0)
2306                         goto out;
2307
2308                 /* remove invalid backref, so it won't be added back */
2309                 if (!(backref->found_dir_index &&
2310                       backref->found_dir_item &&
2311                       backref->found_inode_ref)) {
2312                         list_del(&backref->list);
2313                         free(backref);
2314                 } else {
2315                         rec->found_link++;
2316                 }
2317         }
2318
2319         /* Set nlink to 0 */
2320         key.objectid = rec->ino;
2321         key.type = BTRFS_INODE_ITEM_KEY;
2322         key.offset = 0;
2323         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2324         if (ret < 0)
2325                 goto out;
2326         if (ret > 0) {
2327                 ret = -ENOENT;
2328                 goto out;
2329         }
2330         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2331                                     struct btrfs_inode_item);
2332         btrfs_set_inode_nlink(path->nodes[0], inode_item, 0);
2333         btrfs_mark_buffer_dirty(path->nodes[0]);
2334         btrfs_release_path(path);
2335
2336         /*
2337          * Add back valid inode_ref/dir_item/dir_index,
2338          * add_link() will handle the nlink inc, so new nlink must be correct
2339          */
2340         list_for_each_entry(backref, &rec->backrefs, list) {
2341                 ret = btrfs_add_link(trans, root, rec->ino, backref->dir,
2342                                      backref->name, backref->namelen,
2343                                      backref->ref_type, &backref->index, 1);
2344                 if (ret < 0)
2345                         goto out;
2346         }
2347 out:
2348         btrfs_release_path(path);
2349         return ret;
2350 }
2351
2352 static int repair_inode_nlinks(struct btrfs_trans_handle *trans,
2353                                struct btrfs_root *root,
2354                                struct btrfs_path *path,
2355                                struct inode_record *rec)
2356 {
2357         char *dir_name = "lost+found";
2358         char namebuf[BTRFS_NAME_LEN] = {0};
2359         u64 lost_found_ino;
2360         u32 mode = 0700;
2361         u8 type = 0;
2362         int namelen = 0;
2363         int name_recovered = 0;
2364         int type_recovered = 0;
2365         int ret = 0;
2366
2367         /*
2368          * Get file name and type first before these invalid inode ref
2369          * are deleted by remove_all_invalid_backref()
2370          */
2371         name_recovered = !find_file_name(rec, namebuf, &namelen);
2372         type_recovered = !find_file_type(rec, &type);
2373
2374         if (!name_recovered) {
2375                 printf("Can't get file name for inode %llu, using '%llu' as fallback\n",
2376                        rec->ino, rec->ino);
2377                 namelen = count_digits(rec->ino);
2378                 sprintf(namebuf, "%llu", rec->ino);
2379                 name_recovered = 1;
2380         }
2381         if (!type_recovered) {
2382                 printf("Can't get file type for inode %llu, using FILE as fallback\n",
2383                        rec->ino);
2384                 type = BTRFS_FT_REG_FILE;
2385                 type_recovered = 1;
2386         }
2387
2388         ret = reset_nlink(trans, root, path, rec);
2389         if (ret < 0) {
2390                 fprintf(stderr,
2391                         "Failed to reset nlink for inode %llu: %s\n",
2392                         rec->ino, strerror(-ret));
2393                 goto out;
2394         }
2395
2396         if (rec->found_link == 0) {
2397                 lost_found_ino = root->highest_inode;
2398                 if (lost_found_ino >= BTRFS_LAST_FREE_OBJECTID) {
2399                         ret = -EOVERFLOW;
2400                         goto out;
2401                 }
2402                 lost_found_ino++;
2403                 ret = btrfs_mkdir(trans, root, dir_name, strlen(dir_name),
2404                                   BTRFS_FIRST_FREE_OBJECTID, &lost_found_ino,
2405                                   mode);
2406                 if (ret < 0) {
2407                         fprintf(stderr, "Failed to create '%s' dir: %s",
2408                                 dir_name, strerror(-ret));
2409                         goto out;
2410                 }
2411                 ret = btrfs_add_link(trans, root, rec->ino, lost_found_ino,
2412                                      namebuf, namelen, type, NULL, 1);
2413                 /*
2414                  * Add ".INO" suffix several times to handle case where
2415                  * "FILENAME.INO" is already taken by another file.
2416                  */
2417                 while (ret == -EEXIST) {
2418                         /*
2419                          * Conflicting file name, add ".INO" as suffix * +1 for '.'
2420                          */
2421                         if (namelen + count_digits(rec->ino) + 1 >
2422                             BTRFS_NAME_LEN) {
2423                                 ret = -EFBIG;
2424                                 goto out;
2425                         }
2426                         snprintf(namebuf + namelen, BTRFS_NAME_LEN - namelen,
2427                                  ".%llu", rec->ino);
2428                         namelen += count_digits(rec->ino) + 1;
2429                         ret = btrfs_add_link(trans, root, rec->ino,
2430                                              lost_found_ino, namebuf,
2431                                              namelen, type, NULL, 1);
2432                 }
2433                 if (ret < 0) {
2434                         fprintf(stderr,
2435                                 "Failed to link the inode %llu to %s dir: %s",
2436                                 rec->ino, dir_name, strerror(-ret));
2437                         goto out;
2438                 }
2439                 /*
2440                  * Just increase the found_link, don't actually add the
2441                  * backref. This will make things easier and this inode
2442                  * record will be freed after the repair is done.
2443                  * So fsck will not report problem about this inode.
2444                  */
2445                 rec->found_link++;
2446                 printf("Moving file '%.*s' to '%s' dir since it has no valid backref\n",
2447                        namelen, namebuf, dir_name);
2448         }
2449         printf("Fixed the nlink of inode %llu\n", rec->ino);
2450 out:
2451         /*
2452          * Clear the flag anyway, or we will loop forever for the same inode
2453          * as it will not be removed from the bad inode list and the dead loop
2454          * happens.
2455          */
2456         rec->errors &= ~I_ERR_LINK_COUNT_WRONG;
2457         btrfs_release_path(path);
2458         return ret;
2459 }
2460
2461 /*
2462  * Check if there is any normal(reg or prealloc) file extent for given
2463  * ino.
2464  * This is used to determine the file type when neither its dir_index/item or
2465  * inode_item exists.
2466  *
2467  * This will *NOT* report error, if any error happens, just consider it does
2468  * not have any normal file extent.
2469  */
2470 static int find_normal_file_extent(struct btrfs_root *root, u64 ino)
2471 {
2472         struct btrfs_path *path;
2473         struct btrfs_key key;
2474         struct btrfs_key found_key;
2475         struct btrfs_file_extent_item *fi;
2476         u8 type;
2477         int ret = 0;
2478
2479         path = btrfs_alloc_path();
2480         if (!path)
2481                 goto out;
2482         key.objectid = ino;
2483         key.type = BTRFS_EXTENT_DATA_KEY;
2484         key.offset = 0;
2485
2486         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2487         if (ret < 0) {
2488                 ret = 0;
2489                 goto out;
2490         }
2491         if (ret && path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2492                 ret = btrfs_next_leaf(root, path);
2493                 if (ret) {
2494                         ret = 0;
2495                         goto out;
2496                 }
2497         }
2498         while (1) {
2499                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2500                                       path->slots[0]);
2501                 if (found_key.objectid != ino ||
2502                     found_key.type != BTRFS_EXTENT_DATA_KEY)
2503                         break;
2504                 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
2505                                     struct btrfs_file_extent_item);
2506                 type = btrfs_file_extent_type(path->nodes[0], fi);
2507                 if (type != BTRFS_FILE_EXTENT_INLINE) {
2508                         ret = 1;
2509                         goto out;
2510                 }
2511         }
2512 out:
2513         btrfs_free_path(path);
2514         return ret;
2515 }
2516
2517 static u32 btrfs_type_to_imode(u8 type)
2518 {
2519         static u32 imode_by_btrfs_type[] = {
2520                 [BTRFS_FT_REG_FILE]     = S_IFREG,
2521                 [BTRFS_FT_DIR]          = S_IFDIR,
2522                 [BTRFS_FT_CHRDEV]       = S_IFCHR,
2523                 [BTRFS_FT_BLKDEV]       = S_IFBLK,
2524                 [BTRFS_FT_FIFO]         = S_IFIFO,
2525                 [BTRFS_FT_SOCK]         = S_IFSOCK,
2526                 [BTRFS_FT_SYMLINK]      = S_IFLNK,
2527         };
2528
2529         return imode_by_btrfs_type[(type)];
2530 }
2531
2532 static int repair_inode_no_item(struct btrfs_trans_handle *trans,
2533                                 struct btrfs_root *root,
2534                                 struct btrfs_path *path,
2535                                 struct inode_record *rec)
2536 {
2537         u8 filetype;
2538         u32 mode = 0700;
2539         int type_recovered = 0;
2540         int ret = 0;
2541
2542         printf("Trying to rebuild inode:%llu\n", rec->ino);
2543
2544         type_recovered = !find_file_type(rec, &filetype);
2545
2546         /*
2547          * Try to determine inode type if type not found.
2548          *
2549          * For found regular file extent, it must be FILE.
2550          * For found dir_item/index, it must be DIR.
2551          *
2552          * For undetermined one, use FILE as fallback.
2553          *
2554          * TODO:
2555          * 1. If found backref(inode_index/item is already handled) to it,
2556          *    it must be DIR.
2557          *    Need new inode-inode ref structure to allow search for that.
2558          */
2559         if (!type_recovered) {
2560                 if (rec->found_file_extent &&
2561                     find_normal_file_extent(root, rec->ino)) {
2562                         type_recovered = 1;
2563                         filetype = BTRFS_FT_REG_FILE;
2564                 } else if (rec->found_dir_item) {
2565                         type_recovered = 1;
2566                         filetype = BTRFS_FT_DIR;
2567                 } else if (!list_empty(&rec->orphan_extents)) {
2568                         type_recovered = 1;
2569                         filetype = BTRFS_FT_REG_FILE;
2570                 } else{
2571                         printf("Can't determint the filetype for inode %llu, assume it is a normal file\n",
2572                                rec->ino);
2573                         type_recovered = 1;
2574                         filetype = BTRFS_FT_REG_FILE;
2575                 }
2576         }
2577
2578         ret = btrfs_new_inode(trans, root, rec->ino,
2579                               mode | btrfs_type_to_imode(filetype));
2580         if (ret < 0)
2581                 goto out;
2582
2583         /*
2584          * Here inode rebuild is done, we only rebuild the inode item,
2585          * don't repair the nlink(like move to lost+found).
2586          * That is the job of nlink repair.
2587          *
2588          * We just fill the record and return
2589          */
2590         rec->found_dir_item = 1;
2591         rec->imode = mode | btrfs_type_to_imode(filetype);
2592         rec->nlink = 0;
2593         rec->errors &= ~I_ERR_NO_INODE_ITEM;
2594         /* Ensure the inode_nlinks repair function will be called */
2595         rec->errors |= I_ERR_LINK_COUNT_WRONG;
2596 out:
2597         return ret;
2598 }
2599
2600 static int repair_inode_orphan_extent(struct btrfs_trans_handle *trans,
2601                                       struct btrfs_root *root,
2602                                       struct btrfs_path *path,
2603                                       struct inode_record *rec)
2604 {
2605         struct orphan_data_extent *orphan;
2606         struct orphan_data_extent *tmp;
2607         int ret = 0;
2608
2609         list_for_each_entry_safe(orphan, tmp, &rec->orphan_extents, list) {
2610                 /*
2611                  * Check for conflicting file extents
2612                  *
2613                  * Here we don't know whether the extents is compressed or not,
2614                  * so we can only assume it not compressed nor data offset,
2615                  * and use its disk_len as extent length.
2616                  */
2617                 ret = btrfs_get_extent(NULL, root, path, orphan->objectid,
2618                                        orphan->offset, orphan->disk_len, 0);
2619                 btrfs_release_path(path);
2620                 if (ret < 0)
2621                         goto out;
2622                 if (!ret) {
2623                         fprintf(stderr,
2624                                 "orphan extent (%llu, %llu) conflicts, delete the orphan\n",
2625                                 orphan->disk_bytenr, orphan->disk_len);
2626                         ret = btrfs_free_extent(trans,
2627                                         root->fs_info->extent_root,
2628                                         orphan->disk_bytenr, orphan->disk_len,
2629                                         0, root->objectid, orphan->objectid,
2630                                         orphan->offset);
2631                         if (ret < 0)
2632                                 goto out;
2633                 }
2634                 ret = btrfs_insert_file_extent(trans, root, orphan->objectid,
2635                                 orphan->offset, orphan->disk_bytenr,
2636                                 orphan->disk_len, orphan->disk_len);
2637                 if (ret < 0)
2638                         goto out;
2639
2640                 /* Update file size info */
2641                 rec->found_size += orphan->disk_len;
2642                 if (rec->found_size == rec->nbytes)
2643                         rec->errors &= ~I_ERR_FILE_NBYTES_WRONG;
2644
2645                 /* Update the file extent hole info too */
2646                 ret = del_file_extent_hole(&rec->holes, orphan->offset,
2647                                            orphan->disk_len);
2648                 if (ret < 0)
2649                         goto out;
2650                 if (RB_EMPTY_ROOT(&rec->holes))
2651                         rec->errors &= ~I_ERR_FILE_EXTENT_DISCOUNT;
2652
2653                 list_del(&orphan->list);
2654                 free(orphan);
2655         }
2656         rec->errors &= ~I_ERR_FILE_EXTENT_ORPHAN;
2657 out:
2658         return ret;
2659 }
2660
2661 static int repair_inode_discount_extent(struct btrfs_trans_handle *trans,
2662                                         struct btrfs_root *root,
2663                                         struct btrfs_path *path,
2664                                         struct inode_record *rec)
2665 {
2666         struct rb_node *node;
2667         struct file_extent_hole *hole;
2668         int ret = 0;
2669
2670         node = rb_first(&rec->holes);
2671
2672         while (node) {
2673                 hole = rb_entry(node, struct file_extent_hole, node);
2674                 ret = btrfs_punch_hole(trans, root, rec->ino,
2675                                        hole->start, hole->len);
2676                 if (ret < 0)
2677                         goto out;
2678                 ret = del_file_extent_hole(&rec->holes, hole->start,
2679                                            hole->len);
2680                 if (ret < 0)
2681                         goto out;
2682                 if (RB_EMPTY_ROOT(&rec->holes))
2683                         rec->errors &= ~I_ERR_FILE_EXTENT_DISCOUNT;
2684                 node = rb_first(&rec->holes);
2685         }
2686         printf("Fixed discount file extents for inode: %llu in root: %llu\n",
2687                rec->ino, root->objectid);
2688 out:
2689         return ret;
2690 }
2691
2692 static int try_repair_inode(struct btrfs_root *root, struct inode_record *rec)
2693 {
2694         struct btrfs_trans_handle *trans;
2695         struct btrfs_path *path;
2696         int ret = 0;
2697
2698         if (!(rec->errors & (I_ERR_DIR_ISIZE_WRONG |
2699                              I_ERR_NO_ORPHAN_ITEM |
2700                              I_ERR_LINK_COUNT_WRONG |
2701                              I_ERR_NO_INODE_ITEM |
2702                              I_ERR_FILE_EXTENT_ORPHAN |
2703                              I_ERR_FILE_EXTENT_DISCOUNT|
2704                              I_ERR_FILE_NBYTES_WRONG)))
2705                 return rec->errors;
2706
2707         path = btrfs_alloc_path();
2708         if (!path)
2709                 return -ENOMEM;
2710
2711         /*
2712          * For nlink repair, it may create a dir and add link, so
2713          * 2 for parent(256)'s dir_index and dir_item
2714          * 2 for lost+found dir's inode_item and inode_ref
2715          * 1 for the new inode_ref of the file
2716          * 2 for lost+found dir's dir_index and dir_item for the file
2717          */
2718         trans = btrfs_start_transaction(root, 7);
2719         if (IS_ERR(trans)) {
2720                 btrfs_free_path(path);
2721                 return PTR_ERR(trans);
2722         }
2723
2724         if (rec->errors & I_ERR_NO_INODE_ITEM)
2725                 ret = repair_inode_no_item(trans, root, path, rec);
2726         if (!ret && rec->errors & I_ERR_FILE_EXTENT_ORPHAN)
2727                 ret = repair_inode_orphan_extent(trans, root, path, rec);
2728         if (!ret && rec->errors & I_ERR_FILE_EXTENT_DISCOUNT)
2729                 ret = repair_inode_discount_extent(trans, root, path, rec);
2730         if (!ret && rec->errors & I_ERR_DIR_ISIZE_WRONG)
2731                 ret = repair_inode_isize(trans, root, path, rec);
2732         if (!ret && rec->errors & I_ERR_NO_ORPHAN_ITEM)
2733                 ret = repair_inode_orphan_item(trans, root, path, rec);
2734         if (!ret && rec->errors & I_ERR_LINK_COUNT_WRONG)
2735                 ret = repair_inode_nlinks(trans, root, path, rec);
2736         if (!ret && rec->errors & I_ERR_FILE_NBYTES_WRONG)
2737                 ret = repair_inode_nbytes(trans, root, path, rec);
2738         btrfs_commit_transaction(trans, root);
2739         btrfs_free_path(path);
2740         return ret;
2741 }
2742
/*
 * Check (and, when the global 'repair' flag is set, try to repair) every
 * inode record collected in @inode_cache for @root.
 *
 * All records are removed from @inode_cache and freed on the way, except
 * when the function returns early with an error.
 *
 * Returns:
 *   0        no error was counted, or the root has no references
 *   -1       at least one inode error was counted
 *   -EAGAIN  backrefs were repaired or the root dir was recreated
 *   <0       other error from backref repair or transaction start
 */
static int check_inode_recs(struct btrfs_root *root,
                            struct cache_tree *inode_cache)
{
        struct cache_extent *cache;
        struct ptr_node *node;
        struct inode_record *rec;
        struct inode_backref *backref;
        int stage = 0;
        int ret = 0;
        int err = 0;
        u64 error = 0;
        u64 root_dirid = btrfs_root_dirid(&root->root_item);

        /* A root with no references is not checked, only warned about */
        if (btrfs_root_refs(&root->root_item) == 0) {
                if (!cache_tree_empty(inode_cache))
                        fprintf(stderr, "warning line %d\n", __LINE__);
                return 0;
        }

        /*
         * We need to record the highest inode number for later 'lost+found'
         * dir creation.
         * We must select an ino not used/referred to by any existing inode,
         * or the 'lost+found' ino may be a missing ino in a corrupted leaf,
         * which may cause the 'lost+found' dir to have wrong nlinks.
         */
        cache = last_cache_extent(inode_cache);
        if (cache) {
                node = container_of(cache, struct ptr_node, cache);
                rec = node->data;
                if (rec->ino > root->highest_inode)
                        root->highest_inode = rec->ino;
        }

        /*
         * We need to repair backrefs first because we could change some of the
         * errors in the inode recs.
         *
         * We also need to go through and delete invalid backrefs first and then
         * add the correct ones second.  We do this because we may get EEXIST
         * when adding back the correct index because we hadn't yet deleted the
         * invalid index.
         *
         * For example, if we were missing a dir index then the directories
         * isize would be wrong, so if we fixed the isize to what we thought it
         * would be and then fixed the backref we'd still have a invalid fs, so
         * we need to add back the dir index and then check to see if the isize
         * is still wrong.
         *
         * Stage 1 calls repair_inode_backrefs() with its last argument set,
         * stage 2 with it cleared.  Stage 3 only runs when an earlier stage
         * set err, and drops every record from the cache.
         */
        while (stage < 3) {
                stage++;
                if (stage == 3 && !err)
                        break;

                cache = search_cache_extent(inode_cache, 0);
                while (repair && cache) {
                        node = container_of(cache, struct ptr_node, cache);
                        rec = node->data;
                        /* Advance first: the current node may be freed below */
                        cache = next_cache_extent(cache);

                        /* Need to free everything up and rescan */
                        if (stage == 3) {
                                remove_cache_extent(inode_cache, &node->cache);
                                free(node);
                                free_inode_rec(rec);
                                continue;
                        }

                        if (list_empty(&rec->backrefs))
                                continue;

                        ret = repair_inode_backrefs(root, rec, inode_cache,
                                                    stage == 1);
                        if (ret < 0) {
                                err = ret;
                                /* Jump straight to the cleanup stage */
                                stage = 2;
                                break;
                        } if (ret > 0) {
                                err = -EAGAIN;
                        }
                }
        }
        if (err)
                return err;

        /* The root directory gets its own check */
        rec = get_inode_rec(inode_cache, root_dirid, 0);
        if (rec) {
                ret = check_root_dir(rec);
                if (ret) {
                        fprintf(stderr, "root %llu root dir %llu error\n",
                                (unsigned long long)root->root_key.objectid,
                                (unsigned long long)root_dirid);
                        print_inode_error(root, rec);
                        error++;
                }
        } else {
                if (repair) {
                        struct btrfs_trans_handle *trans;

                        trans = btrfs_start_transaction(root, 1);
                        if (IS_ERR(trans)) {
                                err = PTR_ERR(trans);
                                return err;
                        }

                        fprintf(stderr,
                                "root %llu missing its root dir, recreating\n",
                                (unsigned long long)root->objectid);

                        ret = btrfs_make_root_dir(trans, root, root_dirid);
                        BUG_ON(ret);

                        btrfs_commit_transaction(trans, root);
                        return -EAGAIN;
                }

                fprintf(stderr, "root %llu root dir %llu not found\n",
                        (unsigned long long)root->root_key.objectid,
                        (unsigned long long)root_dirid);
        }

        /* Drain the cache: every remaining record is checked, then freed */
        while (1) {
                cache = search_cache_extent(inode_cache, 0);
                if (!cache)
                        break;
                node = container_of(cache, struct ptr_node, cache);
                rec = node->data;
                remove_cache_extent(inode_cache, &node->cache);
                free(node);
                /* The root dir was handled above; the orphan objectid is skipped */
                if (rec->ino == root_dirid ||
                    rec->ino == BTRFS_ORPHAN_OBJECTID) {
                        free_inode_rec(rec);
                        continue;
                }

                if (rec->errors & I_ERR_NO_ORPHAN_ITEM) {
                        /* check_orphan_item() returning 0 clears the flag */
                        ret = check_orphan_item(root, rec->ino);
                        if (ret == 0)
                                rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
                        if (can_free_inode_rec(rec)) {
                                free_inode_rec(rec);
                                continue;
                        }
                }

                if (!rec->found_inode_item)
                        rec->errors |= I_ERR_NO_INODE_ITEM;
                if (rec->found_link != rec->nlink)
                        rec->errors |= I_ERR_LINK_COUNT_WRONG;
                if (repair) {
                        ret = try_repair_inode(root, rec);
                        if (ret == 0 && can_free_inode_rec(rec)) {
                                free_inode_rec(rec);
                                continue;
                        }
                        /*
                         * NOTE(review): ret is reset unconditionally, so in
                         * repair mode the check below never counts an error,
                         * even when the repair failed -- confirm intended.
                         */
                        ret = 0;
                }

                if (!(repair && ret == 0))
                        error++;
                print_inode_error(root, rec);
                /* Report every backref of the record with its missing parts */
                list_for_each_entry(backref, &rec->backrefs, list) {
                        if (!backref->found_dir_item)
                                backref->errors |= REF_ERR_NO_DIR_ITEM;
                        if (!backref->found_dir_index)
                                backref->errors |= REF_ERR_NO_DIR_INDEX;
                        if (!backref->found_inode_ref)
                                backref->errors |= REF_ERR_NO_INODE_REF;
                        fprintf(stderr, "\tunresolved ref dir %llu index %llu"
                                " namelen %u name %s filetype %d errors %x",
                                (unsigned long long)backref->dir,
                                (unsigned long long)backref->index,
                                backref->namelen, backref->name,
                                backref->filetype, backref->errors);
                        print_ref_error(backref->errors);
                }
                free_inode_rec(rec);
        }
        return (error > 0) ? -1 : 0;
}
2923
2924 static struct root_record *get_root_rec(struct cache_tree *root_cache,
2925                                         u64 objectid)
2926 {
2927         struct cache_extent *cache;
2928         struct root_record *rec = NULL;
2929         int ret;
2930
2931         cache = lookup_cache_extent(root_cache, objectid, 1);
2932         if (cache) {
2933                 rec = container_of(cache, struct root_record, cache);
2934         } else {
2935                 rec = calloc(1, sizeof(*rec));
2936                 rec->objectid = objectid;
2937                 INIT_LIST_HEAD(&rec->backrefs);
2938                 rec->cache.start = objectid;
2939                 rec->cache.size = 1;
2940
2941                 ret = insert_cache_extent(root_cache, &rec->cache);
2942                 BUG_ON(ret);
2943         }
2944         return rec;
2945 }
2946
2947 static struct root_backref *get_root_backref(struct root_record *rec,
2948                                              u64 ref_root, u64 dir, u64 index,
2949                                              const char *name, int namelen)
2950 {
2951         struct root_backref *backref;
2952
2953         list_for_each_entry(backref, &rec->backrefs, list) {
2954                 if (backref->ref_root != ref_root || backref->dir != dir ||
2955                     backref->namelen != namelen)
2956                         continue;
2957                 if (memcmp(name, backref->name, namelen))
2958                         continue;
2959                 return backref;
2960         }
2961
2962         backref = malloc(sizeof(*backref) + namelen + 1);
2963         memset(backref, 0, sizeof(*backref));
2964         backref->ref_root = ref_root;
2965         backref->dir = dir;
2966         backref->index = index;
2967         backref->namelen = namelen;
2968         memcpy(backref->name, name, namelen);
2969         backref->name[namelen] = '\0';
2970         list_add_tail(&backref->list, &rec->backrefs);
2971         return backref;
2972 }
2973
2974 static void free_root_record(struct cache_extent *cache)
2975 {
2976         struct root_record *rec;
2977         struct root_backref *backref;
2978
2979         rec = container_of(cache, struct root_record, cache);
2980         while (!list_empty(&rec->backrefs)) {
2981                 backref = list_entry(rec->backrefs.next,
2982                                      struct root_backref, list);
2983                 list_del(&backref->list);
2984                 free(backref);
2985         }
2986
2987         kfree(rec);
2988 }
2989
2990 FREE_EXTENT_CACHE_BASED_TREE(root_recs, free_root_record);
2991
2992 static int add_root_backref(struct cache_tree *root_cache,
2993                             u64 root_id, u64 ref_root, u64 dir, u64 index,
2994                             const char *name, int namelen,
2995                             int item_type, int errors)
2996 {
2997         struct root_record *rec;
2998         struct root_backref *backref;
2999
3000         rec = get_root_rec(root_cache, root_id);
3001         backref = get_root_backref(rec, ref_root, dir, index, name, namelen);
3002
3003         backref->errors |= errors;
3004
3005         if (item_type != BTRFS_DIR_ITEM_KEY) {
3006                 if (backref->found_dir_index || backref->found_back_ref ||
3007                     backref->found_forward_ref) {
3008                         if (backref->index != index)
3009                                 backref->errors |= REF_ERR_INDEX_UNMATCH;
3010                 } else {
3011                         backref->index = index;
3012                 }
3013         }
3014
3015         if (item_type == BTRFS_DIR_ITEM_KEY) {
3016                 if (backref->found_forward_ref)
3017                         rec->found_ref++;
3018                 backref->found_dir_item = 1;
3019         } else if (item_type == BTRFS_DIR_INDEX_KEY) {
3020                 backref->found_dir_index = 1;
3021         } else if (item_type == BTRFS_ROOT_REF_KEY) {
3022                 if (backref->found_forward_ref)
3023                         backref->errors |= REF_ERR_DUP_ROOT_REF;
3024                 else if (backref->found_dir_item)
3025                         rec->found_ref++;
3026                 backref->found_forward_ref = 1;
3027         } else if (item_type == BTRFS_ROOT_BACKREF_KEY) {
3028                 if (backref->found_back_ref)
3029                         backref->errors |= REF_ERR_DUP_ROOT_BACKREF;
3030                 backref->found_back_ref = 1;
3031         } else {
3032                 BUG_ON(1);
3033         }
3034
3035         if (backref->found_forward_ref && backref->found_dir_item)
3036                 backref->reachable = 1;
3037         return 0;
3038 }
3039
3040 static int merge_root_recs(struct btrfs_root *root,
3041                            struct cache_tree *src_cache,
3042                            struct cache_tree *dst_cache)
3043 {
3044         struct cache_extent *cache;
3045         struct ptr_node *node;
3046         struct inode_record *rec;
3047         struct inode_backref *backref;
3048         int ret = 0;
3049
3050         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
3051                 free_inode_recs_tree(src_cache);
3052                 return 0;
3053         }
3054
3055         while (1) {
3056                 cache = search_cache_extent(src_cache, 0);
3057                 if (!cache)
3058                         break;
3059                 node = container_of(cache, struct ptr_node, cache);
3060                 rec = node->data;
3061                 remove_cache_extent(src_cache, &node->cache);
3062                 free(node);
3063
3064                 ret = is_child_root(root, root->objectid, rec->ino);
3065                 if (ret < 0)
3066                         break;
3067                 else if (ret == 0)
3068                         goto skip;
3069
3070                 list_for_each_entry(backref, &rec->backrefs, list) {
3071                         BUG_ON(backref->found_inode_ref);
3072                         if (backref->found_dir_item)
3073                                 add_root_backref(dst_cache, rec->ino,
3074                                         root->root_key.objectid, backref->dir,
3075                                         backref->index, backref->name,
3076                                         backref->namelen, BTRFS_DIR_ITEM_KEY,
3077                                         backref->errors);
3078                         if (backref->found_dir_index)
3079                                 add_root_backref(dst_cache, rec->ino,
3080                                         root->root_key.objectid, backref->dir,
3081                                         backref->index, backref->name,
3082                                         backref->namelen, BTRFS_DIR_INDEX_KEY,
3083                                         backref->errors);
3084                 }
3085 skip:
3086                 free_inode_rec(rec);
3087         }
3088         if (ret < 0)
3089                 return ret;
3090         return 0;
3091 }
3092
3093 static int check_root_refs(struct btrfs_root *root,
3094                            struct cache_tree *root_cache)
3095 {
3096         struct root_record *rec;
3097         struct root_record *ref_root;
3098         struct root_backref *backref;
3099         struct cache_extent *cache;
3100         int loop = 1;
3101         int ret;
3102         int error;
3103         int errors = 0;
3104
3105         rec = get_root_rec(root_cache, BTRFS_FS_TREE_OBJECTID);
3106         rec->found_ref = 1;
3107
3108         /* fixme: this can not detect circular references */
3109         while (loop) {
3110                 loop = 0;
3111                 cache = search_cache_extent(root_cache, 0);
3112                 while (1) {
3113                         if (!cache)
3114                                 break;
3115                         rec = container_of(cache, struct root_record, cache);
3116                         cache = next_cache_extent(cache);
3117
3118                         if (rec->found_ref == 0)
3119                                 continue;
3120
3121                         list_for_each_entry(backref, &rec->backrefs, list) {
3122                                 if (!backref->reachable)
3123                                         continue;
3124
3125                                 ref_root = get_root_rec(root_cache,
3126                                                         backref->ref_root);
3127                                 if (ref_root->found_ref > 0)
3128                                         continue;
3129
3130                                 backref->reachable = 0;
3131                                 rec->found_ref--;
3132                                 if (rec->found_ref == 0)
3133                                         loop = 1;
3134                         }
3135                 }
3136         }
3137
3138         cache = search_cache_extent(root_cache, 0);
3139         while (1) {
3140                 if (!cache)
3141                         break;
3142                 rec = container_of(cache, struct root_record, cache);
3143                 cache = next_cache_extent(cache);
3144
3145                 if (rec->found_ref == 0 &&
3146                     rec->objectid >= BTRFS_FIRST_FREE_OBJECTID &&
3147                     rec->objectid <= BTRFS_LAST_FREE_OBJECTID) {
3148                         ret = check_orphan_item(root->fs_info->tree_root,
3149                                                 rec->objectid);
3150                         if (ret == 0)
3151                                 continue;
3152
3153                         /*
3154                          * If we don't have a root item then we likely just have
3155                          * a dir item in a snapshot for this root but no actual
3156                          * ref key or anything so it's meaningless.
3157                          */
3158                         if (!rec->found_root_item)
3159                                 continue;
3160                         errors++;
3161                         fprintf(stderr, "fs tree %llu not referenced\n",
3162                                 (unsigned long long)rec->objectid);
3163                 }
3164
3165                 error = 0;
3166                 if (rec->found_ref > 0 && !rec->found_root_item)
3167                         error = 1;
3168                 list_for_each_entry(backref, &rec->backrefs, list) {
3169                         if (!backref->found_dir_item)
3170                                 backref->errors |= REF_ERR_NO_DIR_ITEM;
3171                         if (!backref->found_dir_index)
3172                                 backref->errors |= REF_ERR_NO_DIR_INDEX;
3173                         if (!backref->found_back_ref)
3174                                 backref->errors |= REF_ERR_NO_ROOT_BACKREF;
3175                         if (!backref->found_forward_ref)
3176                                 backref->errors |= REF_ERR_NO_ROOT_REF;
3177                         if (backref->reachable && backref->errors)
3178                                 error = 1;
3179                 }
3180                 if (!error)
3181                         continue;
3182
3183                 errors++;
3184                 fprintf(stderr, "fs tree %llu refs %u %s\n",
3185                         (unsigned long long)rec->objectid, rec->found_ref,
3186                          rec->found_root_item ? "" : "not found");
3187
3188                 list_for_each_entry(backref, &rec->backrefs, list) {
3189                         if (!backref->reachable)
3190                                 continue;
3191                         if (!backref->errors && rec->found_root_item)
3192                                 continue;
3193                         fprintf(stderr, "\tunresolved ref root %llu dir %llu"
3194                                 " index %llu namelen %u name %s errors %x\n",
3195                                 (unsigned long long)backref->ref_root,
3196                                 (unsigned long long)backref->dir,
3197                                 (unsigned long long)backref->index,
3198                                 backref->namelen, backref->name,
3199                                 backref->errors);
3200                         print_ref_error(backref->errors);
3201                 }
3202         }
3203         return errors > 0 ? 1 : 0;
3204 }
3205
3206 static int process_root_ref(struct extent_buffer *eb, int slot,
3207                             struct btrfs_key *key,
3208                             struct cache_tree *root_cache)
3209 {
3210         u64 dirid;
3211         u64 index;
3212         u32 len;
3213         u32 name_len;
3214         struct btrfs_root_ref *ref;
3215         char namebuf[BTRFS_NAME_LEN];
3216         int error;
3217
3218         ref = btrfs_item_ptr(eb, slot, struct btrfs_root_ref);
3219
3220         dirid = btrfs_root_ref_dirid(eb, ref);
3221         index = btrfs_root_ref_sequence(eb, ref);
3222         name_len = btrfs_root_ref_name_len(eb, ref);
3223
3224         if (name_len <= BTRFS_NAME_LEN) {
3225                 len = name_len;
3226                 error = 0;
3227         } else {
3228                 len = BTRFS_NAME_LEN;
3229                 error = REF_ERR_NAME_TOO_LONG;
3230         }
3231         read_extent_buffer(eb, namebuf, (unsigned long)(ref + 1), len);
3232
3233         if (key->type == BTRFS_ROOT_REF_KEY) {
3234                 add_root_backref(root_cache, key->offset, key->objectid, dirid,
3235                                  index, namebuf, len, key->type, error);
3236         } else {
3237                 add_root_backref(root_cache, key->objectid, key->offset, dirid,
3238                                  index, namebuf, len, key->type, error);
3239         }
3240         return 0;
3241 }
3242
3243 static void free_corrupt_block(struct cache_extent *cache)
3244 {
3245         struct btrfs_corrupt_block *corrupt;
3246
3247         corrupt = container_of(cache, struct btrfs_corrupt_block, cache);
3248         free(corrupt);
3249 }
3250
3251 FREE_EXTENT_CACHE_BASED_TREE(corrupt_blocks, free_corrupt_block);
3252
3253 /*
3254  * Repair the btree of the given root.
3255  *
3256  * The fix is to remove the node key in corrupt_blocks cache_tree.
3257  * and rebalance the tree.
3258  * After the fix, the btree should be writeable.
3259  */
3260 static int repair_btree(struct btrfs_root *root,
3261                         struct cache_tree *corrupt_blocks)
3262 {
3263         struct btrfs_trans_handle *trans;
3264         struct btrfs_path *path;
3265         struct btrfs_corrupt_block *corrupt;
3266         struct cache_extent *cache;
3267         struct btrfs_key key;
3268         u64 offset;
3269         int level;
3270         int ret = 0;
3271
3272         if (cache_tree_empty(corrupt_blocks))
3273                 return 0;
3274
3275         path = btrfs_alloc_path();
3276         if (!path)
3277                 return -ENOMEM;
3278
3279         trans = btrfs_start_transaction(root, 1);
3280         if (IS_ERR(trans)) {
3281                 ret = PTR_ERR(trans);
3282                 fprintf(stderr, "Error starting transaction: %s\n",
3283                         strerror(-ret));
3284                 goto out_free_path;
3285         }
3286         cache = first_cache_extent(corrupt_blocks);
3287         while (cache) {
3288                 corrupt = container_of(cache, struct btrfs_corrupt_block,
3289                                        cache);
3290                 level = corrupt->level;
3291                 path->lowest_level = level;
3292                 key.objectid = corrupt->key.objectid;
3293                 key.type = corrupt->key.type;
3294                 key.offset = corrupt->key.offset;
3295
3296                 /*
3297                  * Here we don't want to do any tree balance, since it may
3298                  * cause a balance with corrupted brother leaf/node,
3299                  * so ins_len set to 0 here.
3300                  * Balance will be done after all corrupt node/leaf is deleted.
3301                  */
3302                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3303                 if (ret < 0)
3304                         goto out;
3305                 offset = btrfs_node_blockptr(path->nodes[level],
3306                                              path->slots[level]);
3307
3308                 /* Remove the ptr */
3309                 ret = btrfs_del_ptr(trans, root, path, level,
3310                                     path->slots[level]);
3311                 if (ret < 0)
3312                         goto out;
3313                 /*
3314                  * Remove the corresponding extent
3315                  * return value is not concerned.
3316                  */
3317                 btrfs_release_path(path);
3318                 ret = btrfs_free_extent(trans, root, offset, root->nodesize,
3319                                         0, root->root_key.objectid,
3320                                         level - 1, 0);
3321                 cache = next_cache_extent(cache);
3322         }
3323
3324         /* Balance the btree using btrfs_search_slot() */
3325         cache = first_cache_extent(corrupt_blocks);
3326         while (cache) {
3327                 corrupt = container_of(cache, struct btrfs_corrupt_block,
3328                                        cache);
3329                 memcpy(&key, &corrupt->key, sizeof(key));
3330                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3331                 if (ret < 0)
3332                         goto out;
3333                 /* return will always >0 since it won't find the item */
3334                 ret = 0;
3335                 btrfs_release_path(path);
3336                 cache = next_cache_extent(cache);
3337         }
3338 out:
3339         btrfs_commit_transaction(trans, root);
3340 out_free_path:
3341         btrfs_free_path(path);
3342         return ret;
3343 }
3344
3345 static int check_fs_root(struct btrfs_root *root,
3346                          struct cache_tree *root_cache,
3347                          struct walk_control *wc)
3348 {
3349         int ret = 0;
3350         int err = 0;
3351         int wret;
3352         int level;
3353         struct btrfs_path path;
3354         struct shared_node root_node;
3355         struct root_record *rec;
3356         struct btrfs_root_item *root_item = &root->root_item;
3357         struct cache_tree corrupt_blocks;
3358         struct orphan_data_extent *orphan;
3359         struct orphan_data_extent *tmp;
3360         enum btrfs_tree_block_status status;
3361
3362         /*
3363          * Reuse the corrupt_block cache tree to record corrupted tree block
3364          *
3365          * Unlike the usage in extent tree check, here we do it in a per
3366          * fs/subvol tree base.
3367          */
3368         cache_tree_init(&corrupt_blocks);
3369         root->fs_info->corrupt_blocks = &corrupt_blocks;
3370
3371         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
3372                 rec = get_root_rec(root_cache, root->root_key.objectid);
3373                 if (btrfs_root_refs(root_item) > 0)
3374                         rec->found_root_item = 1;
3375         }
3376
3377         btrfs_init_path(&path);
3378         memset(&root_node, 0, sizeof(root_node));
3379         cache_tree_init(&root_node.root_cache);
3380         cache_tree_init(&root_node.inode_cache);
3381
3382         /* Move the orphan extent record to corresponding inode_record */
3383         list_for_each_entry_safe(orphan, tmp,
3384                                  &root->orphan_data_extents, list) {
3385                 struct inode_record *inode;
3386
3387                 inode = get_inode_rec(&root_node.inode_cache, orphan->objectid,
3388                                       1);
3389                 inode->errors |= I_ERR_FILE_EXTENT_ORPHAN;
3390                 list_move(&orphan->list, &inode->orphan_extents);
3391         }
3392
3393         level = btrfs_header_level(root->node);
3394         memset(wc->nodes, 0, sizeof(wc->nodes));
3395         wc->nodes[level] = &root_node;
3396         wc->active_node = level;
3397         wc->root_level = level;
3398
3399         /* We may not have checked the root block, lets do that now */
3400         if (btrfs_is_leaf(root->node))
3401                 status = btrfs_check_leaf(root, NULL, root->node);
3402         else
3403                 status = btrfs_check_node(root, NULL, root->node);
3404         if (status != BTRFS_TREE_BLOCK_CLEAN)
3405                 return -EIO;
3406
3407         if (btrfs_root_refs(root_item) > 0 ||
3408             btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3409                 path.nodes[level] = root->node;
3410                 extent_buffer_get(root->node);
3411                 path.slots[level] = 0;
3412         } else {
3413                 struct btrfs_key key;
3414                 struct btrfs_disk_key found_key;
3415
3416                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3417                 level = root_item->drop_level;
3418                 path.lowest_level = level;
3419                 wret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
3420                 if (wret < 0)
3421                         goto skip_walking;
3422                 btrfs_node_key(path.nodes[level], &found_key,
3423                                 path.slots[level]);
3424                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3425                                         sizeof(found_key)));
3426         }
3427
3428         while (1) {
3429                 wret = walk_down_tree(root, &path, wc, &level);
3430                 if (wret < 0)
3431                         ret = wret;
3432                 if (wret != 0)
3433                         break;
3434
3435                 wret = walk_up_tree(root, &path, wc, &level);
3436                 if (wret < 0)
3437                         ret = wret;
3438                 if (wret != 0)
3439                         break;
3440         }
3441 skip_walking:
3442         btrfs_release_path(&path);
3443
3444         if (!cache_tree_empty(&corrupt_blocks)) {
3445                 struct cache_extent *cache;
3446                 struct btrfs_corrupt_block *corrupt;
3447
3448                 printf("The following tree block(s) is corrupted in tree %llu:\n",
3449                        root->root_key.objectid);
3450                 cache = first_cache_extent(&corrupt_blocks);
3451                 while (cache) {
3452                         corrupt = container_of(cache,
3453                                                struct btrfs_corrupt_block,
3454                                                cache);
3455                         printf("\ttree block bytenr: %llu, level: %d, node key: (%llu, %u, %llu)\n",
3456                                cache->start, corrupt->level,
3457                                corrupt->key.objectid, corrupt->key.type,
3458                                corrupt->key.offset);
3459                         cache = next_cache_extent(cache);
3460                 }
3461                 if (repair) {
3462                         printf("Try to repair the btree for root %llu\n",
3463                                root->root_key.objectid);
3464                         ret = repair_btree(root, &corrupt_blocks);
3465                         if (ret < 0)
3466                                 fprintf(stderr, "Failed to repair btree: %s\n",
3467                                         strerror(-ret));
3468                         if (!ret)
3469                                 printf("Btree for root %llu is fixed\n",
3470                                        root->root_key.objectid);
3471                 }
3472         }
3473
3474         err = merge_root_recs(root, &root_node.root_cache, root_cache);
3475         if (err < 0)
3476                 ret = err;
3477
3478         if (root_node.current) {
3479                 root_node.current->checked = 1;
3480                 maybe_free_inode_rec(&root_node.inode_cache,
3481                                 root_node.current);
3482         }
3483
3484         err = check_inode_recs(root, &root_node.inode_cache);
3485         if (!ret)
3486                 ret = err;
3487
3488         free_corrupt_blocks_tree(&corrupt_blocks);
3489         root->fs_info->corrupt_blocks = NULL;
3490         free_orphan_data_extents(&root->orphan_data_extents);
3491         return ret;
3492 }
3493
3494 static int fs_root_objectid(u64 objectid)
3495 {
3496         if (objectid == BTRFS_TREE_RELOC_OBJECTID ||
3497             objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3498                 return 1;
3499         return is_fstree(objectid);
3500 }
3501
3502 static int check_fs_roots(struct btrfs_root *root,
3503                           struct cache_tree *root_cache)
3504 {
3505         struct btrfs_path path;
3506         struct btrfs_key key;
3507         struct walk_control wc;
3508         struct extent_buffer *leaf, *tree_node;
3509         struct btrfs_root *tmp_root;
3510         struct btrfs_root *tree_root = root->fs_info->tree_root;
3511         int ret;
3512         int err = 0;
3513
3514         /*
3515          * Just in case we made any changes to the extent tree that weren't
3516          * reflected into the free space cache yet.
3517          */
3518         if (repair)
3519                 reset_cached_block_groups(root->fs_info);
3520         memset(&wc, 0, sizeof(wc));
3521         cache_tree_init(&wc.shared);
3522         btrfs_init_path(&path);
3523
3524 again:
3525         key.offset = 0;
3526         key.objectid = 0;
3527         key.type = BTRFS_ROOT_ITEM_KEY;
3528         ret = btrfs_search_slot(NULL, tree_root, &key, &path, 0, 0);
3529         if (ret < 0) {
3530                 err = 1;
3531                 goto out;
3532         }
3533         tree_node = tree_root->node;
3534         while (1) {
3535                 if (tree_node != tree_root->node) {
3536                         free_root_recs_tree(root_cache);
3537                         btrfs_release_path(&path);
3538                         goto again;
3539                 }
3540                 leaf = path.nodes[0];
3541                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
3542                         ret = btrfs_next_leaf(tree_root, &path);
3543                         if (ret) {
3544                                 if (ret < 0)
3545                                         err = 1;
3546                                 break;
3547                         }
3548                         leaf = path.nodes[0];
3549                 }
3550                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
3551                 if (key.type == BTRFS_ROOT_ITEM_KEY &&
3552                     fs_root_objectid(key.objectid)) {
3553                         if (key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
3554                                 tmp_root = btrfs_read_fs_root_no_cache(
3555                                                 root->fs_info, &key);
3556                         } else {
3557                                 key.offset = (u64)-1;
3558                                 tmp_root = btrfs_read_fs_root(
3559                                                 root->fs_info, &key);
3560                         }
3561                         if (IS_ERR(tmp_root)) {
3562                                 err = 1;
3563                                 goto next;
3564                         }
3565                         ret = check_fs_root(tmp_root, root_cache, &wc);
3566                         if (ret == -EAGAIN) {
3567                                 free_root_recs_tree(root_cache);
3568                                 btrfs_release_path(&path);
3569                                 goto again;
3570                         }
3571                         if (ret)
3572                                 err = 1;
3573                         if (key.objectid == BTRFS_TREE_RELOC_OBJECTID)
3574                                 btrfs_free_fs_root(tmp_root);
3575                 } else if (key.type == BTRFS_ROOT_REF_KEY ||
3576                            key.type == BTRFS_ROOT_BACKREF_KEY) {
3577                         process_root_ref(leaf, path.slots[0], &key,
3578                                          root_cache);
3579                 }
3580 next:
3581                 path.slots[0]++;
3582         }
3583 out:
3584         btrfs_release_path(&path);
3585         if (err)
3586                 free_extent_cache_tree(&wc.shared);
3587         if (!cache_tree_empty(&wc.shared))
3588                 fprintf(stderr, "warning line %d\n", __LINE__);
3589
3590         return err;
3591 }
3592
3593 static int all_backpointers_checked(struct extent_record *rec, int print_errs)
3594 {
3595         struct list_head *cur = rec->backrefs.next;
3596         struct extent_backref *back;
3597         struct tree_backref *tback;
3598         struct data_backref *dback;
3599         u64 found = 0;
3600         int err = 0;
3601
3602         while(cur != &rec->backrefs) {
3603                 back = list_entry(cur, struct extent_backref, list);
3604                 cur = cur->next;
3605                 if (!back->found_extent_tree) {
3606                         err = 1;
3607                         if (!print_errs)
3608                                 goto out;
3609                         if (back->is_data) {
3610                                 dback = (struct data_backref *)back;
3611                                 fprintf(stderr, "Backref %llu %s %llu"
3612                                         " owner %llu offset %llu num_refs %lu"
3613                                         " not found in extent tree\n",
3614                                         (unsigned long long)rec->start,
3615                                         back->full_backref ?
3616                                         "parent" : "root",
3617                                         back->full_backref ?
3618                                         (unsigned long long)dback->parent:
3619                                         (unsigned long long)dback->root,
3620                                         (unsigned long long)dback->owner,
3621                                         (unsigned long long)dback->offset,
3622                                         (unsigned long)dback->num_refs);
3623                         } else {
3624                                 tback = (struct tree_backref *)back;
3625                                 fprintf(stderr, "Backref %llu parent %llu"
3626                                         " root %llu not found in extent tree\n",
3627                                         (unsigned long long)rec->start,
3628                                         (unsigned long long)tback->parent,
3629                                         (unsigned long long)tback->root);
3630                         }
3631                 }
3632                 if (!back->is_data && !back->found_ref) {
3633                         err = 1;
3634                         if (!print_errs)
3635                                 goto out;
3636                         tback = (struct tree_backref *)back;
3637                         fprintf(stderr, "Backref %llu %s %llu not referenced back %p\n",
3638                                 (unsigned long long)rec->start,
3639                                 back->full_backref ? "parent" : "root",
3640                                 back->full_backref ?
3641                                 (unsigned long long)tback->parent :
3642                                 (unsigned long long)tback->root, back);
3643                 }
3644                 if (back->is_data) {
3645                         dback = (struct data_backref *)back;
3646                         if (dback->found_ref != dback->num_refs) {
3647                                 err = 1;
3648                                 if (!print_errs)
3649                                         goto out;
3650                                 fprintf(stderr, "Incorrect local backref count"
3651                                         " on %llu %s %llu owner %llu"
3652                                         " offset %llu found %u wanted %u back %p\n",
3653                                         (unsigned long long)rec->start,
3654                                         back->full_backref ?
3655                                         "parent" : "root",
3656                                         back->full_backref ?
3657                                         (unsigned long long)dback->parent:
3658                                         (unsigned long long)dback->root,
3659                                         (unsigned long long)dback->owner,
3660                                         (unsigned long long)dback->offset,
3661                                         dback->found_ref, dback->num_refs, back);
3662                         }
3663                         if (dback->disk_bytenr != rec->start) {
3664                                 err = 1;
3665                                 if (!print_errs)
3666                                         goto out;
3667                                 fprintf(stderr, "Backref disk bytenr does not"
3668                                         " match extent record, bytenr=%llu, "
3669                                         "ref bytenr=%llu\n",
3670                                         (unsigned long long)rec->start,
3671                                         (unsigned long long)dback->disk_bytenr);
3672                         }
3673
3674                         if (dback->bytes != rec->nr) {
3675                                 err = 1;
3676                                 if (!print_errs)
3677                                         goto out;
3678                                 fprintf(stderr, "Backref bytes do not match "
3679                                         "extent backref, bytenr=%llu, ref "
3680                                         "bytes=%llu, backref bytes=%llu\n",
3681                                         (unsigned long long)rec->start,
3682                                         (unsigned long long)rec->nr,
3683                                         (unsigned long long)dback->bytes);
3684                         }
3685                 }
3686                 if (!back->is_data) {
3687                         found += 1;
3688                 } else {
3689                         dback = (struct data_backref *)back;
3690                         found += dback->found_ref;
3691                 }
3692         }
3693         if (found != rec->refs) {
3694                 err = 1;
3695                 if (!print_errs)
3696                         goto out;
3697                 fprintf(stderr, "Incorrect global backref count "
3698                         "on %llu found %llu wanted %llu\n",
3699                         (unsigned long long)rec->start,
3700                         (unsigned long long)found,
3701                         (unsigned long long)rec->refs);
3702         }
3703 out:
3704         return err;
3705 }
3706
3707 static int free_all_extent_backrefs(struct extent_record *rec)
3708 {
3709         struct extent_backref *back;
3710         struct list_head *cur;
3711         while (!list_empty(&rec->backrefs)) {
3712                 cur = rec->backrefs.next;
3713                 back = list_entry(cur, struct extent_backref, list);
3714                 list_del(cur);
3715                 free(back);
3716         }
3717         return 0;
3718 }
3719
3720 static void free_extent_record_cache(struct btrfs_fs_info *fs_info,
3721                                      struct cache_tree *extent_cache)
3722 {
3723         struct cache_extent *cache;
3724         struct extent_record *rec;
3725
3726         while (1) {
3727                 cache = first_cache_extent(extent_cache);
3728                 if (!cache)
3729                         break;
3730                 rec = container_of(cache, struct extent_record, cache);
3731                 remove_cache_extent(extent_cache, cache);
3732                 free_all_extent_backrefs(rec);
3733                 free(rec);
3734         }
3735 }
3736
3737 static int maybe_free_extent_rec(struct cache_tree *extent_cache,
3738                                  struct extent_record *rec)
3739 {
3740         if (rec->content_checked && rec->owner_ref_checked &&
3741             rec->extent_item_refs == rec->refs && rec->refs > 0 &&
3742             rec->num_duplicates == 0 && !all_backpointers_checked(rec, 0) &&
3743             !rec->bad_full_backref && !rec->crossing_stripes) {
3744                 remove_cache_extent(extent_cache, &rec->cache);
3745                 free_all_extent_backrefs(rec);
3746                 list_del_init(&rec->list);
3747                 free(rec);
3748         }
3749         return 0;
3750 }
3751
3752 static int check_owner_ref(struct btrfs_root *root,
3753                             struct extent_record *rec,
3754                             struct extent_buffer *buf)
3755 {
3756         struct extent_backref *node;
3757         struct tree_backref *back;
3758         struct btrfs_root *ref_root;
3759         struct btrfs_key key;
3760         struct btrfs_path path;
3761         struct extent_buffer *parent;
3762         int level;
3763         int found = 0;
3764         int ret;
3765
3766         list_for_each_entry(node, &rec->backrefs, list) {
3767                 if (node->is_data)
3768                         continue;
3769                 if (!node->found_ref)
3770                         continue;
3771                 if (node->full_backref)
3772                         continue;
3773                 back = (struct tree_backref *)node;
3774                 if (btrfs_header_owner(buf) == back->root)
3775                         return 0;
3776         }
3777         BUG_ON(rec->is_root);
3778
3779         /* try to find the block by search corresponding fs tree */
3780         key.objectid = btrfs_header_owner(buf);
3781         key.type = BTRFS_ROOT_ITEM_KEY;
3782         key.offset = (u64)-1;
3783
3784         ref_root = btrfs_read_fs_root(root->fs_info, &key);
3785         if (IS_ERR(ref_root))
3786                 return 1;
3787
3788         level = btrfs_header_level(buf);
3789         if (level == 0)
3790                 btrfs_item_key_to_cpu(buf, &key, 0);
3791         else
3792                 btrfs_node_key_to_cpu(buf, &key, 0);
3793
3794         btrfs_init_path(&path);
3795         path.lowest_level = level + 1;
3796         ret = btrfs_search_slot(NULL, ref_root, &key, &path, 0, 0);
3797         if (ret < 0)
3798                 return 0;
3799
3800         parent = path.nodes[level + 1];
3801         if (parent && buf->start == btrfs_node_blockptr(parent,
3802                                                         path.slots[level + 1]))
3803                 found = 1;
3804
3805         btrfs_release_path(&path);
3806         return found ? 0 : 1;
3807 }
3808
3809 static int is_extent_tree_record(struct extent_record *rec)
3810 {
3811         struct list_head *cur = rec->backrefs.next;
3812         struct extent_backref *node;
3813         struct tree_backref *back;
3814         int is_extent = 0;
3815
3816         while(cur != &rec->backrefs) {
3817                 node = list_entry(cur, struct extent_backref, list);
3818                 cur = cur->next;
3819                 if (node->is_data)
3820                         return 0;
3821                 back = (struct tree_backref *)node;
3822                 if (node->full_backref)
3823                         return 0;
3824                 if (back->root == BTRFS_EXTENT_TREE_OBJECTID)
3825                         is_extent = 1;
3826         }
3827         return is_extent;
3828 }
3829
3830
3831 static int record_bad_block_io(struct btrfs_fs_info *info,
3832                                struct cache_tree *extent_cache,
3833                                u64 start, u64 len)
3834 {
3835         struct extent_record *rec;
3836         struct cache_extent *cache;
3837         struct btrfs_key key;
3838
3839         cache = lookup_cache_extent(extent_cache, start, len);
3840         if (!cache)
3841                 return 0;
3842
3843         rec = container_of(cache, struct extent_record, cache);
3844         if (!is_extent_tree_record(rec))
3845                 return 0;
3846
3847         btrfs_disk_key_to_cpu(&key, &rec->parent_key);
3848         return btrfs_add_corrupt_extent_record(info, &key, start, len, 0);
3849 }
3850
3851 static int swap_values(struct btrfs_root *root, struct btrfs_path *path,
3852                        struct extent_buffer *buf, int slot)
3853 {
3854         if (btrfs_header_level(buf)) {
3855                 struct btrfs_key_ptr ptr1, ptr2;
3856
3857                 read_extent_buffer(buf, &ptr1, btrfs_node_key_ptr_offset(slot),
3858                                    sizeof(struct btrfs_key_ptr));
3859                 read_extent_buffer(buf, &ptr2,
3860                                    btrfs_node_key_ptr_offset(slot + 1),
3861                                    sizeof(struct btrfs_key_ptr));
3862                 write_extent_buffer(buf, &ptr1,
3863                                     btrfs_node_key_ptr_offset(slot + 1),
3864                                     sizeof(struct btrfs_key_ptr));
3865                 write_extent_buffer(buf, &ptr2,
3866                                     btrfs_node_key_ptr_offset(slot),
3867                                     sizeof(struct btrfs_key_ptr));
3868                 if (slot == 0) {
3869                         struct btrfs_disk_key key;
3870                         btrfs_node_key(buf, &key, 0);
3871                         btrfs_fixup_low_keys(root, path, &key,
3872                                              btrfs_header_level(buf) + 1);
3873                 }
3874         } else {
3875                 struct btrfs_item *item1, *item2;
3876                 struct btrfs_key k1, k2;
3877                 char *item1_data, *item2_data;
3878                 u32 item1_offset, item2_offset, item1_size, item2_size;
3879
3880                 item1 = btrfs_item_nr(slot);
3881                 item2 = btrfs_item_nr(slot + 1);
3882                 btrfs_item_key_to_cpu(buf, &k1, slot);
3883                 btrfs_item_key_to_cpu(buf, &k2, slot + 1);
3884                 item1_offset = btrfs_item_offset(buf, item1);
3885                 item2_offset = btrfs_item_offset(buf, item2);
3886                 item1_size = btrfs_item_size(buf, item1);
3887                 item2_size = btrfs_item_size(buf, item2);
3888
3889                 item1_data = malloc(item1_size);
3890                 if (!item1_data)
3891                         return -ENOMEM;
3892                 item2_data = malloc(item2_size);
3893                 if (!item2_data) {
3894                         free(item1_data);
3895                         return -ENOMEM;
3896                 }
3897
3898                 read_extent_buffer(buf, item1_data, item1_offset, item1_size);
3899                 read_extent_buffer(buf, item2_data, item2_offset, item2_size);
3900
3901                 write_extent_buffer(buf, item1_data, item2_offset, item2_size);
3902                 write_extent_buffer(buf, item2_data, item1_offset, item1_size);
3903                 free(item1_data);
3904                 free(item2_data);
3905
3906                 btrfs_set_item_offset(buf, item1, item2_offset);
3907                 btrfs_set_item_offset(buf, item2, item1_offset);
3908                 btrfs_set_item_size(buf, item1, item2_size);
3909                 btrfs_set_item_size(buf, item2, item1_size);
3910
3911                 path->slots[0] = slot;
3912                 btrfs_set_item_key_unsafe(root, path, &k2);
3913                 path->slots[0] = slot + 1;
3914                 btrfs_set_item_key_unsafe(root, path, &k1);
3915         }
3916         return 0;
3917 }
3918
3919 static int fix_key_order(struct btrfs_trans_handle *trans,
3920                          struct btrfs_root *root,
3921                          struct btrfs_path *path)
3922 {
3923         struct extent_buffer *buf;
3924         struct btrfs_key k1, k2;
3925         int i;
3926         int level = path->lowest_level;
3927         int ret = -EIO;
3928
3929         buf = path->nodes[level];
3930         for (i = 0; i < btrfs_header_nritems(buf) - 1; i++) {
3931                 if (level) {
3932                         btrfs_node_key_to_cpu(buf, &k1, i);
3933                         btrfs_node_key_to_cpu(buf, &k2, i + 1);
3934                 } else {
3935                         btrfs_item_key_to_cpu(buf, &k1, i);
3936                         btrfs_item_key_to_cpu(buf, &k2, i + 1);
3937                 }
3938                 if (btrfs_comp_cpu_keys(&k1, &k2) < 0)
3939                         continue;
3940                 ret = swap_values(root, path, buf, i);
3941                 if (ret)
3942                         break;
3943                 btrfs_mark_buffer_dirty(buf);
3944                 i = 0;
3945         }
3946         return ret;
3947 }
3948
3949 static int delete_bogus_item(struct btrfs_trans_handle *trans,
3950                              struct btrfs_root *root,
3951                              struct btrfs_path *path,
3952                              struct extent_buffer *buf, int slot)
3953 {
3954         struct btrfs_key key;
3955         int nritems = btrfs_header_nritems(buf);
3956
3957         btrfs_item_key_to_cpu(buf, &key, slot);
3958
3959         /* These are all the keys we can deal with missing. */
3960         if (key.type != BTRFS_DIR_INDEX_KEY &&
3961             key.type != BTRFS_EXTENT_ITEM_KEY &&
3962             key.type != BTRFS_METADATA_ITEM_KEY &&
3963             key.type != BTRFS_TREE_BLOCK_REF_KEY &&
3964             key.type != BTRFS_EXTENT_DATA_REF_KEY)
3965                 return -1;
3966
3967         printf("Deleting bogus item [%llu,%u,%llu] at slot %d on block %llu\n",
3968                (unsigned long long)key.objectid, key.type,
3969                (unsigned long long)key.offset, slot, buf->start);
3970         memmove_extent_buffer(buf, btrfs_item_nr_offset(slot),
3971                               btrfs_item_nr_offset(slot + 1),
3972                               sizeof(struct btrfs_item) *
3973                               (nritems - slot - 1));
3974         btrfs_set_header_nritems(buf, nritems - 1);
3975         if (slot == 0) {
3976                 struct btrfs_disk_key disk_key;
3977
3978                 btrfs_item_key(buf, &disk_key, 0);
3979                 btrfs_fixup_low_keys(root, path, &disk_key, 1);
3980         }
3981         btrfs_mark_buffer_dirty(buf);
3982         return 0;
3983 }
3984
3985 static int fix_item_offset(struct btrfs_trans_handle *trans,
3986                            struct btrfs_root *root,
3987                            struct btrfs_path *path)
3988 {
3989         struct extent_buffer *buf;
3990         int i;
3991         int ret = 0;
3992
3993         /* We should only get this for leaves */
3994         BUG_ON(path->lowest_level);
3995         buf = path->nodes[0];
3996 again:
3997         for (i = 0; i < btrfs_header_nritems(buf); i++) {
3998                 unsigned int shift = 0, offset;
3999
4000                 if (i == 0 && btrfs_item_end_nr(buf, i) !=
4001                     BTRFS_LEAF_DATA_SIZE(root)) {
4002                         if (btrfs_item_end_nr(buf, i) >
4003                             BTRFS_LEAF_DATA_SIZE(root)) {
4004                                 ret = delete_bogus_item(trans, root, path,
4005                                                         buf, i);
4006                                 if (!ret)
4007                                         goto again;
4008                                 fprintf(stderr, "item is off the end of the "
4009                                         "leaf, can't fix\n");
4010                                 ret = -EIO;
4011                                 break;
4012                         }
4013                         shift = BTRFS_LEAF_DATA_SIZE(root) -
4014                                 btrfs_item_end_nr(buf, i);
4015                 } else if (i > 0 && btrfs_item_end_nr(buf, i) !=
4016                            btrfs_item_offset_nr(buf, i - 1)) {
4017                         if (btrfs_item_end_nr(buf, i) >
4018                             btrfs_item_offset_nr(buf, i - 1)) {
4019                                 ret = delete_bogus_item(trans, root, path,
4020                                                         buf, i);
4021                                 if (!ret)
4022                                         goto again;
4023                                 fprintf(stderr, "items overlap, can't fix\n");
4024                                 ret = -EIO;
4025                                 break;
4026                         }
4027                         shift = btrfs_item_offset_nr(buf, i - 1) -
4028                                 btrfs_item_end_nr(buf, i);
4029                 }
4030                 if (!shift)
4031                         continue;
4032
4033                 printf("Shifting item nr %d by %u bytes in block %llu\n",
4034                        i, shift, (unsigned long long)buf->start);
4035                 offset = btrfs_item_offset_nr(buf, i);
4036                 memmove_extent_buffer(buf,
4037                                       btrfs_leaf_data(buf) + offset + shift,
4038                                       btrfs_leaf_data(buf) + offset,
4039                                       btrfs_item_size_nr(buf, i));
4040                 btrfs_set_item_offset(buf, btrfs_item_nr(i),
4041                                       offset + shift);
4042                 btrfs_mark_buffer_dirty(buf);
4043         }
4044
4045         /*
4046          * We may have moved things, in which case we want to exit so we don't
4047          * write those changes out.  Once we have proper abort functionality in
4048          * progs this can be changed to something nicer.
4049          */
4050         BUG_ON(ret);
4051         return ret;
4052 }
4053
4054 /*
4055  * Attempt to fix basic block failures.  If we can't fix it for whatever reason
4056  * then just return -EIO.
4057  */
4058 static int try_to_fix_bad_block(struct btrfs_root *root,
4059                                 struct extent_buffer *buf,
4060                                 enum btrfs_tree_block_status status)
4061 {
4062         struct btrfs_trans_handle *trans;
4063         struct ulist *roots;
4064         struct ulist_node *node;
4065         struct btrfs_root *search_root;
4066         struct btrfs_path *path;
4067         struct ulist_iterator iter;
4068         struct btrfs_key root_key, key;
4069         int ret;
4070
4071         if (status != BTRFS_TREE_BLOCK_BAD_KEY_ORDER &&
4072             status != BTRFS_TREE_BLOCK_INVALID_OFFSETS)
4073                 return -EIO;
4074
4075         path = btrfs_alloc_path();
4076         if (!path)
4077                 return -EIO;
4078
4079         ret = btrfs_find_all_roots(NULL, root->fs_info, buf->start,
4080                                    0, &roots);
4081         if (ret) {
4082                 btrfs_free_path(path);
4083                 return -EIO;
4084         }
4085
4086         ULIST_ITER_INIT(&iter);
4087         while ((node = ulist_next(roots, &iter))) {
4088                 root_key.objectid = node->val;
4089                 root_key.type = BTRFS_ROOT_ITEM_KEY;
4090                 root_key.offset = (u64)-1;
4091
4092                 search_root = btrfs_read_fs_root(root->fs_info, &root_key);
4093                 if (IS_ERR(root)) {
4094                         ret = -EIO;
4095                         break;
4096                 }
4097
4098
4099                 trans = btrfs_start_transaction(search_root, 0);
4100                 if (IS_ERR(trans)) {
4101                         ret = PTR_ERR(trans);
4102                         break;
4103                 }
4104
4105                 path->lowest_level = btrfs_header_level(buf);
4106                 path->skip_check_block = 1;
4107                 if (path->lowest_level)
4108                         btrfs_node_key_to_cpu(buf, &key, 0);
4109                 else
4110                         btrfs_item_key_to_cpu(buf, &key, 0);
4111                 ret = btrfs_search_slot(trans, search_root, &key, path, 0, 1);
4112                 if (ret) {
4113                         ret = -EIO;
4114                         btrfs_commit_transaction(trans, search_root);
4115                         break;
4116                 }
4117                 if (status == BTRFS_TREE_BLOCK_BAD_KEY_ORDER)
4118                         ret = fix_key_order(trans, search_root, path);
4119                 else if (status == BTRFS_TREE_BLOCK_INVALID_OFFSETS)
4120                         ret = fix_item_offset(trans, search_root, path);
4121                 if (ret) {
4122                         btrfs_commit_transaction(trans, search_root);
4123                         break;
4124                 }
4125                 btrfs_release_path(path);
4126                 btrfs_commit_transaction(trans, search_root);
4127         }
4128         ulist_free(roots);
4129         btrfs_free_path(path);
4130         return ret;
4131 }
4132
4133 static int check_block(struct btrfs_root *root,
4134                        struct cache_tree *extent_cache,
4135                        struct extent_buffer *buf, u64 flags)
4136 {
4137         struct extent_record *rec;
4138         struct cache_extent *cache;
4139         struct btrfs_key key;
4140         enum btrfs_tree_block_status status;
4141         int ret = 0;
4142         int level;
4143
4144         cache = lookup_cache_extent(extent_cache, buf->start, buf->len);
4145         if (!cache)
4146                 return 1;
4147         rec = container_of(cache, struct extent_record, cache);
4148         rec->generation = btrfs_header_generation(buf);
4149
4150         level = btrfs_header_level(buf);
4151         if (btrfs_header_nritems(buf) > 0) {
4152
4153                 if (level == 0)
4154                         btrfs_item_key_to_cpu(buf, &key, 0);
4155                 else
4156                         btrfs_node_key_to_cpu(buf, &key, 0);
4157
4158                 rec->info_objectid = key.objectid;
4159         }
4160         rec->info_level = level;
4161
4162         if (btrfs_is_leaf(buf))
4163                 status = btrfs_check_leaf(root, &rec->parent_key, buf);
4164         else
4165                 status = btrfs_check_node(root, &rec->parent_key, buf);
4166
4167         if (status != BTRFS_TREE_BLOCK_CLEAN) {
4168                 if (repair)
4169                         status = try_to_fix_bad_block(root, buf, status);
4170                 if (status != BTRFS_TREE_BLOCK_CLEAN) {
4171                         ret = -EIO;
4172                         fprintf(stderr, "bad block %llu\n",
4173                                 (unsigned long long)buf->start);
4174                 } else {
4175                         /*
4176                          * Signal to callers we need to start the scan over
4177                          * again since we'll have cow'ed blocks.
4178                          */
4179                         ret = -EAGAIN;
4180                 }
4181         } else {
4182                 rec->content_checked = 1;
4183                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
4184                         rec->owner_ref_checked = 1;
4185                 else {
4186                         ret = check_owner_ref(root, rec, buf);
4187                         if (!ret)
4188                                 rec->owner_ref_checked = 1;
4189                 }
4190         }
4191         if (!ret)
4192                 maybe_free_extent_rec(extent_cache, rec);
4193         return ret;
4194 }
4195
4196 static struct tree_backref *find_tree_backref(struct extent_record *rec,
4197                                                 u64 parent, u64 root)
4198 {
4199         struct list_head *cur = rec->backrefs.next;
4200         struct extent_backref *node;
4201         struct tree_backref *back;
4202
4203         while(cur != &rec->backrefs) {
4204                 node = list_entry(cur, struct extent_backref, list);
4205                 cur = cur->next;
4206                 if (node->is_data)
4207                         continue;
4208                 back = (struct tree_backref *)node;
4209                 if (parent > 0) {
4210                         if (!node->full_backref)
4211                                 continue;
4212                         if (parent == back->parent)
4213                                 return back;
4214                 } else {
4215                         if (node->full_backref)
4216                                 continue;
4217                         if (back->root == root)
4218                                 return back;
4219                 }
4220         }
4221         return NULL;
4222 }
4223
4224 static struct tree_backref *alloc_tree_backref(struct extent_record *rec,
4225                                                 u64 parent, u64 root)
4226 {
4227         struct tree_backref *ref = malloc(sizeof(*ref));
4228         memset(&ref->node, 0, sizeof(ref->node));
4229         if (parent > 0) {
4230                 ref->parent = parent;
4231                 ref->node.full_backref = 1;
4232         } else {
4233                 ref->root = root;
4234                 ref->node.full_backref = 0;
4235         }
4236         list_add_tail(&ref->node.list, &rec->backrefs);
4237
4238         return ref;
4239 }
4240
4241 static struct data_backref *find_data_backref(struct extent_record *rec,
4242                                                 u64 parent, u64 root,
4243                                                 u64 owner, u64 offset,
4244                                                 int found_ref,
4245                                                 u64 disk_bytenr, u64 bytes)
4246 {
4247         struct list_head *cur = rec->backrefs.next;
4248         struct extent_backref *node;
4249         struct data_backref *back;
4250
4251         while(cur != &rec->backrefs) {
4252                 node = list_entry(cur, struct extent_backref, list);
4253                 cur = cur->next;
4254                 if (!node->is_data)
4255                         continue;
4256                 back = (struct data_backref *)node;
4257                 if (parent > 0) {
4258                         if (!node->full_backref)
4259                                 continue;
4260                         if (parent == back->parent)
4261                                 return back;
4262                 } else {
4263                         if (node->full_backref)
4264                                 continue;
4265                         if (back->root == root && back->owner == owner &&
4266                             back->offset == offset) {
4267                                 if (found_ref && node->found_ref &&
4268                                     (back->bytes != bytes ||
4269                                     back->disk_bytenr != disk_bytenr))
4270                                         continue;
4271                                 return back;
4272                         }
4273                 }
4274         }
4275         return NULL;
4276 }
4277
4278 static struct data_backref *alloc_data_backref(struct extent_record *rec,
4279                                                 u64 parent, u64 root,
4280                                                 u64 owner, u64 offset,
4281                                                 u64 max_size)
4282 {
4283         struct data_backref *ref = malloc(sizeof(*ref));
4284         memset(&ref->node, 0, sizeof(ref->node));
4285         ref->node.is_data = 1;
4286
4287         if (parent > 0) {
4288                 ref->parent = parent;
4289                 ref->owner = 0;
4290                 ref->offset = 0;
4291                 ref->node.full_backref = 1;
4292         } else {
4293                 ref->root = root;
4294                 ref->owner = owner;
4295                 ref->offset = offset;
4296                 ref->node.full_backref = 0;
4297         }
4298         ref->bytes = max_size;
4299         ref->found_ref = 0;
4300         ref->num_refs = 0;
4301         list_add_tail(&ref->node.list, &rec->backrefs);
4302         if (max_size > rec->max_size)
4303                 rec->max_size = max_size;
4304         return ref;
4305 }
4306
4307 static int add_extent_rec(struct cache_tree *extent_cache,
4308                           struct btrfs_key *parent_key, u64 parent_gen,
4309                           u64 start, u64 nr, u64 extent_item_refs,
4310                           int is_root, int inc_ref, int set_checked,
4311                           int metadata, int extent_rec, u64 max_size)
4312 {
4313         struct extent_record *rec;
4314         struct cache_extent *cache;
4315         int ret = 0;
4316         int dup = 0;
4317
4318         cache = lookup_cache_extent(extent_cache, start, nr);
4319         if (cache) {
4320                 rec = container_of(cache, struct extent_record, cache);
4321                 if (inc_ref)
4322                         rec->refs++;
4323                 if (rec->nr == 1)
4324                         rec->nr = max(nr, max_size);
4325
4326                 /*
4327                  * We need to make sure to reset nr to whatever the extent
4328                  * record says was the real size, this way we can compare it to
4329                  * the backrefs.
4330                  */
4331                 if (extent_rec) {
4332                         if (start != rec->start || rec->found_rec) {
4333                                 struct extent_record *tmp;
4334
4335                                 dup = 1;
4336                                 if (list_empty(&rec->list))
4337                                         list_add_tail(&rec->list,
4338                                                       &duplicate_extents);
4339
4340                                 /*
4341                                  * We have to do this song and dance in case we
4342                                  * find an extent record that falls inside of
4343                                  * our current extent record but does not have
4344                                  * the same objectid.
4345                                  */
4346                                 tmp = malloc(sizeof(*tmp));
4347                                 if (!tmp)
4348                                         return -ENOMEM;
4349                                 tmp->start = start;
4350                                 tmp->max_size = max_size;
4351                                 tmp->nr = nr;
4352                                 tmp->found_rec = 1;
4353                                 tmp->metadata = metadata;
4354                                 tmp->extent_item_refs = extent_item_refs;
4355                                 INIT_LIST_HEAD(&tmp->list);
4356                                 list_add_tail(&tmp->list, &rec->dups);
4357                                 rec->num_duplicates++;
4358                         } else {
4359                                 rec->nr = nr;
4360                                 rec->found_rec = 1;
4361                         }
4362                 }
4363
4364                 if (extent_item_refs && !dup) {
4365                         if (rec->extent_item_refs) {
4366                                 fprintf(stderr, "block %llu rec "
4367                                         "extent_item_refs %llu, passed %llu\n",
4368                                         (unsigned long long)start,
4369                                         (unsigned long long)
4370                                                         rec->extent_item_refs,
4371                                         (unsigned long long)extent_item_refs);
4372                         }
4373                         rec->extent_item_refs = extent_item_refs;
4374                 }
4375                 if (is_root)
4376                         rec->is_root = 1;
4377                 if (set_checked) {
4378                         rec->content_checked = 1;
4379                         rec->owner_ref_checked = 1;
4380                 }
4381
4382                 if (parent_key)
4383                         btrfs_cpu_key_to_disk(&rec->parent_key, parent_key);
4384                 if (parent_gen)
4385                         rec->parent_generation = parent_gen;
4386
4387                 if (rec->max_size < max_size)
4388                         rec->max_size = max_size;
4389
4390                 /*
4391                  * A metadata extent can't cross stripe_len boundary, otherwise
4392                  * kernel scrub won't be able to handle it.
4393                  * As now stripe_len is fixed to BTRFS_STRIPE_LEN, just check
4394                  * it.
4395                  */
4396                 if (metadata && check_crossing_stripes(rec->start,
4397                                                        rec->max_size))
4398                                 rec->crossing_stripes = 1;
4399                 maybe_free_extent_rec(extent_cache, rec);
4400                 return ret;
4401         }
4402         rec = malloc(sizeof(*rec));
4403         rec->start = start;
4404         rec->max_size = max_size;
4405         rec->nr = max(nr, max_size);
4406         rec->found_rec = !!extent_rec;
4407         rec->content_checked = 0;
4408         rec->owner_ref_checked = 0;
4409         rec->num_duplicates = 0;
4410         rec->metadata = metadata;
4411         rec->flag_block_full_backref = -1;
4412         rec->bad_full_backref = 0;
4413         INIT_LIST_HEAD(&rec->backrefs);
4414         INIT_LIST_HEAD(&rec->dups);
4415         INIT_LIST_HEAD(&rec->list);
4416
4417         if (is_root)
4418                 rec->is_root = 1;
4419         else
4420                 rec->is_root = 0;
4421
4422         if (inc_ref)
4423                 rec->refs = 1;
4424         else
4425                 rec->refs = 0;
4426
4427         if (extent_item_refs)
4428                 rec->extent_item_refs = extent_item_refs;
4429         else
4430                 rec->extent_item_refs = 0;
4431
4432         if (parent_key)
4433                 btrfs_cpu_key_to_disk(&rec->parent_key, parent_key);
4434         else
4435                 memset(&rec->parent_key, 0, sizeof(*parent_key));
4436
4437         if (parent_gen)
4438                 rec->parent_generation = parent_gen;
4439         else
4440                 rec->parent_generation = 0;
4441
4442         rec->cache.start = start;
4443         rec->cache.size = nr;
4444         ret = insert_cache_extent(extent_cache, &rec->cache);
4445         BUG_ON(ret);
4446         bytes_used += nr;
4447         if (set_checked) {
4448                 rec->content_checked = 1;
4449                 rec->owner_ref_checked = 1;
4450         }
4451
4452         if (metadata)
4453                 if (check_crossing_stripes(rec->start, rec->max_size))
4454                         rec->crossing_stripes = 1;
4455         return ret;
4456 }
4457
4458 static int add_tree_backref(struct cache_tree *extent_cache, u64 bytenr,
4459                             u64 parent, u64 root, int found_ref)
4460 {
4461         struct extent_record *rec;
4462         struct tree_backref *back;
4463         struct cache_extent *cache;
4464
4465         cache = lookup_cache_extent(extent_cache, bytenr, 1);
4466         if (!cache) {
4467                 add_extent_rec(extent_cache, NULL, 0, bytenr,
4468                                1, 0, 0, 0, 0, 1, 0, 0);
4469                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
4470                 if (!cache)
4471                         abort();
4472         }
4473
4474         rec = container_of(cache, struct extent_record, cache);
4475         if (rec->start != bytenr) {
4476                 abort();
4477         }
4478
4479         back = find_tree_backref(rec, parent, root);
4480         if (!back)
4481                 back = alloc_tree_backref(rec, parent, root);
4482
4483         if (found_ref) {
4484                 if (back->node.found_ref) {
4485                         fprintf(stderr, "Extent back ref already exists "
4486                                 "for %llu parent %llu root %llu \n",
4487                                 (unsigned long long)bytenr,
4488                                 (unsigned long long)parent,
4489                                 (unsigned long long)root);
4490                 }
4491                 back->node.found_ref = 1;
4492         } else {
4493                 if (back->node.found_extent_tree) {
4494                         fprintf(stderr, "Extent back ref already exists "
4495                                 "for %llu parent %llu root %llu \n",
4496                                 (unsigned long long)bytenr,
4497                                 (unsigned long long)parent,
4498                                 (unsigned long long)root);
4499                 }
4500                 back->node.found_extent_tree = 1;
4501         }
4502         maybe_free_extent_rec(extent_cache, rec);
4503         return 0;
4504 }
4505
4506 static int add_data_backref(struct cache_tree *extent_cache, u64 bytenr,
4507                             u64 parent, u64 root, u64 owner, u64 offset,
4508                             u32 num_refs, int found_ref, u64 max_size)
4509 {
4510         struct extent_record *rec;
4511         struct data_backref *back;
4512         struct cache_extent *cache;
4513
4514         cache = lookup_cache_extent(extent_cache, bytenr, 1);
4515         if (!cache) {
4516                 add_extent_rec(extent_cache, NULL, 0, bytenr, 1, 0, 0, 0, 0,
4517                                0, 0, max_size);
4518                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
4519                 if (!cache)
4520                         abort();
4521         }
4522
4523         rec = container_of(cache, struct extent_record, cache);
4524         if (rec->max_size < max_size)
4525                 rec->max_size = max_size;
4526
4527         /*
4528          * If found_ref is set then max_size is the real size and must match the
4529          * existing refs.  So if we have already found a ref then we need to
4530          * make sure that this ref matches the existing one, otherwise we need
4531          * to add a new backref so we can notice that the backrefs don't match
4532          * and we need to figure out who is telling the truth.  This is to
4533          * account for that awful fsync bug I introduced where we'd end up with
4534          * a btrfs_file_extent_item that would have its length include multiple
4535          * prealloc extents or point inside of a prealloc extent.
4536          */
4537         back = find_data_backref(rec, parent, root, owner, offset, found_ref,
4538                                  bytenr, max_size);
4539         if (!back)
4540                 back = alloc_data_backref(rec, parent, root, owner, offset,
4541                                           max_size);
4542
4543         if (found_ref) {
4544                 BUG_ON(num_refs != 1);
4545                 if (back->node.found_ref)
4546                         BUG_ON(back->bytes != max_size);
4547                 back->node.found_ref = 1;
4548                 back->found_ref += 1;
4549                 back->bytes = max_size;
4550                 back->disk_bytenr = bytenr;
4551                 rec->refs += 1;
4552                 rec->content_checked = 1;
4553                 rec->owner_ref_checked = 1;
4554         } else {
4555                 if (back->node.found_extent_tree) {
4556                         fprintf(stderr, "Extent back ref already exists "
4557                                 "for %llu parent %llu root %llu "
4558                                 "owner %llu offset %llu num_refs %lu\n",
4559                                 (unsigned long long)bytenr,
4560                                 (unsigned long long)parent,
4561                                 (unsigned long long)root,
4562                                 (unsigned long long)owner,
4563                                 (unsigned long long)offset,
4564                                 (unsigned long)num_refs);
4565                 }
4566                 back->num_refs = num_refs;
4567                 back->node.found_extent_tree = 1;
4568         }
4569         maybe_free_extent_rec(extent_cache, rec);
4570         return 0;
4571 }
4572
4573 static int add_pending(struct cache_tree *pending,
4574                        struct cache_tree *seen, u64 bytenr, u32 size)
4575 {
4576         int ret;
4577         ret = add_cache_extent(seen, bytenr, size);
4578         if (ret)
4579                 return ret;
4580         add_cache_extent(pending, bytenr, size);
4581         return 0;
4582 }
4583
4584 static int pick_next_pending(struct cache_tree *pending,
4585                         struct cache_tree *reada,
4586                         struct cache_tree *nodes,
4587                         u64 last, struct block_info *bits, int bits_nr,
4588                         int *reada_bits)
4589 {
4590         unsigned long node_start = last;
4591         struct cache_extent *cache;
4592         int ret;
4593
4594         cache = search_cache_extent(reada, 0);
4595         if (cache) {
4596                 bits[0].start = cache->start;
4597                 bits[0].size = cache->size;
4598                 *reada_bits = 1;
4599                 return 1;
4600         }
4601         *reada_bits = 0;
4602         if (node_start > 32768)
4603                 node_start -= 32768;
4604
4605         cache = search_cache_extent(nodes, node_start);
4606         if (!cache)
4607                 cache = search_cache_extent(nodes, 0);
4608
4609         if (!cache) {
4610                  cache = search_cache_extent(pending, 0);
4611                  if (!cache)
4612                          return 0;
4613                  ret = 0;
4614                  do {
4615                          bits[ret].start = cache->start;
4616                          bits[ret].size = cache->size;
4617                          cache = next_cache_extent(cache);
4618                          ret++;
4619                  } while (cache && ret < bits_nr);
4620                  return ret;
4621         }
4622
4623         ret = 0;
4624         do {
4625                 bits[ret].start = cache->start;
4626                 bits[ret].size = cache->size;
4627                 cache = next_cache_extent(cache);
4628                 ret++;
4629         } while (cache && ret < bits_nr);
4630
4631         if (bits_nr - ret > 8) {
4632                 u64 lookup = bits[0].start + bits[0].size;
4633                 struct cache_extent *next;
4634                 next = search_cache_extent(pending, lookup);
4635                 while(next) {
4636                         if (next->start - lookup > 32768)
4637                                 break;
4638                         bits[ret].start = next->start;
4639                         bits[ret].size = next->size;
4640                         lookup = next->start + next->size;
4641                         ret++;
4642                         if (ret == bits_nr)
4643                                 break;
4644                         next = next_cache_extent(next);
4645                         if (!next)
4646                                 break;
4647                 }
4648         }
4649         return ret;
4650 }
4651
4652 static void free_chunk_record(struct cache_extent *cache)
4653 {
4654         struct chunk_record *rec;
4655
4656         rec = container_of(cache, struct chunk_record, cache);
4657         list_del_init(&rec->list);
4658         list_del_init(&rec->dextents);
4659         free(rec);
4660 }
4661
4662 void free_chunk_cache_tree(struct cache_tree *chunk_cache)
4663 {
4664         cache_tree_free_extents(chunk_cache, free_chunk_record);
4665 }
4666
4667 static void free_device_record(struct rb_node *node)
4668 {
4669         struct device_record *rec;
4670
4671         rec = container_of(node, struct device_record, node);
4672         free(rec);
4673 }
4674
4675 FREE_RB_BASED_TREE(device_cache, free_device_record);
4676
4677 int insert_block_group_record(struct block_group_tree *tree,
4678                               struct block_group_record *bg_rec)
4679 {
4680         int ret;
4681
4682         ret = insert_cache_extent(&tree->tree, &bg_rec->cache);
4683         if (ret)
4684                 return ret;
4685
4686         list_add_tail(&bg_rec->list, &tree->block_groups);
4687         return 0;
4688 }
4689
4690 static void free_block_group_record(struct cache_extent *cache)
4691 {
4692         struct block_group_record *rec;
4693
4694         rec = container_of(cache, struct block_group_record, cache);
4695         list_del_init(&rec->list);
4696         free(rec);
4697 }
4698
4699 void free_block_group_tree(struct block_group_tree *tree)
4700 {
4701         cache_tree_free_extents(&tree->tree, free_block_group_record);
4702 }
4703
4704 int insert_device_extent_record(struct device_extent_tree *tree,
4705                                 struct device_extent_record *de_rec)
4706 {
4707         int ret;
4708
4709         /*
4710          * Device extent is a bit different from the other extents, because
4711          * the extents which belong to the different devices may have the
4712          * same start and size, so we need use the special extent cache
4713          * search/insert functions.
4714          */
4715         ret = insert_cache_extent2(&tree->tree, &de_rec->cache);
4716         if (ret)
4717                 return ret;
4718
4719         list_add_tail(&de_rec->chunk_list, &tree->no_chunk_orphans);
4720         list_add_tail(&de_rec->device_list, &tree->no_device_orphans);
4721         return 0;
4722 }
4723
4724 static void free_device_extent_record(struct cache_extent *cache)
4725 {
4726         struct device_extent_record *rec;
4727
4728         rec = container_of(cache, struct device_extent_record, cache);
4729         if (!list_empty(&rec->chunk_list))
4730                 list_del_init(&rec->chunk_list);
4731         if (!list_empty(&rec->device_list))
4732                 list_del_init(&rec->device_list);
4733         free(rec);
4734 }
4735
4736 void free_device_extent_tree(struct device_extent_tree *tree)
4737 {
4738         cache_tree_free_extents(&tree->tree, free_device_extent_record);
4739 }
4740
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Handle a v0 extent ref item at @slot of @leaf.
 *
 * A ref whose objectid is below BTRFS_FIRST_FREE_OBJECTID is recorded as a
 * tree backref; anything else is recorded as a data backref carrying the
 * ref count stored in the item.  Always returns 0.
 */
static int process_extent_ref_v0(struct cache_tree *extent_cache,
				 struct extent_buffer *leaf, int slot)
{
	struct btrfs_key key;
	struct btrfs_extent_ref_v0 *ref0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	ref0 = btrfs_item_ptr(leaf, slot, struct btrfs_extent_ref_v0);

	if (btrfs_ref_objectid_v0(leaf, ref0) >= BTRFS_FIRST_FREE_OBJECTID)
		add_data_backref(extent_cache, key.objectid, key.offset, 0,
				 0, 0, btrfs_ref_count_v0(leaf, ref0), 0, 0);
	else
		add_tree_backref(extent_cache, key.objectid, key.offset, 0, 0);

	return 0;
}
#endif
4759
4760 struct chunk_record *btrfs_new_chunk_record(struct extent_buffer *leaf,
4761                                             struct btrfs_key *key,
4762                                             int slot)
4763 {
4764         struct btrfs_chunk *ptr;
4765         struct chunk_record *rec;
4766         int num_stripes, i;
4767
4768         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4769         num_stripes = btrfs_chunk_num_stripes(leaf, ptr);
4770
4771         rec = malloc(btrfs_chunk_record_size(num_stripes));
4772         if (!rec) {
4773                 fprintf(stderr, "memory allocation failed\n");
4774                 exit(-1);
4775         }
4776
4777         memset(rec, 0, btrfs_chunk_record_size(num_stripes));
4778
4779         INIT_LIST_HEAD(&rec->list);
4780         INIT_LIST_HEAD(&rec->dextents);
4781         rec->bg_rec = NULL;
4782
4783         rec->cache.start = key->offset;
4784         rec->cache.size = btrfs_chunk_length(leaf, ptr);
4785
4786         rec->generation = btrfs_header_generation(leaf);
4787
4788         rec->objectid = key->objectid;
4789         rec->type = key->type;
4790         rec->offset = key->offset;
4791
4792         rec->length = rec->cache.size;
4793         rec->owner = btrfs_chunk_owner(leaf, ptr);
4794         rec->stripe_len = btrfs_chunk_stripe_len(leaf, ptr);
4795         rec->type_flags = btrfs_chunk_type(leaf, ptr);
4796         rec->io_width = btrfs_chunk_io_width(leaf, ptr);
4797         rec->io_align = btrfs_chunk_io_align(leaf, ptr);
4798         rec->sector_size = btrfs_chunk_sector_size(leaf, ptr);
4799         rec->num_stripes = num_stripes;
4800         rec->sub_stripes = btrfs_chunk_sub_stripes(leaf, ptr);
4801
4802         for (i = 0; i < rec->num_stripes; ++i) {
4803                 rec->stripes[i].devid =
4804                         btrfs_stripe_devid_nr(leaf, ptr, i);
4805                 rec->stripes[i].offset =
4806                         btrfs_stripe_offset_nr(leaf, ptr, i);
4807                 read_extent_buffer(leaf, rec->stripes[i].dev_uuid,
4808                                 (unsigned long)btrfs_stripe_dev_uuid_nr(ptr, i),
4809                                 BTRFS_UUID_SIZE);
4810         }
4811
4812         return rec;
4813 }
4814
4815 static int process_chunk_item(struct cache_tree *chunk_cache,
4816                               struct btrfs_key *key, struct extent_buffer *eb,
4817                               int slot)
4818 {
4819         struct chunk_record *rec;
4820         int ret = 0;
4821
4822         rec = btrfs_new_chunk_record(eb, key, slot);
4823         ret = insert_cache_extent(chunk_cache, &rec->cache);
4824         if (ret) {
4825                 fprintf(stderr, "Chunk[%llu, %llu] existed.\n",
4826                         rec->offset, rec->length);
4827                 free(rec);
4828         }
4829
4830         return ret;
4831 }
4832
4833 static int process_device_item(struct rb_root *dev_cache,
4834                 struct btrfs_key *key, struct extent_buffer *eb, int slot)
4835 {
4836         struct btrfs_dev_item *ptr;
4837         struct device_record *rec;
4838         int ret = 0;
4839
4840         ptr = btrfs_item_ptr(eb,
4841                 slot, struct btrfs_dev_item);
4842
4843         rec = malloc(sizeof(*rec));
4844         if (!rec) {
4845                 fprintf(stderr, "memory allocation failed\n");
4846                 return -ENOMEM;
4847         }
4848
4849         rec->devid = key->offset;
4850         rec->generation = btrfs_header_generation(eb);
4851
4852         rec->objectid = key->objectid;
4853         rec->type = key->type;
4854         rec->offset = key->offset;
4855
4856         rec->devid = btrfs_device_id(eb, ptr);
4857         rec->total_byte = btrfs_device_total_bytes(eb, ptr);
4858         rec->byte_used = btrfs_device_bytes_used(eb, ptr);
4859
4860         ret = rb_insert(dev_cache, &rec->node, device_record_compare);
4861         if (ret) {
4862                 fprintf(stderr, "Device[%llu] existed.\n", rec->devid);
4863                 free(rec);
4864         }
4865
4866         return ret;
4867 }
4868
4869 struct block_group_record *
4870 btrfs_new_block_group_record(struct extent_buffer *leaf, struct btrfs_key *key,
4871                              int slot)
4872 {
4873         struct btrfs_block_group_item *ptr;
4874         struct block_group_record *rec;
4875
4876         rec = malloc(sizeof(*rec));
4877         if (!rec) {
4878                 fprintf(stderr, "memory allocation failed\n");
4879                 exit(-1);
4880         }
4881         memset(rec, 0, sizeof(*rec));
4882
4883         rec->cache.start = key->objectid;
4884         rec->cache.size = key->offset;
4885
4886         rec->generation = btrfs_header_generation(leaf);
4887
4888         rec->objectid = key->objectid;
4889         rec->type = key->type;
4890         rec->offset = key->offset;
4891
4892         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_block_group_item);
4893         rec->flags = btrfs_disk_block_group_flags(leaf, ptr);
4894
4895         INIT_LIST_HEAD(&rec->list);
4896
4897         return rec;
4898 }
4899
4900 static int process_block_group_item(struct block_group_tree *block_group_cache,
4901                                     struct btrfs_key *key,
4902                                     struct extent_buffer *eb, int slot)
4903 {
4904         struct block_group_record *rec;
4905         int ret = 0;
4906
4907         rec = btrfs_new_block_group_record(eb, key, slot);
4908         ret = insert_block_group_record(block_group_cache, rec);
4909         if (ret) {
4910                 fprintf(stderr, "Block Group[%llu, %llu] existed.\n",
4911                         rec->objectid, rec->offset);
4912                 free(rec);
4913         }
4914
4915         return ret;
4916 }
4917
4918 struct device_extent_record *
4919 btrfs_new_device_extent_record(struct extent_buffer *leaf,
4920                                struct btrfs_key *key, int slot)
4921 {
4922         struct device_extent_record *rec;
4923         struct btrfs_dev_extent *ptr;
4924
4925         rec = malloc(sizeof(*rec));
4926         if (!rec) {
4927                 fprintf(stderr, "memory allocation failed\n");
4928                 exit(-1);
4929         }
4930         memset(rec, 0, sizeof(*rec));
4931
4932         rec->cache.objectid = key->objectid;
4933         rec->cache.start = key->offset;
4934
4935         rec->generation = btrfs_header_generation(leaf);
4936
4937         rec->objectid = key->objectid;
4938         rec->type = key->type;
4939         rec->offset = key->offset;
4940
4941         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
4942         rec->chunk_objecteid =
4943                 btrfs_dev_extent_chunk_objectid(leaf, ptr);
4944         rec->chunk_offset =
4945                 btrfs_dev_extent_chunk_offset(leaf, ptr);
4946         rec->length = btrfs_dev_extent_length(leaf, ptr);
4947         rec->cache.size = rec->length;
4948
4949         INIT_LIST_HEAD(&rec->chunk_list);
4950         INIT_LIST_HEAD(&rec->device_list);
4951
4952         return rec;
4953 }
4954
4955 static int
4956 process_device_extent_item(struct device_extent_tree *dev_extent_cache,
4957                            struct btrfs_key *key, struct extent_buffer *eb,
4958                            int slot)
4959 {
4960         struct device_extent_record *rec;
4961         int ret;
4962
4963         rec = btrfs_new_device_extent_record(eb, key, slot);
4964         ret = insert_device_extent_record(dev_extent_cache, rec);
4965         if (ret) {
4966                 fprintf(stderr,
4967                         "Device extent[%llu, %llu, %llu] existed.\n",
4968                         rec->objectid, rec->offset, rec->length);
4969                 free(rec);
4970         }
4971
4972         return ret;
4973 }
4974
4975 static int process_extent_item(struct btrfs_root *root,
4976                                struct cache_tree *extent_cache,
4977                                struct extent_buffer *eb, int slot)
4978 {
4979         struct btrfs_extent_item *ei;
4980         struct btrfs_extent_inline_ref *iref;
4981         struct btrfs_extent_data_ref *dref;
4982         struct btrfs_shared_data_ref *sref;
4983         struct btrfs_key key;
4984         unsigned long end;
4985         unsigned long ptr;
4986         int type;
4987         u32 item_size = btrfs_item_size_nr(eb, slot);
4988         u64 refs = 0;
4989         u64 offset;
4990         u64 num_bytes;
4991         int metadata = 0;
4992
4993         btrfs_item_key_to_cpu(eb, &key, slot);
4994
4995         if (key.type == BTRFS_METADATA_ITEM_KEY) {
4996                 metadata = 1;
4997                 num_bytes = root->leafsize;
4998         } else {
4999                 num_bytes = key.offset;
5000         }
5001
5002         if (item_size < sizeof(*ei)) {
5003 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5004                 struct btrfs_extent_item_v0 *ei0;
5005                 BUG_ON(item_size != sizeof(*ei0));
5006                 ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
5007                 refs = btrfs_extent_refs_v0(eb, ei0);
5008 #else
5009                 BUG();
5010 #endif
5011                 return add_extent_rec(extent_cache, NULL, 0, key.objectid,
5012                                       num_bytes, refs, 0, 0, 0, metadata, 1,
5013                                       num_bytes);
5014         }
5015
5016         ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
5017         refs = btrfs_extent_refs(eb, ei);
5018
5019         add_extent_rec(extent_cache, NULL, 0, key.objectid, num_bytes,
5020                        refs, 0, 0, 0, metadata, 1, num_bytes);
5021
5022         ptr = (unsigned long)(ei + 1);
5023         if (btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK &&
5024             key.type == BTRFS_EXTENT_ITEM_KEY)
5025                 ptr += sizeof(struct btrfs_tree_block_info);
5026
5027         end = (unsigned long)ei + item_size;
5028         while (ptr < end) {
5029                 iref = (struct btrfs_extent_inline_ref *)ptr;
5030                 type = btrfs_extent_inline_ref_type(eb, iref);
5031                 offset = btrfs_extent_inline_ref_offset(eb, iref);
5032                 switch (type) {
5033                 case BTRFS_TREE_BLOCK_REF_KEY:
5034                         add_tree_backref(extent_cache, key.objectid,
5035                                          0, offset, 0);
5036                         break;
5037                 case BTRFS_SHARED_BLOCK_REF_KEY:
5038                         add_tree_backref(extent_cache, key.objectid,
5039                                          offset, 0, 0);
5040                         break;
5041                 case BTRFS_EXTENT_DATA_REF_KEY:
5042                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
5043                         add_data_backref(extent_cache, key.objectid, 0,
5044                                         btrfs_extent_data_ref_root(eb, dref),
5045                                         btrfs_extent_data_ref_objectid(eb,
5046                                                                        dref),
5047                                         btrfs_extent_data_ref_offset(eb, dref),
5048                                         btrfs_extent_data_ref_count(eb, dref),
5049                                         0, num_bytes);
5050                         break;
5051                 case BTRFS_SHARED_DATA_REF_KEY:
5052                         sref = (struct btrfs_shared_data_ref *)(iref + 1);
5053                         add_data_backref(extent_cache, key.objectid, offset,
5054                                         0, 0, 0,
5055                                         btrfs_shared_data_ref_count(eb, sref),
5056                                         0, num_bytes);
5057                         break;
5058                 default:
5059                         fprintf(stderr, "corrupt extent record: key %Lu %u %Lu\n",
5060                                 key.objectid, key.type, num_bytes);
5061                         goto out;
5062                 }
5063                 ptr += btrfs_extent_inline_ref_size(type);
5064         }
5065         WARN_ON(ptr > end);
5066 out:
5067         return 0;
5068 }
5069
5070 static int check_cache_range(struct btrfs_root *root,
5071                              struct btrfs_block_group_cache *cache,
5072                              u64 offset, u64 bytes)
5073 {
5074         struct btrfs_free_space *entry;
5075         u64 *logical;
5076         u64 bytenr;
5077         int stripe_len;
5078         int i, nr, ret;
5079
5080         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
5081                 bytenr = btrfs_sb_offset(i);
5082                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
5083                                        cache->key.objectid, bytenr, 0,
5084                                        &logical, &nr, &stripe_len);
5085                 if (ret)
5086                         return ret;
5087
5088                 while (nr--) {
5089                         if (logical[nr] + stripe_len <= offset)
5090                                 continue;
5091                         if (offset + bytes <= logical[nr])
5092                                 continue;
5093                         if (logical[nr] == offset) {
5094                                 if (stripe_len >= bytes) {
5095                                         kfree(logical);
5096                                         return 0;
5097                                 }
5098                                 bytes -= stripe_len;
5099                                 offset += stripe_len;
5100                         } else if (logical[nr] < offset) {
5101                                 if (logical[nr] + stripe_len >=
5102                                     offset + bytes) {
5103                                         kfree(logical);
5104                                         return 0;
5105                                 }
5106                                 bytes = (offset + bytes) -
5107                                         (logical[nr] + stripe_len);
5108                                 offset = logical[nr] + stripe_len;
5109                         } else {
5110                                 /*
5111                                  * Could be tricky, the super may land in the
5112                                  * middle of the area we're checking.  First
5113                                  * check the easiest case, it's at the end.
5114                                  */
5115                                 if (logical[nr] + stripe_len >=
5116                                     bytes + offset) {
5117                                         bytes = logical[nr] - offset;
5118                                         continue;
5119                                 }
5120
5121                                 /* Check the left side */
5122                                 ret = check_cache_range(root, cache,
5123                                                         offset,
5124                                                         logical[nr] - offset);
5125                                 if (ret) {
5126                                         kfree(logical);
5127                                         return ret;
5128                                 }
5129
5130                                 /* Now we continue with the right side */
5131                                 bytes = (offset + bytes) -
5132                                         (logical[nr] + stripe_len);
5133                                 offset = logical[nr] + stripe_len;
5134                         }
5135                 }
5136
5137                 kfree(logical);
5138         }
5139
5140         entry = btrfs_find_free_space(cache->free_space_ctl, offset, bytes);
5141         if (!entry) {
5142                 fprintf(stderr, "There is no free space entry for %Lu-%Lu\n",
5143                         offset, offset+bytes);
5144                 return -EINVAL;
5145         }
5146
5147         if (entry->offset != offset) {
5148                 fprintf(stderr, "Wanted offset %Lu, found %Lu\n", offset,
5149                         entry->offset);
5150                 return -EINVAL;
5151         }
5152
5153         if (entry->bytes != bytes) {
5154                 fprintf(stderr, "Wanted bytes %Lu, found %Lu for off %Lu\n",
5155                         bytes, entry->bytes, offset);
5156                 return -EINVAL;
5157         }
5158
5159         unlink_free_space(cache->free_space_ctl, entry);
5160         free(entry);
5161         return 0;
5162 }
5163
5164 static int verify_space_cache(struct btrfs_root *root,
5165                               struct btrfs_block_group_cache *cache)
5166 {
5167         struct btrfs_path *path;
5168         struct extent_buffer *leaf;
5169         struct btrfs_key key;
5170         u64 last;
5171         int ret = 0;
5172
5173         path = btrfs_alloc_path();
5174         if (!path)
5175                 return -ENOMEM;
5176
5177         root = root->fs_info->extent_root;
5178
5179         last = max_t(u64, cache->key.objectid, BTRFS_SUPER_INFO_OFFSET);
5180
5181         key.objectid = last;
5182         key.offset = 0;
5183         key.type = BTRFS_EXTENT_ITEM_KEY;
5184
5185         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5186         if (ret < 0)
5187                 goto out;
5188         ret = 0;
5189         while (1) {
5190                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5191                         ret = btrfs_next_leaf(root, path);
5192                         if (ret < 0)
5193                                 goto out;
5194                         if (ret > 0) {
5195                                 ret = 0;
5196                                 break;
5197                         }
5198                 }
5199                 leaf = path->nodes[0];
5200                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5201                 if (key.objectid >= cache->key.offset + cache->key.objectid)
5202                         break;
5203                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
5204                     key.type != BTRFS_METADATA_ITEM_KEY) {
5205                         path->slots[0]++;
5206                         continue;
5207                 }
5208
5209                 if (last == key.objectid) {
5210                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
5211                                 last = key.objectid + key.offset;
5212                         else
5213                                 last = key.objectid + root->leafsize;
5214                         path->slots[0]++;
5215                         continue;
5216                 }
5217
5218                 ret = check_cache_range(root, cache, last,
5219                                         key.objectid - last);
5220                 if (ret)
5221                         break;
5222                 if (key.type == BTRFS_EXTENT_ITEM_KEY)
5223                         last = key.objectid + key.offset;
5224                 else
5225                         last = key.objectid + root->leafsize;
5226                 path->slots[0]++;
5227         }
5228
5229         if (last < cache->key.objectid + cache->key.offset)
5230                 ret = check_cache_range(root, cache, last,
5231                                         cache->key.objectid +
5232                                         cache->key.offset - last);
5233
5234 out:
5235         btrfs_free_path(path);
5236
5237         if (!ret &&
5238             !RB_EMPTY_ROOT(&cache->free_space_ctl->free_space_offset)) {
5239                 fprintf(stderr, "There are still entries left in the space "
5240                         "cache\n");
5241                 ret = -EINVAL;
5242         }
5243
5244         return ret;
5245 }
5246
5247 static int check_space_cache(struct btrfs_root *root)
5248 {
5249         struct btrfs_block_group_cache *cache;
5250         u64 start = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
5251         int ret;
5252         int error = 0;
5253
5254         if (btrfs_super_cache_generation(root->fs_info->super_copy) != -1ULL &&
5255             btrfs_super_generation(root->fs_info->super_copy) !=
5256             btrfs_super_cache_generation(root->fs_info->super_copy)) {
5257                 printf("cache and super generation don't match, space cache "
5258                        "will be invalidated\n");
5259                 return 0;
5260         }
5261
5262         while (1) {
5263                 cache = btrfs_lookup_first_block_group(root->fs_info, start);
5264                 if (!cache)
5265                         break;
5266
5267                 start = cache->key.objectid + cache->key.offset;
5268                 if (!cache->free_space_ctl) {
5269                         if (btrfs_init_free_space_ctl(cache,
5270                                                       root->sectorsize)) {
5271                                 ret = -ENOMEM;
5272                                 break;
5273                         }
5274                 } else {
5275                         btrfs_remove_free_space_cache(cache);
5276                 }
5277
5278                 ret = load_free_space_cache(root->fs_info, cache);
5279                 if (!ret)
5280                         continue;
5281
5282                 ret = verify_space_cache(root, cache);
5283                 if (ret) {
5284                         fprintf(stderr, "cache appears valid but isnt %Lu\n",
5285                                 cache->key.objectid);
5286                         error++;
5287                 }
5288         }
5289
5290         return error ? -EINVAL : 0;
5291 }
5292
5293 static int check_extent_csums(struct btrfs_root *root, u64 bytenr,
5294                         u64 num_bytes, unsigned long leaf_offset,
5295                         struct extent_buffer *eb) {
5296
5297         u64 offset = 0;
5298         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
5299         char *data;
5300         unsigned long csum_offset;
5301         u32 csum;
5302         u32 csum_expected;
5303         u64 read_len;
5304         u64 data_checked = 0;
5305         u64 tmp;
5306         int ret = 0;
5307         int mirror;
5308         int num_copies;
5309
5310         if (num_bytes % root->sectorsize)
5311                 return -EINVAL;
5312
5313         data = malloc(num_bytes);
5314         if (!data)
5315                 return -ENOMEM;
5316
5317         while (offset < num_bytes) {
5318                 mirror = 0;
5319 again:
5320                 read_len = num_bytes - offset;
5321                 /* read as much space once a time */
5322                 ret = read_extent_data(root, data + offset,
5323                                 bytenr + offset, &read_len, mirror);
5324                 if (ret)
5325                         goto out;
5326                 data_checked = 0;
5327                 /* verify every 4k data's checksum */
5328                 while (data_checked < read_len) {
5329                         csum = ~(u32)0;
5330                         tmp = offset + data_checked;
5331
5332                         csum = btrfs_csum_data(NULL, (char *)data + tmp,
5333                                                csum, root->sectorsize);
5334                         btrfs_csum_final(csum, (char *)&csum);
5335
5336                         csum_offset = leaf_offset +
5337                                  tmp / root->sectorsize * csum_size;
5338                         read_extent_buffer(eb, (char *)&csum_expected,
5339                                            csum_offset, csum_size);
5340                         /* try another mirror */
5341                         if (csum != csum_expected) {
5342                                 fprintf(stderr, "mirror %d bytenr %llu csum %u expected csum %u\n",
5343                                                 mirror, bytenr + tmp,
5344                                                 csum, csum_expected);
5345                                 num_copies = btrfs_num_copies(
5346                                                 &root->fs_info->mapping_tree,
5347                                                 bytenr, num_bytes);
5348                                 if (mirror < num_copies - 1) {
5349                                         mirror += 1;
5350                                         goto again;
5351                                 }
5352                         }
5353                         data_checked += root->sectorsize;
5354                 }
5355                 offset += read_len;
5356         }
5357 out:
5358         free(data);
5359         return ret;
5360 }
5361
5362 static int check_extent_exists(struct btrfs_root *root, u64 bytenr,
5363                                u64 num_bytes)
5364 {
5365         struct btrfs_path *path;
5366         struct extent_buffer *leaf;
5367         struct btrfs_key key;
5368         int ret;
5369
5370         path = btrfs_alloc_path();
5371         if (!path) {
5372                 fprintf(stderr, "Error allocing path\n");
5373                 return -ENOMEM;
5374         }
5375
5376         key.objectid = bytenr;
5377         key.type = BTRFS_EXTENT_ITEM_KEY;
5378         key.offset = (u64)-1;
5379
5380 again:
5381         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
5382                                 0, 0);
5383         if (ret < 0) {
5384                 fprintf(stderr, "Error looking up extent record %d\n", ret);
5385                 btrfs_free_path(path);
5386                 return ret;
5387         } else if (ret) {
5388                 if (path->slots[0] > 0) {
5389                         path->slots[0]--;
5390                 } else {
5391                         ret = btrfs_prev_leaf(root, path);
5392                         if (ret < 0) {
5393                                 goto out;
5394                         } else if (ret > 0) {
5395                                 ret = 0;
5396                                 goto out;
5397                         }
5398                 }
5399         }
5400
5401         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
5402
5403         /*
5404          * Block group items come before extent items if they have the same
5405          * bytenr, so walk back one more just in case.  Dear future traveler,
5406          * first congrats on mastering time travel.  Now if it's not too much
5407          * trouble could you go back to 2006 and tell Chris to make the
5408          * BLOCK_GROUP_ITEM_KEY (and BTRFS_*_REF_KEY) lower than the
5409          * EXTENT_ITEM_KEY please?
5410          */
5411         while (key.type > BTRFS_EXTENT_ITEM_KEY) {
5412                 if (path->slots[0] > 0) {
5413                         path->slots[0]--;
5414                 } else {
5415                         ret = btrfs_prev_leaf(root, path);
5416                         if (ret < 0) {
5417                                 goto out;
5418                         } else if (ret > 0) {
5419                                 ret = 0;
5420                                 goto out;
5421                         }
5422                 }
5423                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
5424         }
5425
5426         while (num_bytes) {
5427                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5428                         ret = btrfs_next_leaf(root, path);
5429                         if (ret < 0) {
5430                                 fprintf(stderr, "Error going to next leaf "
5431                                         "%d\n", ret);
5432                                 btrfs_free_path(path);
5433                                 return ret;
5434                         } else if (ret) {
5435                                 break;
5436                         }
5437                 }
5438                 leaf = path->nodes[0];
5439                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5440                 if (key.type != BTRFS_EXTENT_ITEM_KEY) {
5441                         path->slots[0]++;
5442                         continue;
5443                 }
5444                 if (key.objectid + key.offset < bytenr) {
5445                         path->slots[0]++;
5446                         continue;
5447                 }
5448                 if (key.objectid > bytenr + num_bytes)
5449                         break;
5450
5451                 if (key.objectid == bytenr) {
5452                         if (key.offset >= num_bytes) {
5453                                 num_bytes = 0;
5454                                 break;
5455                         }
5456                         num_bytes -= key.offset;
5457                         bytenr += key.offset;
5458                 } else if (key.objectid < bytenr) {
5459                         if (key.objectid + key.offset >= bytenr + num_bytes) {
5460                                 num_bytes = 0;
5461                                 break;
5462                         }
5463                         num_bytes = (bytenr + num_bytes) -
5464                                 (key.objectid + key.offset);
5465                         bytenr = key.objectid + key.offset;
5466                 } else {
5467                         if (key.objectid + key.offset < bytenr + num_bytes) {
5468                                 u64 new_start = key.objectid + key.offset;
5469                                 u64 new_bytes = bytenr + num_bytes - new_start;
5470
5471                                 /*
5472                                  * Weird case, the extent is in the middle of
5473                                  * our range, we'll have to search one side
5474                                  * and then the other.  Not sure if this happens
5475                                  * in real life, but no harm in coding it up
5476                                  * anyway just in case.
5477                                  */
5478                                 btrfs_release_path(path);
5479                                 ret = check_extent_exists(root, new_start,
5480                                                           new_bytes);
5481                                 if (ret) {
5482                                         fprintf(stderr, "Right section didn't "
5483                                                 "have a record\n");
5484                                         break;
5485                                 }
5486                                 num_bytes = key.objectid - bytenr;
5487                                 goto again;
5488                         }
5489                         num_bytes = key.objectid - bytenr;
5490                 }
5491                 path->slots[0]++;
5492         }
5493         ret = 0;
5494
5495 out:
5496         if (num_bytes && !ret) {
5497                 fprintf(stderr, "There are no extents for csum range "
5498                         "%Lu-%Lu\n", bytenr, bytenr+num_bytes);
5499                 ret = 1;
5500         }
5501
5502         btrfs_free_path(path);
5503         return ret;
5504 }
5505
5506 static int check_csums(struct btrfs_root *root)
5507 {
5508         struct btrfs_path *path;
5509         struct extent_buffer *leaf;
5510         struct btrfs_key key;
5511         u64 offset = 0, num_bytes = 0;
5512         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
5513         int errors = 0;
5514         int ret;
5515         u64 data_len;
5516         unsigned long leaf_offset;
5517
5518         root = root->fs_info->csum_root;
5519         if (!extent_buffer_uptodate(root->node)) {
5520                 fprintf(stderr, "No valid csum tree found\n");
5521                 return -ENOENT;
5522         }
5523
5524         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
5525         key.type = BTRFS_EXTENT_CSUM_KEY;
5526         key.offset = 0;
5527
5528         path = btrfs_alloc_path();
5529         if (!path)
5530                 return -ENOMEM;
5531
5532         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5533         if (ret < 0) {
5534                 fprintf(stderr, "Error searching csum tree %d\n", ret);
5535                 btrfs_free_path(path);
5536                 return ret;
5537         }
5538
5539         if (ret > 0 && path->slots[0])
5540                 path->slots[0]--;
5541         ret = 0;
5542
5543         while (1) {
5544                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5545                         ret = btrfs_next_leaf(root, path);
5546                         if (ret < 0) {
5547                                 fprintf(stderr, "Error going to next leaf "
5548                                         "%d\n", ret);
5549                                 break;
5550                         }
5551                         if (ret)
5552                                 break;
5553                 }
5554                 leaf = path->nodes[0];
5555
5556                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5557                 if (key.type != BTRFS_EXTENT_CSUM_KEY) {
5558                         path->slots[0]++;
5559                         continue;
5560                 }
5561
5562                 data_len = (btrfs_item_size_nr(leaf, path->slots[0]) /
5563                               csum_size) * root->sectorsize;
5564                 if (!check_data_csum)
5565                         goto skip_csum_check;
5566                 leaf_offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
5567                 ret = check_extent_csums(root, key.offset, data_len,
5568                                          leaf_offset, leaf);
5569                 if (ret)
5570                         break;
5571 skip_csum_check:
5572                 if (!num_bytes) {
5573                         offset = key.offset;
5574                 } else if (key.offset != offset + num_bytes) {
5575                         ret = check_extent_exists(root, offset, num_bytes);
5576                         if (ret) {
5577                                 fprintf(stderr, "Csum exists for %Lu-%Lu but "
5578                                         "there is no extent record\n",
5579                                         offset, offset+num_bytes);
5580                                 errors++;
5581                         }
5582                         offset = key.offset;
5583                         num_bytes = 0;
5584                 }
5585                 num_bytes += data_len;
5586                 path->slots[0]++;
5587         }
5588
5589         btrfs_free_path(path);
5590         return errors;
5591 }
5592
5593 static int is_dropped_key(struct btrfs_key *key,
5594                           struct btrfs_key *drop_key) {
5595         if (key->objectid < drop_key->objectid)
5596                 return 1;
5597         else if (key->objectid == drop_key->objectid) {
5598                 if (key->type < drop_key->type)
5599                         return 1;
5600                 else if (key->type == drop_key->type) {
5601                         if (key->offset < drop_key->offset)
5602                                 return 1;
5603                 }
5604         }
5605         return 0;
5606 }
5607
5608 /*
5609  * Here are the rules for FULL_BACKREF.
5610  *
5611  * 1) If BTRFS_HEADER_FLAG_RELOC is set then we have FULL_BACKREF set.
5612  * 2) If btrfs_header_owner(buf) no longer points to buf then we have
5613  *      FULL_BACKREF set.
5614  * 3) We cow'ed the block walking down a reloc tree.  This is impossible to tell
5615  *    if it happened after the relocation occurred since we'll have dropped the
5616  *    reloc root, so it's entirely possible to have FULL_BACKREF set on buf and
5617  *    have no real way to know for sure.
5618  *
5619  * We process the blocks one root at a time, and we start from the lowest root
5620  * objectid and go to the highest.  So we can just lookup the owner backref for
5621  * the record and if we don't find it then we know it doesn't exist and we have
5622  * a FULL BACKREF.
5623  *
5624  * FIXME: if we ever start reclaiming root objectid's then we need to fix this
5625  * assumption and simply indicate that we _think_ that the FULL BACKREF needs to
5626  * be set or not and then we can check later once we've gathered all the refs.
5627  */
5628 static int calc_extent_flag(struct btrfs_root *root,
5629                            struct cache_tree *extent_cache,
5630                            struct extent_buffer *buf,
5631                            struct root_item_record *ri,
5632                            u64 *flags)
5633 {
5634         struct extent_record *rec;
5635         struct cache_extent *cache;
5636         struct tree_backref *tback;
5637         u64 owner = 0;
5638
5639         cache = lookup_cache_extent(extent_cache, buf->start, 1);
5640         /* we have added this extent before */
5641         BUG_ON(!cache);
5642         rec = container_of(cache, struct extent_record, cache);
5643
5644         /*
5645          * Except file/reloc tree, we can not have
5646          * FULL BACKREF MODE
5647          */
5648         if (ri->objectid < BTRFS_FIRST_FREE_OBJECTID)
5649                 goto normal;
5650         /*
5651          * root node
5652          */
5653         if (buf->start == ri->bytenr)
5654                 goto normal;
5655
5656         if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
5657                 goto full_backref;
5658
5659         owner = btrfs_header_owner(buf);
5660         if (owner == ri->objectid)
5661                 goto normal;
5662
5663         tback = find_tree_backref(rec, 0, owner);
5664         if (!tback)
5665                 goto full_backref;
5666 normal:
5667         *flags = 0;
5668         if (rec->flag_block_full_backref != -1 &&
5669             rec->flag_block_full_backref != 0)
5670                 rec->bad_full_backref = 1;
5671         return 0;
5672 full_backref:
5673         *flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5674         if (rec->flag_block_full_backref != -1 &&
5675             rec->flag_block_full_backref != 1)
5676                 rec->bad_full_backref = 1;
5677         return 0;
5678 }
5679
5680 static int run_next_block(struct btrfs_root *root,
5681                           struct block_info *bits,
5682                           int bits_nr,
5683                           u64 *last,
5684                           struct cache_tree *pending,
5685                           struct cache_tree *seen,
5686                           struct cache_tree *reada,
5687                           struct cache_tree *nodes,
5688                           struct cache_tree *extent_cache,
5689                           struct cache_tree *chunk_cache,
5690                           struct rb_root *dev_cache,
5691                           struct block_group_tree *block_group_cache,
5692                           struct device_extent_tree *dev_extent_cache,
5693                           struct root_item_record *ri)
5694 {
5695         struct extent_buffer *buf;
5696         struct extent_record *rec = NULL;
5697         u64 bytenr;
5698         u32 size;
5699         u64 parent;
5700         u64 owner;
5701         u64 flags;
5702         u64 ptr;
5703         u64 gen = 0;
5704         int ret = 0;
5705         int i;
5706         int nritems;
5707         struct btrfs_key key;
5708         struct cache_extent *cache;
5709         int reada_bits;
5710
5711         nritems = pick_next_pending(pending, reada, nodes, *last, bits,
5712                                     bits_nr, &reada_bits);
5713         if (nritems == 0)
5714                 return 1;
5715
5716         if (!reada_bits) {
5717                 for(i = 0; i < nritems; i++) {
5718                         ret = add_cache_extent(reada, bits[i].start,
5719                                                bits[i].size);
5720                         if (ret == -EEXIST)
5721                                 continue;
5722
5723                         /* fixme, get the parent transid */
5724                         readahead_tree_block(root, bits[i].start,
5725                                              bits[i].size, 0);
5726                 }
5727         }
5728         *last = bits[0].start;
5729         bytenr = bits[0].start;
5730         size = bits[0].size;
5731
5732         cache = lookup_cache_extent(pending, bytenr, size);
5733         if (cache) {
5734                 remove_cache_extent(pending, cache);
5735                 free(cache);
5736         }
5737         cache = lookup_cache_extent(reada, bytenr, size);
5738         if (cache) {
5739                 remove_cache_extent(reada, cache);
5740                 free(cache);
5741         }
5742         cache = lookup_cache_extent(nodes, bytenr, size);
5743         if (cache) {
5744                 remove_cache_extent(nodes, cache);
5745                 free(cache);
5746         }
5747         cache = lookup_cache_extent(extent_cache, bytenr, size);
5748         if (cache) {
5749                 rec = container_of(cache, struct extent_record, cache);
5750                 gen = rec->parent_generation;
5751         }
5752
5753         /* fixme, get the real parent transid */
5754         buf = read_tree_block(root, bytenr, size, gen);
5755         if (!extent_buffer_uptodate(buf)) {
5756                 record_bad_block_io(root->fs_info,
5757                                     extent_cache, bytenr, size);
5758                 goto out;
5759         }
5760
5761         nritems = btrfs_header_nritems(buf);
5762
5763         flags = 0;
5764         if (!init_extent_tree) {
5765                 ret = btrfs_lookup_extent_info(NULL, root, bytenr,
5766                                        btrfs_header_level(buf), 1, NULL,
5767                                        &flags);
5768                 if (ret < 0) {
5769                         ret = calc_extent_flag(root, extent_cache, buf, ri, &flags);
5770                         if (ret < 0) {
5771                                 fprintf(stderr, "Couldn't calc extent flags\n");
5772                                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5773                         }
5774                 }
5775         } else {
5776                 flags = 0;
5777                 ret = calc_extent_flag(root, extent_cache, buf, ri, &flags);
5778                 if (ret < 0) {
5779                         fprintf(stderr, "Couldn't calc extent flags\n");
5780                         flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5781                 }
5782         }
5783
5784         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5785                 if (ri != NULL &&
5786                     ri->objectid != BTRFS_TREE_RELOC_OBJECTID &&
5787                     ri->objectid == btrfs_header_owner(buf)) {
5788                         /*
5789                          * Ok we got to this block from it's original owner and
5790                          * we have FULL_BACKREF set.  Relocation can leave
5791                          * converted blocks over so this is altogether possible,
5792                          * however it's not possible if the generation > the
5793                          * last snapshot, so check for this case.
5794                          */
5795                         if (!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC) &&
5796                             btrfs_header_generation(buf) > ri->last_snapshot) {
5797                                 flags &= ~BTRFS_BLOCK_FLAG_FULL_BACKREF;
5798                                 rec->bad_full_backref = 1;
5799                         }
5800                 }
5801         } else {
5802                 if (ri != NULL &&
5803                     (ri->objectid == BTRFS_TREE_RELOC_OBJECTID ||
5804                      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
5805                         flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5806                         rec->bad_full_backref = 1;
5807                 }
5808         }
5809
5810         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5811                 rec->flag_block_full_backref = 1;
5812                 parent = bytenr;
5813                 owner = 0;
5814         } else {
5815                 rec->flag_block_full_backref = 0;
5816                 parent = 0;
5817                 owner = btrfs_header_owner(buf);
5818         }
5819
5820         ret = check_block(root, extent_cache, buf, flags);
5821         if (ret)
5822                 goto out;
5823
5824         if (btrfs_is_leaf(buf)) {
5825                 btree_space_waste += btrfs_leaf_free_space(root, buf);
5826                 for (i = 0; i < nritems; i++) {
5827                         struct btrfs_file_extent_item *fi;
5828                         btrfs_item_key_to_cpu(buf, &key, i);
5829                         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
5830                                 process_extent_item(root, extent_cache, buf,
5831                                                     i);
5832                                 continue;
5833                         }
5834                         if (key.type == BTRFS_METADATA_ITEM_KEY) {
5835                                 process_extent_item(root, extent_cache, buf,
5836                                                     i);
5837                                 continue;
5838                         }
5839                         if (key.type == BTRFS_EXTENT_CSUM_KEY) {
5840                                 total_csum_bytes +=
5841                                         btrfs_item_size_nr(buf, i);
5842                                 continue;
5843                         }
5844                         if (key.type == BTRFS_CHUNK_ITEM_KEY) {
5845                                 process_chunk_item(chunk_cache, &key, buf, i);
5846                                 continue;
5847                         }
5848                         if (key.type == BTRFS_DEV_ITEM_KEY) {
5849                                 process_device_item(dev_cache, &key, buf, i);
5850                                 continue;
5851                         }
5852                         if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5853                                 process_block_group_item(block_group_cache,
5854                                         &key, buf, i);
5855                                 continue;
5856                         }
5857                         if (key.type == BTRFS_DEV_EXTENT_KEY) {
5858                                 process_device_extent_item(dev_extent_cache,
5859                                         &key, buf, i);
5860                                 continue;
5861
5862                         }
5863                         if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
5864 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5865                                 process_extent_ref_v0(extent_cache, buf, i);
5866 #else
5867                                 BUG();
5868 #endif
5869                                 continue;
5870                         }
5871
5872                         if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
5873                                 add_tree_backref(extent_cache, key.objectid, 0,
5874                                                  key.offset, 0);
5875                                 continue;
5876                         }
5877                         if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
5878                                 add_tree_backref(extent_cache, key.objectid,
5879                                                  key.offset, 0, 0);
5880                                 continue;
5881                         }
5882                         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
5883                                 struct btrfs_extent_data_ref *ref;
5884                                 ref = btrfs_item_ptr(buf, i,
5885                                                 struct btrfs_extent_data_ref);
5886                                 add_data_backref(extent_cache,
5887                                         key.objectid, 0,
5888                                         btrfs_extent_data_ref_root(buf, ref),
5889                                         btrfs_extent_data_ref_objectid(buf,
5890                                                                        ref),
5891                                         btrfs_extent_data_ref_offset(buf, ref),
5892                                         btrfs_extent_data_ref_count(buf, ref),
5893                                         0, root->sectorsize);
5894                                 continue;
5895                         }
5896                         if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
5897                                 struct btrfs_shared_data_ref *ref;
5898                                 ref = btrfs_item_ptr(buf, i,
5899                                                 struct btrfs_shared_data_ref);
5900                                 add_data_backref(extent_cache,
5901                                         key.objectid, key.offset, 0, 0, 0,
5902                                         btrfs_shared_data_ref_count(buf, ref),
5903                                         0, root->sectorsize);
5904                                 continue;
5905                         }
5906                         if (key.type == BTRFS_ORPHAN_ITEM_KEY) {
5907                                 struct bad_item *bad;
5908
5909                                 if (key.objectid == BTRFS_ORPHAN_OBJECTID)
5910                                         continue;
5911                                 if (!owner)
5912                                         continue;
5913                                 bad = malloc(sizeof(struct bad_item));
5914                                 if (!bad)
5915                                         continue;
5916                                 INIT_LIST_HEAD(&bad->list);
5917                                 memcpy(&bad->key, &key,
5918                                        sizeof(struct btrfs_key));
5919                                 bad->root_id = owner;
5920                                 list_add_tail(&bad->list, &delete_items);
5921                                 continue;
5922                         }
5923                         if (key.type != BTRFS_EXTENT_DATA_KEY)
5924                                 continue;
5925                         fi = btrfs_item_ptr(buf, i,
5926                                             struct btrfs_file_extent_item);
5927                         if (btrfs_file_extent_type(buf, fi) ==
5928                             BTRFS_FILE_EXTENT_INLINE)
5929                                 continue;
5930                         if (btrfs_file_extent_disk_bytenr(buf, fi) == 0)
5931                                 continue;
5932
5933                         data_bytes_allocated +=
5934                                 btrfs_file_extent_disk_num_bytes(buf, fi);
5935                         if (data_bytes_allocated < root->sectorsize) {
5936                                 abort();
5937                         }
5938                         data_bytes_referenced +=
5939                                 btrfs_file_extent_num_bytes(buf, fi);
5940                         add_data_backref(extent_cache,
5941                                 btrfs_file_extent_disk_bytenr(buf, fi),
5942                                 parent, owner, key.objectid, key.offset -
5943                                 btrfs_file_extent_offset(buf, fi), 1, 1,
5944                                 btrfs_file_extent_disk_num_bytes(buf, fi));
5945                 }
5946         } else {
5947                 int level;
5948                 struct btrfs_key first_key;
5949
5950                 first_key.objectid = 0;
5951
5952                 if (nritems > 0)
5953                         btrfs_item_key_to_cpu(buf, &first_key, 0);
5954                 level = btrfs_header_level(buf);
5955                 for (i = 0; i < nritems; i++) {
5956                         ptr = btrfs_node_blockptr(buf, i);
5957                         size = btrfs_level_size(root, level - 1);
5958                         btrfs_node_key_to_cpu(buf, &key, i);
5959                         if (ri != NULL) {
5960                                 if ((level == ri->drop_level)
5961                                     && is_dropped_key(&key, &ri->drop_key)) {
5962                                         continue;
5963                                 }
5964                         }
5965                         ret = add_extent_rec(extent_cache, &key,
5966                                              btrfs_node_ptr_generation(buf, i),
5967                                              ptr, size, 0, 0, 1, 0, 1, 0,
5968                                              size);
5969                         BUG_ON(ret);
5970
5971                         add_tree_backref(extent_cache, ptr, parent, owner, 1);
5972
5973                         if (level > 1) {
5974                                 add_pending(nodes, seen, ptr, size);
5975                         } else {
5976                                 add_pending(pending, seen, ptr, size);
5977                         }
5978                 }
5979                 btree_space_waste += (BTRFS_NODEPTRS_PER_BLOCK(root) -
5980                                       nritems) * sizeof(struct btrfs_key_ptr);
5981         }
5982         total_btree_bytes += buf->len;
5983         if (fs_root_objectid(btrfs_header_owner(buf)))
5984                 total_fs_tree_bytes += buf->len;
5985         if (btrfs_header_owner(buf) == BTRFS_EXTENT_TREE_OBJECTID)
5986                 total_extent_tree_bytes += buf->len;
5987         if (!found_old_backref &&
5988             btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID &&
5989             btrfs_header_backref_rev(buf) == BTRFS_MIXED_BACKREF_REV &&
5990             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
5991                 found_old_backref = 1;
5992 out:
5993         free_extent_buffer(buf);
5994         return ret;
5995 }
5996
5997 static int add_root_to_pending(struct extent_buffer *buf,
5998                                struct cache_tree *extent_cache,
5999                                struct cache_tree *pending,
6000                                struct cache_tree *seen,
6001                                struct cache_tree *nodes,
6002                                u64 objectid)
6003 {
6004         if (btrfs_header_level(buf) > 0)
6005                 add_pending(nodes, seen, buf->start, buf->len);
6006         else
6007                 add_pending(pending, seen, buf->start, buf->len);
6008         add_extent_rec(extent_cache, NULL, 0, buf->start, buf->len,
6009                        0, 1, 1, 0, 1, 0, buf->len);
6010
6011         if (objectid == BTRFS_TREE_RELOC_OBJECTID ||
6012             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
6013                 add_tree_backref(extent_cache, buf->start, buf->start,
6014                                  0, 1);
6015         else
6016                 add_tree_backref(extent_cache, buf->start, 0, objectid, 1);
6017         return 0;
6018 }
6019
6020 /* as we fix the tree, we might be deleting blocks that
6021  * we're tracking for repair.  This hook makes sure we
6022  * remove any backrefs for blocks as we are fixing them.
6023  */
6024 static int free_extent_hook(struct btrfs_trans_handle *trans,
6025                             struct btrfs_root *root,
6026                             u64 bytenr, u64 num_bytes, u64 parent,
6027                             u64 root_objectid, u64 owner, u64 offset,
6028                             int refs_to_drop)
6029 {
6030         struct extent_record *rec;
6031         struct cache_extent *cache;
6032         int is_data;
6033         struct cache_tree *extent_cache = root->fs_info->fsck_extent_cache;
6034
6035         is_data = owner >= BTRFS_FIRST_FREE_OBJECTID;
6036         cache = lookup_cache_extent(extent_cache, bytenr, num_bytes);
6037         if (!cache)
6038                 return 0;
6039
6040         rec = container_of(cache, struct extent_record, cache);
6041         if (is_data) {
6042                 struct data_backref *back;
6043                 back = find_data_backref(rec, parent, root_objectid, owner,
6044                                          offset, 1, bytenr, num_bytes);
6045                 if (!back)
6046                         goto out;
6047                 if (back->node.found_ref) {
6048                         back->found_ref -= refs_to_drop;
6049                         if (rec->refs)
6050                                 rec->refs -= refs_to_drop;
6051                 }
6052                 if (back->node.found_extent_tree) {
6053                         back->num_refs -= refs_to_drop;
6054                         if (rec->extent_item_refs)
6055                                 rec->extent_item_refs -= refs_to_drop;
6056                 }
6057                 if (back->found_ref == 0)
6058                         back->node.found_ref = 0;
6059                 if (back->num_refs == 0)
6060                         back->node.found_extent_tree = 0;
6061
6062                 if (!back->node.found_extent_tree && back->node.found_ref) {
6063                         list_del(&back->node.list);
6064                         free(back);
6065                 }
6066         } else {
6067                 struct tree_backref *back;
6068                 back = find_tree_backref(rec, parent, root_objectid);
6069                 if (!back)
6070                         goto out;
6071                 if (back->node.found_ref) {
6072                         if (rec->refs)
6073                                 rec->refs--;
6074                         back->node.found_ref = 0;
6075                 }
6076                 if (back->node.found_extent_tree) {
6077                         if (rec->extent_item_refs)
6078                                 rec->extent_item_refs--;
6079                         back->node.found_extent_tree = 0;
6080                 }
6081                 if (!back->node.found_extent_tree && back->node.found_ref) {
6082                         list_del(&back->node.list);
6083                         free(back);
6084                 }
6085         }
6086         maybe_free_extent_rec(extent_cache, rec);
6087 out:
6088         return 0;
6089 }
6090
6091 static int delete_extent_records(struct btrfs_trans_handle *trans,
6092                                  struct btrfs_root *root,
6093                                  struct btrfs_path *path,
6094                                  u64 bytenr, u64 new_len)
6095 {
6096         struct btrfs_key key;
6097         struct btrfs_key found_key;
6098         struct extent_buffer *leaf;
6099         int ret;
6100         int slot;
6101
6102
6103         key.objectid = bytenr;
6104         key.type = (u8)-1;
6105         key.offset = (u64)-1;
6106
6107         while(1) {
6108                 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
6109                                         &key, path, 0, 1);
6110                 if (ret < 0)
6111                         break;
6112
6113                 if (ret > 0) {
6114                         ret = 0;
6115                         if (path->slots[0] == 0)
6116                                 break;
6117                         path->slots[0]--;
6118                 }
6119                 ret = 0;
6120
6121                 leaf = path->nodes[0];
6122                 slot = path->slots[0];
6123
6124                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6125                 if (found_key.objectid != bytenr)
6126                         break;
6127
6128                 if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
6129                     found_key.type != BTRFS_METADATA_ITEM_KEY &&
6130                     found_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
6131                     found_key.type != BTRFS_EXTENT_DATA_REF_KEY &&
6132                     found_key.type != BTRFS_EXTENT_REF_V0_KEY &&
6133                     found_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
6134                     found_key.type != BTRFS_SHARED_DATA_REF_KEY) {
6135                         btrfs_release_path(path);
6136                         if (found_key.type == 0) {
6137                                 if (found_key.offset == 0)
6138                                         break;
6139                                 key.offset = found_key.offset - 1;
6140                                 key.type = found_key.type;
6141                         }
6142                         key.type = found_key.type - 1;
6143                         key.offset = (u64)-1;
6144                         continue;
6145                 }
6146
6147                 fprintf(stderr, "repair deleting extent record: key %Lu %u %Lu\n",
6148                         found_key.objectid, found_key.type, found_key.offset);
6149
6150                 ret = btrfs_del_item(trans, root->fs_info->extent_root, path);
6151                 if (ret)
6152                         break;
6153                 btrfs_release_path(path);
6154
6155                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
6156                     found_key.type == BTRFS_METADATA_ITEM_KEY) {
6157                         u64 bytes = (found_key.type == BTRFS_EXTENT_ITEM_KEY) ?
6158                                 found_key.offset : root->leafsize;
6159
6160                         ret = btrfs_update_block_group(trans, root, bytenr,
6161                                                        bytes, 0, 0);
6162                         if (ret)
6163                                 break;
6164                 }
6165         }
6166
6167         btrfs_release_path(path);
6168         return ret;
6169 }
6170
6171 /*
6172  * for a single backref, this will allocate a new extent
6173  * and add the backref to it.
6174  */
6175 static int record_extent(struct btrfs_trans_handle *trans,
6176                          struct btrfs_fs_info *info,
6177                          struct btrfs_path *path,
6178                          struct extent_record *rec,
6179                          struct extent_backref *back,
6180                          int allocated, u64 flags)
6181 {
6182         int ret;
6183         struct btrfs_root *extent_root = info->extent_root;
6184         struct extent_buffer *leaf;
6185         struct btrfs_key ins_key;
6186         struct btrfs_extent_item *ei;
6187         struct tree_backref *tback;
6188         struct data_backref *dback;
6189         struct btrfs_tree_block_info *bi;
6190
6191         if (!back->is_data)
6192                 rec->max_size = max_t(u64, rec->max_size,
6193                                     info->extent_root->leafsize);
6194
6195         if (!allocated) {
6196                 u32 item_size = sizeof(*ei);
6197
6198                 if (!back->is_data)
6199                         item_size += sizeof(*bi);
6200
6201                 ins_key.objectid = rec->start;
6202                 ins_key.offset = rec->max_size;
6203                 ins_key.type = BTRFS_EXTENT_ITEM_KEY;
6204
6205                 ret = btrfs_insert_empty_item(trans, extent_root, path,
6206                                         &ins_key, item_size);
6207                 if (ret)
6208                         goto fail;
6209
6210                 leaf = path->nodes[0];
6211                 ei = btrfs_item_ptr(leaf, path->slots[0],
6212                                     struct btrfs_extent_item);
6213
6214                 btrfs_set_extent_refs(leaf, ei, 0);
6215                 btrfs_set_extent_generation(leaf, ei, rec->generation);
6216
6217                 if (back->is_data) {
6218                         btrfs_set_extent_flags(leaf, ei,
6219                                                BTRFS_EXTENT_FLAG_DATA);
6220                 } else {
6221                         struct btrfs_disk_key copy_key;;
6222
6223                         tback = (struct tree_backref *)back;
6224                         bi = (struct btrfs_tree_block_info *)(ei + 1);
6225                         memset_extent_buffer(leaf, 0, (unsigned long)bi,
6226                                              sizeof(*bi));
6227
6228                         btrfs_set_disk_key_objectid(&copy_key,
6229                                                     rec->info_objectid);
6230                         btrfs_set_disk_key_type(&copy_key, 0);
6231                         btrfs_set_disk_key_offset(&copy_key, 0);
6232
6233                         btrfs_set_tree_block_level(leaf, bi, rec->info_level);
6234                         btrfs_set_tree_block_key(leaf, bi, &copy_key);
6235
6236                         btrfs_set_extent_flags(leaf, ei,
6237                                                BTRFS_EXTENT_FLAG_TREE_BLOCK | flags);
6238                 }
6239
6240                 btrfs_mark_buffer_dirty(leaf);
6241                 ret = btrfs_update_block_group(trans, extent_root, rec->start,
6242                                                rec->max_size, 1, 0);
6243                 if (ret)
6244                         goto fail;
6245                 btrfs_release_path(path);
6246         }
6247
6248         if (back->is_data) {
6249                 u64 parent;
6250                 int i;
6251
6252                 dback = (struct data_backref *)back;
6253                 if (back->full_backref)
6254                         parent = dback->parent;
6255                 else
6256                         parent = 0;
6257
6258                 for (i = 0; i < dback->found_ref; i++) {
6259                         /* if parent != 0, we're doing a full backref
6260                          * passing BTRFS_FIRST_FREE_OBJECTID as the owner
6261                          * just makes the backref allocator create a data
6262                          * backref
6263                          */
6264                         ret = btrfs_inc_extent_ref(trans, info->extent_root,
6265                                                    rec->start, rec->max_size,
6266                                                    parent,
6267                                                    dback->root,
6268                                                    parent ?
6269                                                    BTRFS_FIRST_FREE_OBJECTID :
6270                                                    dback->owner,
6271                                                    dback->offset);
6272                         if (ret)
6273                                 break;
6274                 }
6275                 fprintf(stderr, "adding new data backref"
6276                                 " on %llu %s %llu owner %llu"
6277                                 " offset %llu found %d\n",
6278                                 (unsigned long long)rec->start,
6279                                 back->full_backref ?
6280                                 "parent" : "root",
6281                                 back->full_backref ?
6282                                 (unsigned long long)parent :
6283                                 (unsigned long long)dback->root,
6284                                 (unsigned long long)dback->owner,
6285                                 (unsigned long long)dback->offset,
6286                                 dback->found_ref);
6287         } else {
6288                 u64 parent;
6289
6290                 tback = (struct tree_backref *)back;
6291                 if (back->full_backref)
6292                         parent = tback->parent;
6293                 else
6294                         parent = 0;
6295
6296                 ret = btrfs_inc_extent_ref(trans, info->extent_root,
6297                                            rec->start, rec->max_size,
6298                                            parent, tback->root, 0, 0);
6299                 fprintf(stderr, "adding new tree backref on "
6300                         "start %llu len %llu parent %llu root %llu\n",
6301                         rec->start, rec->max_size, parent, tback->root);
6302         }
6303         if (ret)
6304                 goto fail;
6305 fail:
6306         btrfs_release_path(path);
6307         return ret;
6308 }
6309
6310 struct extent_entry {
6311         u64 bytenr;
6312         u64 bytes;
6313         int count;
6314         int broken;
6315         struct list_head list;
6316 };
6317
6318 static struct extent_entry *find_entry(struct list_head *entries,
6319                                        u64 bytenr, u64 bytes)
6320 {
6321         struct extent_entry *entry = NULL;
6322
6323         list_for_each_entry(entry, entries, list) {
6324                 if (entry->bytenr == bytenr && entry->bytes == bytes)
6325                         return entry;
6326         }
6327
6328         return NULL;
6329 }
6330
6331 static struct extent_entry *find_most_right_entry(struct list_head *entries)
6332 {
6333         struct extent_entry *entry, *best = NULL, *prev = NULL;
6334
6335         list_for_each_entry(entry, entries, list) {
6336                 if (!prev) {
6337                         prev = entry;
6338                         continue;
6339                 }
6340
6341                 /*
6342                  * If there are as many broken entries as entries then we know
6343                  * not to trust this particular entry.
6344                  */
6345                 if (entry->broken == entry->count)
6346                         continue;
6347
6348                 /*
6349                  * If our current entry == best then we can't be sure our best
6350                  * is really the best, so we need to keep searching.
6351                  */
6352                 if (best && best->count == entry->count) {
6353                         prev = entry;
6354                         best = NULL;
6355                         continue;
6356                 }
6357
6358                 /* Prev == entry, not good enough, have to keep searching */
6359                 if (!prev->broken && prev->count == entry->count)
6360                         continue;
6361
6362                 if (!best)
6363                         best = (prev->count > entry->count) ? prev : entry;
6364                 else if (best->count < entry->count)
6365                         best = entry;
6366                 prev = entry;
6367         }
6368
6369         return best;
6370 }
6371
6372 static int repair_ref(struct btrfs_fs_info *info, struct btrfs_path *path,
6373                       struct data_backref *dback, struct extent_entry *entry)
6374 {
6375         struct btrfs_trans_handle *trans;
6376         struct btrfs_root *root;
6377         struct btrfs_file_extent_item *fi;
6378         struct extent_buffer *leaf;
6379         struct btrfs_key key;
6380         u64 bytenr, bytes;
6381         int ret, err;
6382
6383         key.objectid = dback->root;
6384         key.type = BTRFS_ROOT_ITEM_KEY;
6385         key.offset = (u64)-1;
6386         root = btrfs_read_fs_root(info, &key);
6387         if (IS_ERR(root)) {
6388                 fprintf(stderr, "Couldn't find root for our ref\n");
6389                 return -EINVAL;
6390         }
6391
6392         /*
6393          * The backref points to the original offset of the extent if it was
6394          * split, so we need to search down to the offset we have and then walk
6395          * forward until we find the backref we're looking for.
6396          */
6397         key.objectid = dback->owner;
6398         key.type = BTRFS_EXTENT_DATA_KEY;
6399         key.offset = dback->offset;
6400         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6401         if (ret < 0) {
6402                 fprintf(stderr, "Error looking up ref %d\n", ret);
6403                 return ret;
6404         }
6405
6406         while (1) {
6407                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
6408                         ret = btrfs_next_leaf(root, path);
6409                         if (ret) {
6410                                 fprintf(stderr, "Couldn't find our ref, next\n");
6411                                 return -EINVAL;
6412                         }
6413                 }
6414                 leaf = path->nodes[0];
6415                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6416                 if (key.objectid != dback->owner ||
6417                     key.type != BTRFS_EXTENT_DATA_KEY) {
6418                         fprintf(stderr, "Couldn't find our ref, search\n");
6419                         return -EINVAL;
6420                 }
6421                 fi = btrfs_item_ptr(leaf, path->slots[0],
6422                                     struct btrfs_file_extent_item);
6423                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6424                 bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
6425
6426                 if (bytenr == dback->disk_bytenr && bytes == dback->bytes)
6427                         break;
6428                 path->slots[0]++;
6429         }
6430
6431         btrfs_release_path(path);
6432
6433         trans = btrfs_start_transaction(root, 1);
6434         if (IS_ERR(trans))
6435                 return PTR_ERR(trans);
6436
6437         /*
6438          * Ok we have the key of the file extent we want to fix, now we can cow
6439          * down to the thing and fix it.
6440          */
6441         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6442         if (ret < 0) {
6443                 fprintf(stderr, "Error cowing down to ref [%Lu, %u, %Lu]: %d\n",
6444                         key.objectid, key.type, key.offset, ret);
6445                 goto out;
6446         }
6447         if (ret > 0) {
6448                 fprintf(stderr, "Well that's odd, we just found this key "
6449                         "[%Lu, %u, %Lu]\n", key.objectid, key.type,
6450                         key.offset);
6451                 ret = -EINVAL;
6452                 goto out;
6453         }
6454         leaf = path->nodes[0];
6455         fi = btrfs_item_ptr(leaf, path->slots[0],
6456                             struct btrfs_file_extent_item);
6457
6458         if (btrfs_file_extent_compression(leaf, fi) &&
6459             dback->disk_bytenr != entry->bytenr) {
6460                 fprintf(stderr, "Ref doesn't match the record start and is "
6461                         "compressed, please take a btrfs-image of this file "
6462                         "system and send it to a btrfs developer so they can "
6463                         "complete this functionality for bytenr %Lu\n",
6464                         dback->disk_bytenr);
6465                 ret = -EINVAL;
6466                 goto out;
6467         }
6468
6469         if (dback->node.broken && dback->disk_bytenr != entry->bytenr) {
6470                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
6471         } else if (dback->disk_bytenr > entry->bytenr) {
6472                 u64 off_diff, offset;
6473
6474                 off_diff = dback->disk_bytenr - entry->bytenr;
6475                 offset = btrfs_file_extent_offset(leaf, fi);
6476                 if (dback->disk_bytenr + offset +
6477                     btrfs_file_extent_num_bytes(leaf, fi) >
6478                     entry->bytenr + entry->bytes) {
6479                         fprintf(stderr, "Ref is past the entry end, please "
6480                                 "take a btrfs-image of this file system and "
6481                                 "send it to a btrfs developer, ref %Lu\n",
6482                                 dback->disk_bytenr);
6483                         ret = -EINVAL;
6484                         goto out;
6485                 }
6486                 offset += off_diff;
6487                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
6488                 btrfs_set_file_extent_offset(leaf, fi, offset);
6489         } else if (dback->disk_bytenr < entry->bytenr) {
6490                 u64 offset;
6491
6492                 offset = btrfs_file_extent_offset(leaf, fi);
6493                 if (dback->disk_bytenr + offset < entry->bytenr) {
6494                         fprintf(stderr, "Ref is before the entry start, please"
6495                                 " take a btrfs-image of this file system and "
6496                                 "send it to a btrfs developer, ref %Lu\n",
6497                                 dback->disk_bytenr);
6498                         ret = -EINVAL;
6499                         goto out;
6500                 }
6501
6502                 offset += dback->disk_bytenr;
6503                 offset -= entry->bytenr;
6504                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
6505                 btrfs_set_file_extent_offset(leaf, fi, offset);
6506         }
6507
6508         btrfs_set_file_extent_disk_num_bytes(leaf, fi, entry->bytes);
6509
6510         /*
6511          * Chances are if disk_num_bytes were wrong then so is ram_bytes, but
6512          * only do this if we aren't using compression, otherwise it's a
6513          * trickier case.
6514          */
6515         if (!btrfs_file_extent_compression(leaf, fi))
6516                 btrfs_set_file_extent_ram_bytes(leaf, fi, entry->bytes);
6517         else
6518                 printf("ram bytes may be wrong?\n");
6519         btrfs_mark_buffer_dirty(leaf);
6520 out:
6521         err = btrfs_commit_transaction(trans, root);
6522         btrfs_release_path(path);
6523         return ret ? ret : err;
6524 }
6525
6526 static int verify_backrefs(struct btrfs_fs_info *info, struct btrfs_path *path,
6527                            struct extent_record *rec)
6528 {
6529         struct extent_backref *back;
6530         struct data_backref *dback;
6531         struct extent_entry *entry, *best = NULL;
6532         LIST_HEAD(entries);
6533         int nr_entries = 0;
6534         int broken_entries = 0;
6535         int ret = 0;
6536         short mismatch = 0;
6537
6538         /*
6539          * Metadata is easy and the backrefs should always agree on bytenr and
6540          * size, if not we've got bigger issues.
6541          */
6542         if (rec->metadata)
6543                 return 0;
6544
6545         list_for_each_entry(back, &rec->backrefs, list) {
6546                 if (back->full_backref || !back->is_data)
6547                         continue;
6548
6549                 dback = (struct data_backref *)back;
6550
6551                 /*
6552                  * We only pay attention to backrefs that we found a real
6553                  * backref for.
6554                  */
6555                 if (dback->found_ref == 0)
6556                         continue;
6557
6558                 /*
6559                  * For now we only catch when the bytes don't match, not the
6560                  * bytenr.  We can easily do this at the same time, but I want
6561                  * to have a fs image to test on before we just add repair
6562                  * functionality willy-nilly so we know we won't screw up the
6563                  * repair.
6564                  */
6565
6566                 entry = find_entry(&entries, dback->disk_bytenr,
6567                                    dback->bytes);
6568                 if (!entry) {
6569                         entry = malloc(sizeof(struct extent_entry));
6570                         if (!entry) {
6571                                 ret = -ENOMEM;
6572                                 goto out;
6573                         }
6574                         memset(entry, 0, sizeof(*entry));
6575                         entry->bytenr = dback->disk_bytenr;
6576                         entry->bytes = dback->bytes;
6577                         list_add_tail(&entry->list, &entries);
6578                         nr_entries++;
6579                 }
6580
6581                 /*
6582                  * If we only have on entry we may think the entries agree when
6583                  * in reality they don't so we have to do some extra checking.
6584                  */
6585                 if (dback->disk_bytenr != rec->start ||
6586                     dback->bytes != rec->nr || back->broken)
6587                         mismatch = 1;
6588
6589                 if (back->broken) {
6590                         entry->broken++;
6591                         broken_entries++;
6592                 }
6593
6594                 entry->count++;
6595         }
6596
6597         /* Yay all the backrefs agree, carry on good sir */
6598         if (nr_entries <= 1 && !mismatch)
6599                 goto out;
6600
6601         fprintf(stderr, "attempting to repair backref discrepency for bytenr "
6602                 "%Lu\n", rec->start);
6603
6604         /*
6605          * First we want to see if the backrefs can agree amongst themselves who
6606          * is right, so figure out which one of the entries has the highest
6607          * count.
6608          */
6609         best = find_most_right_entry(&entries);
6610
6611         /*
6612          * Ok so we may have an even split between what the backrefs think, so
6613          * this is where we use the extent ref to see what it thinks.
6614          */
6615         if (!best) {
6616                 entry = find_entry(&entries, rec->start, rec->nr);
6617                 if (!entry && (!broken_entries || !rec->found_rec)) {
6618                         fprintf(stderr, "Backrefs don't agree with each other "
6619                                 "and extent record doesn't agree with anybody,"
6620                                 " so we can't fix bytenr %Lu bytes %Lu\n",
6621                                 rec->start, rec->nr);
6622                         ret = -EINVAL;
6623                         goto out;
6624                 } else if (!entry) {
6625                         /*
6626                          * Ok our backrefs were broken, we'll assume this is the
6627                          * correct value and add an entry for this range.
6628                          */
6629                         entry = malloc(sizeof(struct extent_entry));
6630                         if (!entry) {
6631                                 ret = -ENOMEM;
6632                                 goto out;
6633                         }
6634                         memset(entry, 0, sizeof(*entry));
6635                         entry->bytenr = rec->start;
6636                         entry->bytes = rec->nr;
6637                         list_add_tail(&entry->list, &entries);
6638                         nr_entries++;
6639                 }
6640                 entry->count++;
6641                 best = find_most_right_entry(&entries);
6642                 if (!best) {
6643                         fprintf(stderr, "Backrefs and extent record evenly "
6644                                 "split on who is right, this is going to "
6645                                 "require user input to fix bytenr %Lu bytes "
6646                                 "%Lu\n", rec->start, rec->nr);
6647                         ret = -EINVAL;
6648                         goto out;
6649                 }
6650         }
6651
6652         /*
6653          * I don't think this can happen currently as we'll abort() if we catch
6654          * this case higher up, but in case somebody removes that we still can't
6655          * deal with it properly here yet, so just bail out of that's the case.
6656          */
6657         if (best->bytenr != rec->start) {
6658                 fprintf(stderr, "Extent start and backref starts don't match, "
6659                         "please use btrfs-image on this file system and send "
6660                         "it to a btrfs developer so they can make fsck fix "
6661                         "this particular case.  bytenr is %Lu, bytes is %Lu\n",
6662                         rec->start, rec->nr);
6663                 ret = -EINVAL;
6664                 goto out;
6665         }
6666
6667         /*
6668          * Ok great we all agreed on an extent record, let's go find the real
6669          * references and fix up the ones that don't match.
6670          */
6671         list_for_each_entry(back, &rec->backrefs, list) {
6672                 if (back->full_backref || !back->is_data)
6673                         continue;
6674
6675                 dback = (struct data_backref *)back;
6676
6677                 /*
6678                  * Still ignoring backrefs that don't have a real ref attached
6679                  * to them.
6680                  */
6681                 if (dback->found_ref == 0)
6682                         continue;
6683
6684                 if (dback->bytes == best->bytes &&
6685                     dback->disk_bytenr == best->bytenr)
6686                         continue;
6687
6688                 ret = repair_ref(info, path, dback, best);
6689                 if (ret)
6690                         goto out;
6691         }
6692
6693         /*
6694          * Ok we messed with the actual refs, which means we need to drop our
6695          * entire cache and go back and rescan.  I know this is a huge pain and
6696          * adds a lot of extra work, but it's the only way to be safe.  Once all
6697          * the backrefs agree we may not need to do anything to the extent
6698          * record itself.
6699          */
6700         ret = -EAGAIN;
6701 out:
6702         while (!list_empty(&entries)) {
6703                 entry = list_entry(entries.next, struct extent_entry, list);
6704                 list_del_init(&entry->list);
6705                 free(entry);
6706         }
6707         return ret;
6708 }
6709
6710 static int process_duplicates(struct btrfs_root *root,
6711                               struct cache_tree *extent_cache,
6712                               struct extent_record *rec)
6713 {
6714         struct extent_record *good, *tmp;
6715         struct cache_extent *cache;
6716         int ret;
6717
6718         /*
6719          * If we found a extent record for this extent then return, or if we
6720          * have more than one duplicate we are likely going to need to delete
6721          * something.
6722          */
6723         if (rec->found_rec || rec->num_duplicates > 1)
6724                 return 0;
6725
6726         /* Shouldn't happen but just in case */
6727         BUG_ON(!rec->num_duplicates);
6728
6729         /*
6730          * So this happens if we end up with a backref that doesn't match the
6731          * actual extent entry.  So either the backref is bad or the extent
6732          * entry is bad.  Either way we want to have the extent_record actually
6733          * reflect what we found in the extent_tree, so we need to take the
6734          * duplicate out and use that as the extent_record since the only way we
6735          * get a duplicate is if we find a real life BTRFS_EXTENT_ITEM_KEY.
6736          */
6737         remove_cache_extent(extent_cache, &rec->cache);
6738
6739         good = list_entry(rec->dups.next, struct extent_record, list);
6740         list_del_init(&good->list);
6741         INIT_LIST_HEAD(&good->backrefs);
6742         INIT_LIST_HEAD(&good->dups);
6743         good->cache.start = good->start;
6744         good->cache.size = good->nr;
6745         good->content_checked = 0;
6746         good->owner_ref_checked = 0;
6747         good->num_duplicates = 0;
6748         good->refs = rec->refs;
6749         list_splice_init(&rec->backrefs, &good->backrefs);
6750         while (1) {
6751                 cache = lookup_cache_extent(extent_cache, good->start,
6752                                             good->nr);
6753                 if (!cache)
6754                         break;
6755                 tmp = container_of(cache, struct extent_record, cache);
6756
6757                 /*
6758                  * If we find another overlapping extent and it's found_rec is
6759                  * set then it's a duplicate and we need to try and delete
6760                  * something.
6761                  */
6762                 if (tmp->found_rec || tmp->num_duplicates > 0) {
6763                         if (list_empty(&good->list))
6764                                 list_add_tail(&good->list,
6765                                               &duplicate_extents);
6766                         good->num_duplicates += tmp->num_duplicates + 1;
6767                         list_splice_init(&tmp->dups, &good->dups);
6768                         list_del_init(&tmp->list);
6769                         list_add_tail(&tmp->list, &good->dups);
6770                         remove_cache_extent(extent_cache, &tmp->cache);
6771                         continue;
6772                 }
6773
6774                 /*
6775                  * Ok we have another non extent item backed extent rec, so lets
6776                  * just add it to this extent and carry on like we did above.
6777                  */
6778                 good->refs += tmp->refs;
6779                 list_splice_init(&tmp->backrefs, &good->backrefs);
6780                 remove_cache_extent(extent_cache, &tmp->cache);
6781                 free(tmp);
6782         }
6783         ret = insert_cache_extent(extent_cache, &good->cache);
6784         BUG_ON(ret);
6785         free(rec);
6786         return good->num_duplicates ? 0 : 1;
6787 }
6788
6789 static int delete_duplicate_records(struct btrfs_root *root,
6790                                     struct extent_record *rec)
6791 {
6792         struct btrfs_trans_handle *trans;
6793         LIST_HEAD(delete_list);
6794         struct btrfs_path *path;
6795         struct extent_record *tmp, *good, *n;
6796         int nr_del = 0;
6797         int ret = 0, err;
6798         struct btrfs_key key;
6799
6800         path = btrfs_alloc_path();
6801         if (!path) {
6802                 ret = -ENOMEM;
6803                 goto out;
6804         }
6805
6806         good = rec;
6807         /* Find the record that covers all of the duplicates. */
6808         list_for_each_entry(tmp, &rec->dups, list) {
6809                 if (good->start < tmp->start)
6810                         continue;
6811                 if (good->nr > tmp->nr)
6812                         continue;
6813
6814                 if (tmp->start + tmp->nr < good->start + good->nr) {
6815                         fprintf(stderr, "Ok we have overlapping extents that "
6816                                 "aren't completely covered by eachother, this "
6817                                 "is going to require more careful thought.  "
6818                                 "The extents are [%Lu-%Lu] and [%Lu-%Lu]\n",
6819                                 tmp->start, tmp->nr, good->start, good->nr);
6820                         abort();
6821                 }
6822                 good = tmp;
6823         }
6824
6825         if (good != rec)
6826                 list_add_tail(&rec->list, &delete_list);
6827
6828         list_for_each_entry_safe(tmp, n, &rec->dups, list) {
6829                 if (tmp == good)
6830                         continue;
6831                 list_move_tail(&tmp->list, &delete_list);
6832         }
6833
6834         root = root->fs_info->extent_root;
6835         trans = btrfs_start_transaction(root, 1);
6836         if (IS_ERR(trans)) {
6837                 ret = PTR_ERR(trans);
6838                 goto out;
6839         }
6840
6841         list_for_each_entry(tmp, &delete_list, list) {
6842                 if (tmp->found_rec == 0)
6843                         continue;
6844                 key.objectid = tmp->start;
6845                 key.type = BTRFS_EXTENT_ITEM_KEY;
6846                 key.offset = tmp->nr;
6847
6848                 /* Shouldn't happen but just in case */
6849                 if (tmp->metadata) {
6850                         fprintf(stderr, "Well this shouldn't happen, extent "
6851                                 "record overlaps but is metadata? "
6852                                 "[%Lu, %Lu]\n", tmp->start, tmp->nr);
6853                         abort();
6854                 }
6855
6856                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6857                 if (ret) {
6858                         if (ret > 0)
6859                                 ret = -EINVAL;
6860                         break;
6861                 }
6862                 ret = btrfs_del_item(trans, root, path);
6863                 if (ret)
6864                         break;
6865                 btrfs_release_path(path);
6866                 nr_del++;
6867         }
6868         err = btrfs_commit_transaction(trans, root);
6869         if (err && !ret)
6870                 ret = err;
6871 out:
6872         while (!list_empty(&delete_list)) {
6873                 tmp = list_entry(delete_list.next, struct extent_record, list);
6874                 list_del_init(&tmp->list);
6875                 if (tmp == rec)
6876                         continue;
6877                 free(tmp);
6878         }
6879
6880         while (!list_empty(&rec->dups)) {
6881                 tmp = list_entry(rec->dups.next, struct extent_record, list);
6882                 list_del_init(&tmp->list);
6883                 free(tmp);
6884         }
6885
6886         btrfs_free_path(path);
6887
6888         if (!ret && !nr_del)
6889                 rec->num_duplicates = 0;
6890
6891         return ret ? ret : nr_del;
6892 }
6893
6894 static int find_possible_backrefs(struct btrfs_fs_info *info,
6895                                   struct btrfs_path *path,
6896                                   struct cache_tree *extent_cache,
6897                                   struct extent_record *rec)
6898 {
6899         struct btrfs_root *root;
6900         struct extent_backref *back;
6901         struct data_backref *dback;
6902         struct cache_extent *cache;
6903         struct btrfs_file_extent_item *fi;
6904         struct btrfs_key key;
6905         u64 bytenr, bytes;
6906         int ret;
6907
6908         list_for_each_entry(back, &rec->backrefs, list) {
6909                 /* Don't care about full backrefs (poor unloved backrefs) */
6910                 if (back->full_backref || !back->is_data)
6911                         continue;
6912
6913                 dback = (struct data_backref *)back;
6914
6915                 /* We found this one, we don't need to do a lookup */
6916                 if (dback->found_ref)
6917                         continue;
6918
6919                 key.objectid = dback->root;
6920                 key.type = BTRFS_ROOT_ITEM_KEY;
6921                 key.offset = (u64)-1;
6922
6923                 root = btrfs_read_fs_root(info, &key);
6924
6925                 /* No root, definitely a bad ref, skip */
6926                 if (IS_ERR(root) && PTR_ERR(root) == -ENOENT)
6927                         continue;
6928                 /* Other err, exit */
6929                 if (IS_ERR(root))
6930                         return PTR_ERR(root);
6931
6932                 key.objectid = dback->owner;
6933                 key.type = BTRFS_EXTENT_DATA_KEY;
6934                 key.offset = dback->offset;
6935                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6936                 if (ret) {
6937                         btrfs_release_path(path);
6938                         if (ret < 0)
6939                                 return ret;
6940                         /* Didn't find it, we can carry on */
6941                         ret = 0;
6942                         continue;
6943                 }
6944
6945                 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6946                                     struct btrfs_file_extent_item);
6947                 bytenr = btrfs_file_extent_disk_bytenr(path->nodes[0], fi);
6948                 bytes = btrfs_file_extent_disk_num_bytes(path->nodes[0], fi);
6949                 btrfs_release_path(path);
6950                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
6951                 if (cache) {
6952                         struct extent_record *tmp;
6953                         tmp = container_of(cache, struct extent_record, cache);
6954
6955                         /*
6956                          * If we found an extent record for the bytenr for this
6957                          * particular backref then we can't add it to our
6958                          * current extent record.  We only want to add backrefs
6959                          * that don't have a corresponding extent item in the
6960                          * extent tree since they likely belong to this record
6961                          * and we need to fix it if it doesn't match bytenrs.
6962                          */
6963                         if  (tmp->found_rec)
6964                                 continue;
6965                 }
6966
6967                 dback->found_ref += 1;
6968                 dback->disk_bytenr = bytenr;
6969                 dback->bytes = bytes;
6970
6971                 /*
6972                  * Set this so the verify backref code knows not to trust the
6973                  * values in this backref.
6974                  */
6975                 back->broken = 1;
6976         }
6977
6978         return 0;
6979 }
6980
6981 /*
6982  * Record orphan data ref into corresponding root.
6983  *
6984  * Return 0 if the extent item contains data ref and recorded.
6985  * Return 1 if the extent item contains no useful data ref
6986  *   On that case, it may contains only shared_dataref or metadata backref
6987  *   or the file extent exists(this should be handled by the extent bytenr
6988  *   recovery routine)
6989  * Return <0 if something goes wrong.
6990  */
6991 static int record_orphan_data_extents(struct btrfs_fs_info *fs_info,
6992                                       struct extent_record *rec)
6993 {
6994         struct btrfs_key key;
6995         struct btrfs_root *dest_root;
6996         struct extent_backref *back;
6997         struct data_backref *dback;
6998         struct orphan_data_extent *orphan;
6999         struct btrfs_path *path;
7000         int recorded_data_ref = 0;
7001         int ret = 0;
7002
7003         if (rec->metadata)
7004                 return 1;
7005         path = btrfs_alloc_path();
7006         if (!path)
7007                 return -ENOMEM;
7008         list_for_each_entry(back, &rec->backrefs, list) {
7009                 if (back->full_backref || !back->is_data ||
7010                     !back->found_extent_tree)
7011                         continue;
7012                 dback = (struct data_backref *)back;
7013                 if (dback->found_ref)
7014                         continue;
7015                 key.objectid = dback->root;
7016                 key.type = BTRFS_ROOT_ITEM_KEY;
7017                 key.offset = (u64)-1;
7018
7019                 dest_root = btrfs_read_fs_root(fs_info, &key);
7020
7021                 /* For non-exist root we just skip it */
7022                 if (IS_ERR(dest_root) || !dest_root)
7023                         continue;
7024
7025                 key.objectid = dback->owner;
7026                 key.type = BTRFS_EXTENT_DATA_KEY;
7027                 key.offset = dback->offset;
7028
7029                 ret = btrfs_search_slot(NULL, dest_root, &key, path, 0, 0);
7030                 /*
7031                  * For ret < 0, it's OK since the fs-tree may be corrupted,
7032                  * we need to record it for inode/file extent rebuild.
7033                  * For ret > 0, we record it only for file extent rebuild.
7034                  * For ret == 0, the file extent exists but only bytenr
7035                  * mismatch, let the original bytenr fix routine to handle,
7036                  * don't record it.
7037                  */
7038                 if (ret == 0)
7039                         continue;
7040                 ret = 0;
7041                 orphan = malloc(sizeof(*orphan));
7042                 if (!orphan) {
7043                         ret = -ENOMEM;
7044                         goto out;
7045                 }
7046                 INIT_LIST_HEAD(&orphan->list);
7047                 orphan->root = dback->root;
7048                 orphan->objectid = dback->owner;
7049                 orphan->offset = dback->offset;
7050                 orphan->disk_bytenr = rec->cache.start;
7051                 orphan->disk_len = rec->cache.size;
7052                 list_add(&dest_root->orphan_data_extents, &orphan->list);
7053                 recorded_data_ref = 1;
7054         }
7055 out:
7056         btrfs_free_path(path);
7057         if (!ret)
7058                 return !recorded_data_ref;
7059         else
7060                 return ret;
7061 }
7062
7063 /*
7064  * when an incorrect extent item is found, this will delete
7065  * all of the existing entries for it and recreate them
7066  * based on what the tree scan found.
7067  */
7068 static int fixup_extent_refs(struct btrfs_fs_info *info,
7069                              struct cache_tree *extent_cache,
7070                              struct extent_record *rec)
7071 {
7072         struct btrfs_trans_handle *trans = NULL;
7073         int ret;
7074         struct btrfs_path *path;
7075         struct list_head *cur = rec->backrefs.next;
7076         struct cache_extent *cache;
7077         struct extent_backref *back;
7078         int allocated = 0;
7079         u64 flags = 0;
7080
7081         if (rec->flag_block_full_backref)
7082                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7083
7084         path = btrfs_alloc_path();
7085         if (!path)
7086                 return -ENOMEM;
7087
7088         if (rec->refs != rec->extent_item_refs && !rec->metadata) {
7089                 /*
7090                  * Sometimes the backrefs themselves are so broken they don't
7091                  * get attached to any meaningful rec, so first go back and
7092                  * check any of our backrefs that we couldn't find and throw
7093                  * them into the list if we find the backref so that
7094                  * verify_backrefs can figure out what to do.
7095                  */
7096                 ret = find_possible_backrefs(info, path, extent_cache, rec);
7097                 if (ret < 0)
7098                         goto out;
7099         }
7100
7101         /* step one, make sure all of the backrefs agree */
7102         ret = verify_backrefs(info, path, rec);
7103         if (ret < 0)
7104                 goto out;
7105
7106         trans = btrfs_start_transaction(info->extent_root, 1);
7107         if (IS_ERR(trans)) {
7108                 ret = PTR_ERR(trans);
7109                 goto out;
7110         }
7111
7112         /* step two, delete all the existing records */
7113         ret = delete_extent_records(trans, info->extent_root, path,
7114                                     rec->start, rec->max_size);
7115
7116         if (ret < 0)
7117                 goto out;
7118
7119         /* was this block corrupt?  If so, don't add references to it */
7120         cache = lookup_cache_extent(info->corrupt_blocks,
7121                                     rec->start, rec->max_size);
7122         if (cache) {
7123                 ret = 0;
7124                 goto out;
7125         }
7126
7127         /* step three, recreate all the refs we did find */
7128         while(cur != &rec->backrefs) {
7129                 back = list_entry(cur, struct extent_backref, list);
7130                 cur = cur->next;
7131
7132                 /*
7133                  * if we didn't find any references, don't create a
7134                  * new extent record
7135                  */
7136                 if (!back->found_ref)
7137                         continue;
7138
7139                 rec->bad_full_backref = 0;
7140                 ret = record_extent(trans, info, path, rec, back, allocated, flags);
7141                 allocated = 1;
7142
7143                 if (ret)
7144                         goto out;
7145         }
7146 out:
7147         if (trans) {
7148                 int err = btrfs_commit_transaction(trans, info->extent_root);
7149                 if (!ret)
7150                         ret = err;
7151         }
7152
7153         btrfs_free_path(path);
7154         return ret;
7155 }
7156
7157 static int fixup_extent_flags(struct btrfs_fs_info *fs_info,
7158                               struct extent_record *rec)
7159 {
7160         struct btrfs_trans_handle *trans;
7161         struct btrfs_root *root = fs_info->extent_root;
7162         struct btrfs_path *path;
7163         struct btrfs_extent_item *ei;
7164         struct btrfs_key key;
7165         u64 flags;
7166         int ret = 0;
7167
7168         key.objectid = rec->start;
7169         if (rec->metadata) {
7170                 key.type = BTRFS_METADATA_ITEM_KEY;
7171                 key.offset = rec->info_level;
7172         } else {
7173                 key.type = BTRFS_EXTENT_ITEM_KEY;
7174                 key.offset = rec->max_size;
7175         }
7176
7177         path = btrfs_alloc_path();
7178         if (!path)
7179                 return -ENOMEM;
7180
7181         trans = btrfs_start_transaction(root, 0);
7182         if (IS_ERR(trans)) {
7183                 btrfs_free_path(path);
7184                 return PTR_ERR(trans);
7185         }
7186
7187         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
7188         if (ret < 0) {
7189                 btrfs_free_path(path);
7190                 btrfs_commit_transaction(trans, root);
7191                 return ret;
7192         } else if (ret) {
7193                 fprintf(stderr, "Didn't find extent for %llu\n",
7194                         (unsigned long long)rec->start);
7195                 btrfs_free_path(path);
7196                 btrfs_commit_transaction(trans, root);
7197                 return -ENOENT;
7198         }
7199
7200         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
7201                             struct btrfs_extent_item);
7202         flags = btrfs_extent_flags(path->nodes[0], ei);
7203         if (rec->flag_block_full_backref) {
7204                 fprintf(stderr, "setting full backref on %llu\n",
7205                         (unsigned long long)key.objectid);
7206                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7207         } else {
7208                 fprintf(stderr, "clearing full backref on %llu\n",
7209                         (unsigned long long)key.objectid);
7210                 flags &= ~BTRFS_BLOCK_FLAG_FULL_BACKREF;
7211         }
7212         btrfs_set_extent_flags(path->nodes[0], ei, flags);
7213         btrfs_mark_buffer_dirty(path->nodes[0]);
7214         btrfs_free_path(path);
7215         return btrfs_commit_transaction(trans, root);
7216 }
7217
7218 /* right now we only prune from the extent allocation tree */
7219 static int prune_one_block(struct btrfs_trans_handle *trans,
7220                            struct btrfs_fs_info *info,
7221                            struct btrfs_corrupt_block *corrupt)
7222 {
7223         int ret;
7224         struct btrfs_path path;
7225         struct extent_buffer *eb;
7226         u64 found;
7227         int slot;
7228         int nritems;
7229         int level = corrupt->level + 1;
7230
7231         btrfs_init_path(&path);
7232 again:
7233         /* we want to stop at the parent to our busted block */
7234         path.lowest_level = level;
7235
7236         ret = btrfs_search_slot(trans, info->extent_root,
7237                                 &corrupt->key, &path, -1, 1);
7238
7239         if (ret < 0)
7240                 goto out;
7241
7242         eb = path.nodes[level];
7243         if (!eb) {
7244                 ret = -ENOENT;
7245                 goto out;
7246         }
7247
7248         /*
7249          * hopefully the search gave us the block we want to prune,
7250          * lets try that first
7251          */
7252         slot = path.slots[level];
7253         found =  btrfs_node_blockptr(eb, slot);
7254         if (found == corrupt->cache.start)
7255                 goto del_ptr;
7256
7257         nritems = btrfs_header_nritems(eb);
7258
7259         /* the search failed, lets scan this node and hope we find it */
7260         for (slot = 0; slot < nritems; slot++) {
7261                 found =  btrfs_node_blockptr(eb, slot);
7262                 if (found == corrupt->cache.start)
7263                         goto del_ptr;
7264         }
7265         /*
7266          * we couldn't find the bad block.  TODO, search all the nodes for pointers
7267          * to this block
7268          */
7269         if (eb == info->extent_root->node) {
7270                 ret = -ENOENT;
7271                 goto out;
7272         } else {
7273                 level++;
7274                 btrfs_release_path(&path);
7275                 goto again;
7276         }
7277
7278 del_ptr:
7279         printk("deleting pointer to block %Lu\n", corrupt->cache.start);
7280         ret = btrfs_del_ptr(trans, info->extent_root, &path, level, slot);
7281
7282 out:
7283         btrfs_release_path(&path);
7284         return ret;
7285 }
7286
7287 static int prune_corrupt_blocks(struct btrfs_fs_info *info)
7288 {
7289         struct btrfs_trans_handle *trans = NULL;
7290         struct cache_extent *cache;
7291         struct btrfs_corrupt_block *corrupt;
7292
7293         while (1) {
7294                 cache = search_cache_extent(info->corrupt_blocks, 0);
7295                 if (!cache)
7296                         break;
7297                 if (!trans) {
7298                         trans = btrfs_start_transaction(info->extent_root, 1);
7299                         if (IS_ERR(trans))
7300                                 return PTR_ERR(trans);
7301                 }
7302                 corrupt = container_of(cache, struct btrfs_corrupt_block, cache);
7303                 prune_one_block(trans, info, corrupt);
7304                 remove_cache_extent(info->corrupt_blocks, cache);
7305         }
7306         if (trans)
7307                 return btrfs_commit_transaction(trans, info->extent_root);
7308         return 0;
7309 }
7310
7311 static void reset_cached_block_groups(struct btrfs_fs_info *fs_info)
7312 {
7313         struct btrfs_block_group_cache *cache;
7314         u64 start, end;
7315         int ret;
7316
7317         while (1) {
7318                 ret = find_first_extent_bit(&fs_info->free_space_cache, 0,
7319                                             &start, &end, EXTENT_DIRTY);
7320                 if (ret)
7321                         break;
7322                 clear_extent_dirty(&fs_info->free_space_cache, start, end,
7323                                    GFP_NOFS);
7324         }
7325
7326         start = 0;
7327         while (1) {
7328                 cache = btrfs_lookup_first_block_group(fs_info, start);
7329                 if (!cache)
7330                         break;
7331                 if (cache->cached)
7332                         cache->cached = 0;
7333                 start = cache->key.objectid + cache->key.offset;
7334         }
7335 }
7336
7337 static int check_extent_refs(struct btrfs_root *root,
7338                              struct cache_tree *extent_cache)
7339 {
7340         struct extent_record *rec;
7341         struct cache_extent *cache;
7342         int err = 0;
7343         int ret = 0;
7344         int fixed = 0;
7345         int had_dups = 0;
7346         int recorded = 0;
7347
7348         if (repair) {
7349                 /*
7350                  * if we're doing a repair, we have to make sure
7351                  * we don't allocate from the problem extents.
7352                  * In the worst case, this will be all the
7353                  * extents in the FS
7354                  */
7355                 cache = search_cache_extent(extent_cache, 0);
7356                 while(cache) {
7357                         rec = container_of(cache, struct extent_record, cache);
7358                         set_extent_dirty(root->fs_info->excluded_extents,
7359                                          rec->start,
7360                                          rec->start + rec->max_size - 1,
7361                                          GFP_NOFS);
7362                         cache = next_cache_extent(cache);
7363                 }
7364
7365                 /* pin down all the corrupted blocks too */
7366                 cache = search_cache_extent(root->fs_info->corrupt_blocks, 0);
7367                 while(cache) {
7368                         set_extent_dirty(root->fs_info->excluded_extents,
7369                                          cache->start,
7370                                          cache->start + cache->size - 1,
7371                                          GFP_NOFS);
7372                         cache = next_cache_extent(cache);
7373                 }
7374                 prune_corrupt_blocks(root->fs_info);
7375                 reset_cached_block_groups(root->fs_info);
7376         }
7377
7378         reset_cached_block_groups(root->fs_info);
7379
7380         /*
7381          * We need to delete any duplicate entries we find first otherwise we
7382          * could mess up the extent tree when we have backrefs that actually
7383          * belong to a different extent item and not the weird duplicate one.
7384          */
7385         while (repair && !list_empty(&duplicate_extents)) {
7386                 rec = list_entry(duplicate_extents.next, struct extent_record,
7387                                  list);
7388                 list_del_init(&rec->list);
7389
		/* Sometimes we can find a backref before we find an actual
		 * extent, so we need to process it a little bit to see if there
		 * truly are multiple EXTENT_ITEM_KEY's for the same range, or
		 * if this is a backref screwup.  If we need to delete stuff
		 * process_duplicates() will return 0, otherwise it will return
		 * 1 and we move straight on to the next record.
		 */
7397                 if (process_duplicates(root, extent_cache, rec))
7398                         continue;
7399                 ret = delete_duplicate_records(root, rec);
7400                 if (ret < 0)
7401                         return ret;
7402                 /*
7403                  * delete_duplicate_records will return the number of entries
7404                  * deleted, so if it's greater than 0 then we know we actually
7405                  * did something and we need to remove.
7406                  */
7407                 if (ret)
7408                         had_dups = 1;
7409         }
7410
7411         if (had_dups)
7412                 return -EAGAIN;
7413
7414         while(1) {
7415                 int cur_err = 0;
7416
7417                 fixed = 0;
7418                 recorded = 0;
7419                 cache = search_cache_extent(extent_cache, 0);
7420                 if (!cache)
7421                         break;
7422                 rec = container_of(cache, struct extent_record, cache);
7423                 if (rec->num_duplicates) {
7424                         fprintf(stderr, "extent item %llu has multiple extent "
7425                                 "items\n", (unsigned long long)rec->start);
7426                         err = 1;
7427                         cur_err = 1;
7428                 }
7429
7430                 if (rec->refs != rec->extent_item_refs) {
7431                         fprintf(stderr, "ref mismatch on [%llu %llu] ",
7432                                 (unsigned long long)rec->start,
7433                                 (unsigned long long)rec->nr);
7434                         fprintf(stderr, "extent item %llu, found %llu\n",
7435                                 (unsigned long long)rec->extent_item_refs,
7436                                 (unsigned long long)rec->refs);
7437                         ret = record_orphan_data_extents(root->fs_info, rec);
7438                         if (ret < 0)
7439                                 goto repair_abort;
7440                         if (ret == 0) {
7441                                 recorded = 1;
7442                         } else {
7443                                 /*
7444                                  * we can't use the extent to repair file
7445                                  * extent, let the fallback method handle it.
7446                                  */
7447                                 if (!fixed && repair) {
7448                                         ret = fixup_extent_refs(
7449                                                         root->fs_info,
7450                                                         extent_cache, rec);
7451                                         if (ret)
7452                                                 goto repair_abort;
7453                                         fixed = 1;
7454                                 }
7455                         }
7456                         err = 1;
7457                         cur_err = 1;
7458                 }
7459                 if (all_backpointers_checked(rec, 1)) {
7460                         fprintf(stderr, "backpointer mismatch on [%llu %llu]\n",
7461                                 (unsigned long long)rec->start,
7462                                 (unsigned long long)rec->nr);
7463
7464                         if (!fixed && !recorded && repair) {
7465                                 ret = fixup_extent_refs(root->fs_info,
7466                                                         extent_cache, rec);
7467                                 if (ret)
7468                                         goto repair_abort;
7469                                 fixed = 1;
7470                         }
7471                         cur_err = 1;
7472                         err = 1;
7473                 }
7474                 if (!rec->owner_ref_checked) {
7475                         fprintf(stderr, "owner ref check failed [%llu %llu]\n",
7476                                 (unsigned long long)rec->start,
7477                                 (unsigned long long)rec->nr);
7478                         if (!fixed && !recorded && repair) {
7479                                 ret = fixup_extent_refs(root->fs_info,
7480                                                         extent_cache, rec);
7481                                 if (ret)
7482                                         goto repair_abort;
7483                                 fixed = 1;
7484                         }
7485                         err = 1;
7486                         cur_err = 1;
7487                 }
7488                 if (rec->bad_full_backref) {
7489                         fprintf(stderr, "bad full backref, on [%llu]\n",
7490                                 (unsigned long long)rec->start);
7491                         if (repair) {
7492                                 ret = fixup_extent_flags(root->fs_info, rec);
7493                                 if (ret)
7494                                         goto repair_abort;
7495                                 fixed = 1;
7496                         }
7497                         err = 1;
7498                         cur_err = 1;
7499                 }
7500                 /*
7501                  * Although it's not a extent ref's problem, we reuse this
7502                  * routine for error reporting.
7503                  * No repair function yet.
7504                  */
7505                 if (rec->crossing_stripes) {
7506                         fprintf(stderr,
7507                                 "bad metadata [%llu, %llu) crossing stripe boundary\n",
7508                                 rec->start, rec->start + rec->max_size);
7509                         err = 1;
7510                         cur_err = 1;
7511                 }
7512
7513                 remove_cache_extent(extent_cache, cache);
7514                 free_all_extent_backrefs(rec);
7515                 if (!init_extent_tree && repair && (!cur_err || fixed))
7516                         clear_extent_dirty(root->fs_info->excluded_extents,
7517                                            rec->start,
7518                                            rec->start + rec->max_size - 1,
7519                                            GFP_NOFS);
7520                 free(rec);
7521         }
7522 repair_abort:
7523         if (repair) {
7524                 if (ret && ret != -EAGAIN) {
7525                         fprintf(stderr, "failed to repair damaged filesystem, aborting\n");
7526                         exit(1);
7527                 } else if (!ret) {
7528                         struct btrfs_trans_handle *trans;
7529
7530                         root = root->fs_info->extent_root;
7531                         trans = btrfs_start_transaction(root, 1);
7532                         if (IS_ERR(trans)) {
7533                                 ret = PTR_ERR(trans);
7534                                 goto repair_abort;
7535                         }
7536
7537                         btrfs_fix_block_accounting(trans, root);
7538                         ret = btrfs_commit_transaction(trans, root);
7539                         if (ret)
7540                                 goto repair_abort;
7541                 }
7542                 if (err)
7543                         fprintf(stderr, "repaired damaged extent references\n");
7544                 return ret;
7545         }
7546         return err;
7547 }
7548
7549 u64 calc_stripe_length(u64 type, u64 length, int num_stripes)
7550 {
7551         u64 stripe_size;
7552
7553         if (type & BTRFS_BLOCK_GROUP_RAID0) {
7554                 stripe_size = length;
7555                 stripe_size /= num_stripes;
7556         } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
7557                 stripe_size = length * 2;
7558                 stripe_size /= num_stripes;
7559         } else if (type & BTRFS_BLOCK_GROUP_RAID5) {
7560                 stripe_size = length;
7561                 stripe_size /= (num_stripes - 1);
7562         } else if (type & BTRFS_BLOCK_GROUP_RAID6) {
7563                 stripe_size = length;
7564                 stripe_size /= (num_stripes - 2);
7565         } else {
7566                 stripe_size = length;
7567         }
7568         return stripe_size;
7569 }
7570
7571 /*
7572  * Check the chunk with its block group/dev list ref:
7573  * Return 0 if all refs seems valid.
7574  * Return 1 if part of refs seems valid, need later check for rebuild ref
7575  * like missing block group and needs to search extent tree to rebuild them.
7576  * Return -1 if essential refs are missing and unable to rebuild.
7577  */
7578 static int check_chunk_refs(struct chunk_record *chunk_rec,
7579                             struct block_group_tree *block_group_cache,
7580                             struct device_extent_tree *dev_extent_cache,
7581                             int silent)
7582 {
7583         struct cache_extent *block_group_item;
7584         struct block_group_record *block_group_rec;
7585         struct cache_extent *dev_extent_item;
7586         struct device_extent_record *dev_extent_rec;
7587         u64 devid;
7588         u64 offset;
7589         u64 length;
7590         int metadump_v2 = 0;
7591         int i;
7592         int ret = 0;
7593
7594         block_group_item = lookup_cache_extent(&block_group_cache->tree,
7595                                                chunk_rec->offset,
7596                                                chunk_rec->length);
7597         if (block_group_item) {
7598                 block_group_rec = container_of(block_group_item,
7599                                                struct block_group_record,
7600                                                cache);
7601                 if (chunk_rec->length != block_group_rec->offset ||
7602                     chunk_rec->offset != block_group_rec->objectid ||
7603                     (!metadump_v2 &&
7604                      chunk_rec->type_flags != block_group_rec->flags)) {
7605                         if (!silent)
7606                                 fprintf(stderr,
7607                                         "Chunk[%llu, %u, %llu]: length(%llu), offset(%llu), type(%llu) mismatch with block group[%llu, %u, %llu]: offset(%llu), objectid(%llu), flags(%llu)\n",
7608                                         chunk_rec->objectid,
7609                                         chunk_rec->type,
7610                                         chunk_rec->offset,
7611                                         chunk_rec->length,
7612                                         chunk_rec->offset,
7613                                         chunk_rec->type_flags,
7614                                         block_group_rec->objectid,
7615                                         block_group_rec->type,
7616                                         block_group_rec->offset,
7617                                         block_group_rec->offset,
7618                                         block_group_rec->objectid,
7619                                         block_group_rec->flags);
7620                         ret = -1;
7621                 } else {
7622                         list_del_init(&block_group_rec->list);
7623                         chunk_rec->bg_rec = block_group_rec;
7624                 }
7625         } else {
7626                 if (!silent)
7627                         fprintf(stderr,
7628                                 "Chunk[%llu, %u, %llu]: length(%llu), offset(%llu), type(%llu) is not found in block group\n",
7629                                 chunk_rec->objectid,
7630                                 chunk_rec->type,
7631                                 chunk_rec->offset,
7632                                 chunk_rec->length,
7633                                 chunk_rec->offset,
7634                                 chunk_rec->type_flags);
7635                 ret = 1;
7636         }
7637
7638         if (metadump_v2)
7639                 return ret;
7640
7641         length = calc_stripe_length(chunk_rec->type_flags, chunk_rec->length,
7642                                     chunk_rec->num_stripes);
7643         for (i = 0; i < chunk_rec->num_stripes; ++i) {
7644                 devid = chunk_rec->stripes[i].devid;
7645                 offset = chunk_rec->stripes[i].offset;
7646                 dev_extent_item = lookup_cache_extent2(&dev_extent_cache->tree,
7647                                                        devid, offset, length);
7648                 if (dev_extent_item) {
7649                         dev_extent_rec = container_of(dev_extent_item,
7650                                                 struct device_extent_record,
7651                                                 cache);
7652                         if (dev_extent_rec->objectid != devid ||
7653                             dev_extent_rec->offset != offset ||
7654                             dev_extent_rec->chunk_offset != chunk_rec->offset ||
7655                             dev_extent_rec->length != length) {
7656                                 if (!silent)
7657                                         fprintf(stderr,
7658                                                 "Chunk[%llu, %u, %llu] stripe[%llu, %llu] dismatch dev extent[%llu, %llu, %llu]\n",
7659                                                 chunk_rec->objectid,
7660                                                 chunk_rec->type,
7661                                                 chunk_rec->offset,
7662                                                 chunk_rec->stripes[i].devid,
7663                                                 chunk_rec->stripes[i].offset,
7664                                                 dev_extent_rec->objectid,
7665                                                 dev_extent_rec->offset,
7666                                                 dev_extent_rec->length);
7667                                 ret = -1;
7668                         } else {
7669                                 list_move(&dev_extent_rec->chunk_list,
7670                                           &chunk_rec->dextents);
7671                         }
7672                 } else {
7673                         if (!silent)
7674                                 fprintf(stderr,
7675                                         "Chunk[%llu, %u, %llu] stripe[%llu, %llu] is not found in dev extent\n",
7676                                         chunk_rec->objectid,
7677                                         chunk_rec->type,
7678                                         chunk_rec->offset,
7679                                         chunk_rec->stripes[i].devid,
7680                                         chunk_rec->stripes[i].offset);
7681                         ret = -1;
7682                 }
7683         }
7684         return ret;
7685 }
7686
7687 /* check btrfs_chunk -> btrfs_dev_extent / btrfs_block_group_item */
7688 int check_chunks(struct cache_tree *chunk_cache,
7689                  struct block_group_tree *block_group_cache,
7690                  struct device_extent_tree *dev_extent_cache,
7691                  struct list_head *good, struct list_head *bad,
7692                  struct list_head *rebuild, int silent)
7693 {
7694         struct cache_extent *chunk_item;
7695         struct chunk_record *chunk_rec;
7696         struct block_group_record *bg_rec;
7697         struct device_extent_record *dext_rec;
7698         int err;
7699         int ret = 0;
7700
7701         chunk_item = first_cache_extent(chunk_cache);
7702         while (chunk_item) {
7703                 chunk_rec = container_of(chunk_item, struct chunk_record,
7704                                          cache);
7705                 err = check_chunk_refs(chunk_rec, block_group_cache,
7706                                        dev_extent_cache, silent);
7707                 if (err < 0)
7708                         ret = err;
7709                 if (err == 0 && good)
7710                         list_add_tail(&chunk_rec->list, good);
7711                 if (err > 0 && rebuild)
7712                         list_add_tail(&chunk_rec->list, rebuild);
7713                 if (err < 0 && bad)
7714                         list_add_tail(&chunk_rec->list, bad);
7715                 chunk_item = next_cache_extent(chunk_item);
7716         }
7717
7718         list_for_each_entry(bg_rec, &block_group_cache->block_groups, list) {
7719                 if (!silent)
7720                         fprintf(stderr,
7721                                 "Block group[%llu, %llu] (flags = %llu) didn't find the relative chunk.\n",
7722                                 bg_rec->objectid,
7723                                 bg_rec->offset,
7724                                 bg_rec->flags);
7725                 if (!ret)
7726                         ret = 1;
7727         }
7728
7729         list_for_each_entry(dext_rec, &dev_extent_cache->no_chunk_orphans,
7730                             chunk_list) {
7731                 if (!silent)
7732                         fprintf(stderr,
7733                                 "Device extent[%llu, %llu, %llu] didn't find the relative chunk.\n",
7734                                 dext_rec->objectid,
7735                                 dext_rec->offset,
7736                                 dext_rec->length);
7737                 if (!ret)
7738                         ret = 1;
7739         }
7740         return ret;
7741 }
7742
7743
7744 static int check_device_used(struct device_record *dev_rec,
7745                              struct device_extent_tree *dext_cache)
7746 {
7747         struct cache_extent *cache;
7748         struct device_extent_record *dev_extent_rec;
7749         u64 total_byte = 0;
7750
7751         cache = search_cache_extent2(&dext_cache->tree, dev_rec->devid, 0);
7752         while (cache) {
7753                 dev_extent_rec = container_of(cache,
7754                                               struct device_extent_record,
7755                                               cache);
7756                 if (dev_extent_rec->objectid != dev_rec->devid)
7757                         break;
7758
7759                 list_del_init(&dev_extent_rec->device_list);
7760                 total_byte += dev_extent_rec->length;
7761                 cache = next_cache_extent(cache);
7762         }
7763
7764         if (total_byte != dev_rec->byte_used) {
7765                 fprintf(stderr,
7766                         "Dev extent's total-byte(%llu) is not equal to byte-used(%llu) in dev[%llu, %u, %llu]\n",
7767                         total_byte, dev_rec->byte_used, dev_rec->objectid,
7768                         dev_rec->type, dev_rec->offset);
7769                 return -1;
7770         } else {
7771                 return 0;
7772         }
7773 }
7774
7775 /* check btrfs_dev_item -> btrfs_dev_extent */
7776 static int check_devices(struct rb_root *dev_cache,
7777                          struct device_extent_tree *dev_extent_cache)
7778 {
7779         struct rb_node *dev_node;
7780         struct device_record *dev_rec;
7781         struct device_extent_record *dext_rec;
7782         int err;
7783         int ret = 0;
7784
7785         dev_node = rb_first(dev_cache);
7786         while (dev_node) {
7787                 dev_rec = container_of(dev_node, struct device_record, node);
7788                 err = check_device_used(dev_rec, dev_extent_cache);
7789                 if (err)
7790                         ret = err;
7791
7792                 dev_node = rb_next(dev_node);
7793         }
7794         list_for_each_entry(dext_rec, &dev_extent_cache->no_device_orphans,
7795                             device_list) {
7796                 fprintf(stderr,
7797                         "Device extent[%llu, %llu, %llu] didn't find its device.\n",
7798                         dext_rec->objectid, dext_rec->offset, dext_rec->length);
7799                 if (!ret)
7800                         ret = 1;
7801         }
7802         return ret;
7803 }
7804
7805 static int add_root_item_to_list(struct list_head *head,
7806                                   u64 objectid, u64 bytenr, u64 last_snapshot,
7807                                   u8 level, u8 drop_level,
7808                                   int level_size, struct btrfs_key *drop_key)
7809 {
7810
7811         struct root_item_record *ri_rec;
7812         ri_rec = malloc(sizeof(*ri_rec));
7813         if (!ri_rec)
7814                 return -ENOMEM;
7815         ri_rec->bytenr = bytenr;
7816         ri_rec->objectid = objectid;
7817         ri_rec->level = level;
7818         ri_rec->level_size = level_size;
7819         ri_rec->drop_level = drop_level;
7820         ri_rec->last_snapshot = last_snapshot;
7821         if (drop_key)
7822                 memcpy(&ri_rec->drop_key, drop_key, sizeof(*drop_key));
7823         list_add_tail(&ri_rec->list, head);
7824
7825         return 0;
7826 }
7827
7828 static void free_root_item_list(struct list_head *list)
7829 {
7830         struct root_item_record *ri_rec;
7831
7832         while (!list_empty(list)) {
7833                 ri_rec = list_first_entry(list, struct root_item_record,
7834                                           list);
7835                 list_del_init(&ri_rec->list);
7836                 free(ri_rec);
7837         }
7838 }
7839
7840 static int deal_root_from_list(struct list_head *list,
7841                                struct btrfs_root *root,
7842                                struct block_info *bits,
7843                                int bits_nr,
7844                                struct cache_tree *pending,
7845                                struct cache_tree *seen,
7846                                struct cache_tree *reada,
7847                                struct cache_tree *nodes,
7848                                struct cache_tree *extent_cache,
7849                                struct cache_tree *chunk_cache,
7850                                struct rb_root *dev_cache,
7851                                struct block_group_tree *block_group_cache,
7852                                struct device_extent_tree *dev_extent_cache)
7853 {
7854         int ret = 0;
7855         u64 last;
7856
7857         while (!list_empty(list)) {
7858                 struct root_item_record *rec;
7859                 struct extent_buffer *buf;
7860                 rec = list_entry(list->next,
7861                                  struct root_item_record, list);
7862                 last = 0;
7863                 buf = read_tree_block(root->fs_info->tree_root,
7864                                       rec->bytenr, rec->level_size, 0);
7865                 if (!extent_buffer_uptodate(buf)) {
7866                         free_extent_buffer(buf);
7867                         ret = -EIO;
7868                         break;
7869                 }
7870                 add_root_to_pending(buf, extent_cache, pending,
7871                                     seen, nodes, rec->objectid);
7872                 /*
7873                  * To rebuild extent tree, we need deal with snapshot
7874                  * one by one, otherwise we deal with node firstly which
7875                  * can maximize readahead.
7876                  */
7877                 while (1) {
7878                         ret = run_next_block(root, bits, bits_nr, &last,
7879                                              pending, seen, reada, nodes,
7880                                              extent_cache, chunk_cache,
7881                                              dev_cache, block_group_cache,
7882                                              dev_extent_cache, rec);
7883                         if (ret != 0)
7884                                 break;
7885                 }
7886                 free_extent_buffer(buf);
7887                 list_del(&rec->list);
7888                 free(rec);
7889                 if (ret < 0)
7890                         break;
7891         }
7892         while (ret >= 0) {
7893                 ret = run_next_block(root, bits, bits_nr, &last, pending, seen,
7894                                      reada, nodes, extent_cache, chunk_cache,
7895                                      dev_cache, block_group_cache,
7896                                      dev_extent_cache, NULL);
7897                 if (ret != 0) {
7898                         if (ret > 0)
7899                                 ret = 0;
7900                         break;
7901                 }
7902         }
7903         return ret;
7904 }
7905
7906 static int check_chunks_and_extents(struct btrfs_root *root)
7907 {
7908         struct rb_root dev_cache;
7909         struct cache_tree chunk_cache;
7910         struct block_group_tree block_group_cache;
7911         struct device_extent_tree dev_extent_cache;
7912         struct cache_tree extent_cache;
7913         struct cache_tree seen;
7914         struct cache_tree pending;
7915         struct cache_tree reada;
7916         struct cache_tree nodes;
7917         struct extent_io_tree excluded_extents;
7918         struct cache_tree corrupt_blocks;
7919         struct btrfs_path path;
7920         struct btrfs_key key;
7921         struct btrfs_key found_key;
7922         int ret, err = 0;
7923         struct block_info *bits;
7924         int bits_nr;
7925         struct extent_buffer *leaf;
7926         int slot;
7927         struct btrfs_root_item ri;
7928         struct list_head dropping_trees;
7929         struct list_head normal_trees;
7930         struct btrfs_root *root1;
7931         u64 objectid;
7932         u32 level_size;
7933         u8 level;
7934
7935         dev_cache = RB_ROOT;
7936         cache_tree_init(&chunk_cache);
7937         block_group_tree_init(&block_group_cache);
7938         device_extent_tree_init(&dev_extent_cache);
7939
7940         cache_tree_init(&extent_cache);
7941         cache_tree_init(&seen);
7942         cache_tree_init(&pending);
7943         cache_tree_init(&nodes);
7944         cache_tree_init(&reada);
7945         cache_tree_init(&corrupt_blocks);
7946         extent_io_tree_init(&excluded_extents);
7947         INIT_LIST_HEAD(&dropping_trees);
7948         INIT_LIST_HEAD(&normal_trees);
7949
7950         if (repair) {
7951                 root->fs_info->excluded_extents = &excluded_extents;
7952                 root->fs_info->fsck_extent_cache = &extent_cache;
7953                 root->fs_info->free_extent_hook = free_extent_hook;
7954                 root->fs_info->corrupt_blocks = &corrupt_blocks;
7955         }
7956
7957         bits_nr = 1024;
7958         bits = malloc(bits_nr * sizeof(struct block_info));
7959         if (!bits) {
7960                 perror("malloc");
7961                 exit(1);
7962         }
7963
7964 again:
7965         root1 = root->fs_info->tree_root;
7966         level = btrfs_header_level(root1->node);
7967         ret = add_root_item_to_list(&normal_trees, root1->root_key.objectid,
7968                                     root1->node->start, 0, level, 0,
7969                                     btrfs_level_size(root1, level), NULL);
7970         if (ret < 0)
7971                 goto out;
7972         root1 = root->fs_info->chunk_root;
7973         level = btrfs_header_level(root1->node);
7974         ret = add_root_item_to_list(&normal_trees, root1->root_key.objectid,
7975                                     root1->node->start, 0, level, 0,
7976                                     btrfs_level_size(root1, level), NULL);
7977         if (ret < 0)
7978                 goto out;
7979         btrfs_init_path(&path);
7980         key.offset = 0;
7981         key.objectid = 0;
7982         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
7983         ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
7984                                         &key, &path, 0, 0);
7985         if (ret < 0)
7986                 goto out;
7987         while(1) {
7988                 leaf = path.nodes[0];
7989                 slot = path.slots[0];
7990                 if (slot >= btrfs_header_nritems(path.nodes[0])) {
7991                         ret = btrfs_next_leaf(root, &path);
7992                         if (ret != 0)
7993                                 break;
7994                         leaf = path.nodes[0];
7995                         slot = path.slots[0];
7996                 }
7997                 btrfs_item_key_to_cpu(leaf, &found_key, path.slots[0]);
7998                 if (btrfs_key_type(&found_key) == BTRFS_ROOT_ITEM_KEY) {
7999                         unsigned long offset;
8000                         u64 last_snapshot;
8001
8002                         offset = btrfs_item_ptr_offset(leaf, path.slots[0]);
8003                         read_extent_buffer(leaf, &ri, offset, sizeof(ri));
8004                         last_snapshot = btrfs_root_last_snapshot(&ri);
8005                         if (btrfs_disk_key_objectid(&ri.drop_progress) == 0) {
8006                                 level = btrfs_root_level(&ri);
8007                                 level_size = btrfs_level_size(root, level);
8008                                 ret = add_root_item_to_list(&normal_trees,
8009                                                 found_key.objectid,
8010                                                 btrfs_root_bytenr(&ri),
8011                                                 last_snapshot, level,
8012                                                 0, level_size, NULL);
8013                                 if (ret < 0)
8014                                         goto out;
8015                         } else {
8016                                 level = btrfs_root_level(&ri);
8017                                 level_size = btrfs_level_size(root, level);
8018                                 objectid = found_key.objectid;
8019                                 btrfs_disk_key_to_cpu(&found_key,
8020                                                       &ri.drop_progress);
8021                                 ret = add_root_item_to_list(&dropping_trees,
8022                                                 objectid,
8023                                                 btrfs_root_bytenr(&ri),
8024                                                 last_snapshot, level,
8025                                                 ri.drop_level,
8026                                                 level_size, &found_key);
8027                                 if (ret < 0)
8028                                         goto out;
8029                         }
8030                 }
8031                 path.slots[0]++;
8032         }
8033         btrfs_release_path(&path);
8034
8035         /*
8036          * check_block can return -EAGAIN if it fixes something, please keep
8037          * this in mind when dealing with return values from these functions, if
8038          * we get -EAGAIN we want to fall through and restart the loop.
8039          */
8040         ret = deal_root_from_list(&normal_trees, root, bits, bits_nr, &pending,
8041                                   &seen, &reada, &nodes, &extent_cache,
8042                                   &chunk_cache, &dev_cache, &block_group_cache,
8043                                   &dev_extent_cache);
8044         if (ret < 0) {
8045                 if (ret == -EAGAIN)
8046                         goto loop;
8047                 goto out;
8048         }
8049         ret = deal_root_from_list(&dropping_trees, root, bits, bits_nr,
8050                                   &pending, &seen, &reada, &nodes,
8051                                   &extent_cache, &chunk_cache, &dev_cache,
8052                                   &block_group_cache, &dev_extent_cache);
8053         if (ret < 0) {
8054                 if (ret == -EAGAIN)
8055                         goto loop;
8056                 goto out;
8057         }
8058
8059         err = check_chunks(&chunk_cache, &block_group_cache,
8060                            &dev_extent_cache, NULL, NULL, NULL, 0);
8061         if (err) {
8062                 if (err == -EAGAIN)
8063                         goto loop;
8064                 if (!ret)
8065                         ret = err;
8066         }
8067
8068         ret = check_extent_refs(root, &extent_cache);
8069         if (ret < 0) {
8070                 if (ret == -EAGAIN)
8071                         goto loop;
8072                 goto out;
8073         }
8074
8075         err = check_devices(&dev_cache, &dev_extent_cache);
8076         if (err && !ret)
8077                 ret = err;
8078
8079 out:
8080         if (repair) {
8081                 free_corrupt_blocks_tree(root->fs_info->corrupt_blocks);
8082                 extent_io_tree_cleanup(&excluded_extents);
8083                 root->fs_info->fsck_extent_cache = NULL;
8084                 root->fs_info->free_extent_hook = NULL;
8085                 root->fs_info->corrupt_blocks = NULL;
8086                 root->fs_info->excluded_extents = NULL;
8087         }
8088         free(bits);
8089         free_chunk_cache_tree(&chunk_cache);
8090         free_device_cache_tree(&dev_cache);
8091         free_block_group_tree(&block_group_cache);
8092         free_device_extent_tree(&dev_extent_cache);
8093         free_extent_cache_tree(&seen);
8094         free_extent_cache_tree(&pending);
8095         free_extent_cache_tree(&reada);
8096         free_extent_cache_tree(&nodes);
8097         return ret;
8098 loop:
8099         free_corrupt_blocks_tree(root->fs_info->corrupt_blocks);
8100         free_extent_cache_tree(&seen);
8101         free_extent_cache_tree(&pending);
8102         free_extent_cache_tree(&reada);
8103         free_extent_cache_tree(&nodes);
8104         free_chunk_cache_tree(&chunk_cache);
8105         free_block_group_tree(&block_group_cache);
8106         free_device_cache_tree(&dev_cache);
8107         free_device_extent_tree(&dev_extent_cache);
8108         free_extent_record_cache(root->fs_info, &extent_cache);
8109         free_root_item_list(&normal_trees);
8110         free_root_item_list(&dropping_trees);
8111         extent_io_tree_cleanup(&excluded_extents);
8112         goto again;
8113 }
8114
8115 static int btrfs_fsck_reinit_root(struct btrfs_trans_handle *trans,
8116                            struct btrfs_root *root, int overwrite)
8117 {
8118         struct extent_buffer *c;
8119         struct extent_buffer *old = root->node;
8120         int level;
8121         int ret;
8122         struct btrfs_disk_key disk_key = {0,0,0};
8123
8124         level = 0;
8125
8126         if (overwrite) {
8127                 c = old;
8128                 extent_buffer_get(c);
8129                 goto init;
8130         }
8131         c = btrfs_alloc_free_block(trans, root,
8132                                    btrfs_level_size(root, 0),
8133                                    root->root_key.objectid,
8134                                    &disk_key, level, 0, 0);
8135         if (IS_ERR(c)) {
8136                 c = old;
8137                 extent_buffer_get(c);
8138                 overwrite = 1;
8139         }
8140 init:
8141         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
8142         btrfs_set_header_level(c, level);
8143         btrfs_set_header_bytenr(c, c->start);
8144         btrfs_set_header_generation(c, trans->transid);
8145         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
8146         btrfs_set_header_owner(c, root->root_key.objectid);
8147
8148         write_extent_buffer(c, root->fs_info->fsid,
8149                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
8150
8151         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
8152                             btrfs_header_chunk_tree_uuid(c),
8153                             BTRFS_UUID_SIZE);
8154
8155         btrfs_mark_buffer_dirty(c);
8156         /*
8157          * this case can happen in the following case:
8158          *
8159          * 1.overwrite previous root.
8160          *
8161          * 2.reinit reloc data root, this is because we skip pin
8162          * down reloc data tree before which means we can allocate
8163          * same block bytenr here.
8164          */
8165         if (old->start == c->start) {
8166                 btrfs_set_root_generation(&root->root_item,
8167                                           trans->transid);
8168                 root->root_item.level = btrfs_header_level(root->node);
8169                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
8170                                         &root->root_key, &root->root_item);
8171                 if (ret) {
8172                         free_extent_buffer(c);
8173                         return ret;
8174                 }
8175         }
8176         free_extent_buffer(old);
8177         root->node = c;
8178         add_root_to_dirty_list(root);
8179         return 0;
8180 }
8181
8182 static int pin_down_tree_blocks(struct btrfs_fs_info *fs_info,
8183                                 struct extent_buffer *eb, int tree_root)
8184 {
8185         struct extent_buffer *tmp;
8186         struct btrfs_root_item *ri;
8187         struct btrfs_key key;
8188         u64 bytenr;
8189         u32 leafsize;
8190         int level = btrfs_header_level(eb);
8191         int nritems;
8192         int ret;
8193         int i;
8194
8195         /*
8196          * If we have pinned this block before, don't pin it again.
8197          * This can not only avoid forever loop with broken filesystem
8198          * but also give us some speedups.
8199          */
8200         if (test_range_bit(&fs_info->pinned_extents, eb->start,
8201                            eb->start + eb->len - 1, EXTENT_DIRTY, 0))
8202                 return 0;
8203
8204         btrfs_pin_extent(fs_info, eb->start, eb->len);
8205
8206         leafsize = btrfs_super_leafsize(fs_info->super_copy);
8207         nritems = btrfs_header_nritems(eb);
8208         for (i = 0; i < nritems; i++) {
8209                 if (level == 0) {
8210                         btrfs_item_key_to_cpu(eb, &key, i);
8211                         if (key.type != BTRFS_ROOT_ITEM_KEY)
8212                                 continue;
8213                         /* Skip the extent root and reloc roots */
8214                         if (key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
8215                             key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
8216                             key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
8217                                 continue;
8218                         ri = btrfs_item_ptr(eb, i, struct btrfs_root_item);
8219                         bytenr = btrfs_disk_root_bytenr(eb, ri);
8220
8221                         /*
8222                          * If at any point we start needing the real root we
8223                          * will have to build a stump root for the root we are
8224                          * in, but for now this doesn't actually use the root so
8225                          * just pass in extent_root.
8226                          */
8227                         tmp = read_tree_block(fs_info->extent_root, bytenr,
8228                                               leafsize, 0);
8229                         if (!extent_buffer_uptodate(tmp)) {
8230                                 fprintf(stderr, "Error reading root block\n");
8231                                 return -EIO;
8232                         }
8233                         ret = pin_down_tree_blocks(fs_info, tmp, 0);
8234                         free_extent_buffer(tmp);
8235                         if (ret)
8236                                 return ret;
8237                 } else {
8238                         bytenr = btrfs_node_blockptr(eb, i);
8239
8240                         /* If we aren't the tree root don't read the block */
8241                         if (level == 1 && !tree_root) {
8242                                 btrfs_pin_extent(fs_info, bytenr, leafsize);
8243                                 continue;
8244                         }
8245
8246                         tmp = read_tree_block(fs_info->extent_root, bytenr,
8247                                               leafsize, 0);
8248                         if (!extent_buffer_uptodate(tmp)) {
8249                                 fprintf(stderr, "Error reading tree block\n");
8250                                 return -EIO;
8251                         }
8252                         ret = pin_down_tree_blocks(fs_info, tmp, tree_root);
8253                         free_extent_buffer(tmp);
8254                         if (ret)
8255                                 return ret;
8256                 }
8257         }
8258
8259         return 0;
8260 }
8261
8262 static int pin_metadata_blocks(struct btrfs_fs_info *fs_info)
8263 {
8264         int ret;
8265
8266         ret = pin_down_tree_blocks(fs_info, fs_info->chunk_root->node, 0);
8267         if (ret)
8268                 return ret;
8269
8270         return pin_down_tree_blocks(fs_info, fs_info->tree_root->node, 1);
8271 }
8272
8273 static int reset_block_groups(struct btrfs_fs_info *fs_info)
8274 {
8275         struct btrfs_block_group_cache *cache;
8276         struct btrfs_path *path;
8277         struct extent_buffer *leaf;
8278         struct btrfs_chunk *chunk;
8279         struct btrfs_key key;
8280         int ret;
8281         u64 start;
8282
8283         path = btrfs_alloc_path();
8284         if (!path)
8285                 return -ENOMEM;
8286
8287         key.objectid = 0;
8288         key.type = BTRFS_CHUNK_ITEM_KEY;
8289         key.offset = 0;
8290
8291         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
8292         if (ret < 0) {
8293                 btrfs_free_path(path);
8294                 return ret;
8295         }
8296
8297         /*
8298          * We do this in case the block groups were screwed up and had alloc
8299          * bits that aren't actually set on the chunks.  This happens with
8300          * restored images every time and could happen in real life I guess.
8301          */
8302         fs_info->avail_data_alloc_bits = 0;
8303         fs_info->avail_metadata_alloc_bits = 0;
8304         fs_info->avail_system_alloc_bits = 0;
8305
8306         /* First we need to create the in-memory block groups */
8307         while (1) {
8308                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8309                         ret = btrfs_next_leaf(fs_info->chunk_root, path);
8310                         if (ret < 0) {
8311                                 btrfs_free_path(path);
8312                                 return ret;
8313                         }
8314                         if (ret) {
8315                                 ret = 0;
8316                                 break;
8317                         }
8318                 }
8319                 leaf = path->nodes[0];
8320                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
8321                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
8322                         path->slots[0]++;
8323                         continue;
8324                 }
8325
8326                 chunk = btrfs_item_ptr(leaf, path->slots[0],
8327                                        struct btrfs_chunk);
8328                 btrfs_add_block_group(fs_info, 0,
8329                                       btrfs_chunk_type(leaf, chunk),
8330                                       key.objectid, key.offset,
8331                                       btrfs_chunk_length(leaf, chunk));
8332                 set_extent_dirty(&fs_info->free_space_cache, key.offset,
8333                                  key.offset + btrfs_chunk_length(leaf, chunk),
8334                                  GFP_NOFS);
8335                 path->slots[0]++;
8336         }
8337         start = 0;
8338         while (1) {
8339                 cache = btrfs_lookup_first_block_group(fs_info, start);
8340                 if (!cache)
8341                         break;
8342                 cache->cached = 1;
8343                 start = cache->key.objectid + cache->key.offset;
8344         }
8345
8346         btrfs_free_path(path);
8347         return 0;
8348 }
8349
8350 static int reset_balance(struct btrfs_trans_handle *trans,
8351                          struct btrfs_fs_info *fs_info)
8352 {
8353         struct btrfs_root *root = fs_info->tree_root;
8354         struct btrfs_path *path;
8355         struct extent_buffer *leaf;
8356         struct btrfs_key key;
8357         int del_slot, del_nr = 0;
8358         int ret;
8359         int found = 0;
8360
8361         path = btrfs_alloc_path();
8362         if (!path)
8363                 return -ENOMEM;
8364
8365         key.objectid = BTRFS_BALANCE_OBJECTID;
8366         key.type = BTRFS_BALANCE_ITEM_KEY;
8367         key.offset = 0;
8368
8369         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8370         if (ret) {
8371                 if (ret > 0)
8372                         ret = 0;
8373                 if (!ret)
8374                         goto reinit_data_reloc;
8375                 else
8376                         goto out;
8377         }
8378
8379         ret = btrfs_del_item(trans, root, path);
8380         if (ret)
8381                 goto out;
8382         btrfs_release_path(path);
8383
8384         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
8385         key.type = BTRFS_ROOT_ITEM_KEY;
8386         key.offset = 0;
8387
8388         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8389         if (ret < 0)
8390                 goto out;
8391         while (1) {
8392                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8393                         if (!found)
8394                                 break;
8395
8396                         if (del_nr) {
8397                                 ret = btrfs_del_items(trans, root, path,
8398                                                       del_slot, del_nr);
8399                                 del_nr = 0;
8400                                 if (ret)
8401                                         goto out;
8402                         }
8403                         key.offset++;
8404                         btrfs_release_path(path);
8405
8406                         found = 0;
8407                         ret = btrfs_search_slot(trans, root, &key, path,
8408                                                 -1, 1);
8409                         if (ret < 0)
8410                                 goto out;
8411                         continue;
8412                 }
8413                 found = 1;
8414                 leaf = path->nodes[0];
8415                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
8416                 if (key.objectid > BTRFS_TREE_RELOC_OBJECTID)
8417                         break;
8418                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8419                         path->slots[0]++;
8420                         continue;
8421                 }
8422                 if (!del_nr) {
8423                         del_slot = path->slots[0];
8424                         del_nr = 1;
8425                 } else {
8426                         del_nr++;
8427                 }
8428                 path->slots[0]++;
8429         }
8430
8431         if (del_nr) {
8432                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
8433                 if (ret)
8434                         goto out;
8435         }
8436         btrfs_release_path(path);
8437
8438 reinit_data_reloc:
8439         key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
8440         key.type = BTRFS_ROOT_ITEM_KEY;
8441         key.offset = (u64)-1;
8442         root = btrfs_read_fs_root(fs_info, &key);
8443         if (IS_ERR(root)) {
8444                 fprintf(stderr, "Error reading data reloc tree\n");
8445                 ret = PTR_ERR(root);
8446                 goto out;
8447         }
8448         record_root_in_trans(trans, root);
8449         ret = btrfs_fsck_reinit_root(trans, root, 0);
8450         if (ret)
8451                 goto out;
8452         ret = btrfs_make_root_dir(trans, root, BTRFS_FIRST_FREE_OBJECTID);
8453 out:
8454         btrfs_free_path(path);
8455         return ret;
8456 }
8457
8458 static int reinit_extent_tree(struct btrfs_trans_handle *trans,
8459                               struct btrfs_fs_info *fs_info)
8460 {
8461         u64 start = 0;
8462         int ret;
8463
8464         /*
8465          * The only reason we don't do this is because right now we're just
8466          * walking the trees we find and pinning down their bytes, we don't look
8467          * at any of the leaves.  In order to do mixed groups we'd have to check
8468          * the leaves of any fs roots and pin down the bytes for any file
8469          * extents we find.  Not hard but why do it if we don't have to?
8470          */
8471         if (btrfs_fs_incompat(fs_info, BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)) {
8472                 fprintf(stderr, "We don't support re-initing the extent tree "
8473                         "for mixed block groups yet, please notify a btrfs "
8474                         "developer you want to do this so they can add this "
8475                         "functionality.\n");
8476                 return -EINVAL;
8477         }
8478
8479         /*
8480          * first we need to walk all of the trees except the extent tree and pin
8481          * down the bytes that are in use so we don't overwrite any existing
8482          * metadata.
8483          */
8484         ret = pin_metadata_blocks(fs_info);
8485         if (ret) {
8486                 fprintf(stderr, "error pinning down used bytes\n");
8487                 return ret;
8488         }
8489
8490         /*
8491          * Need to drop all the block groups since we're going to recreate all
8492          * of them again.
8493          */
8494         btrfs_free_block_groups(fs_info);
8495         ret = reset_block_groups(fs_info);
8496         if (ret) {
8497                 fprintf(stderr, "error resetting the block groups\n");
8498                 return ret;
8499         }
8500
8501         /* Ok we can allocate now, reinit the extent root */
8502         ret = btrfs_fsck_reinit_root(trans, fs_info->extent_root, 0);
8503         if (ret) {
8504                 fprintf(stderr, "extent root initialization failed\n");
8505                 /*
8506                  * When the transaction code is updated we should end the
8507                  * transaction, but for now progs only knows about commit so
8508                  * just return an error.
8509                  */
8510                 return ret;
8511         }
8512
8513         /*
8514          * Now we have all the in-memory block groups setup so we can make
8515          * allocations properly, and the metadata we care about is safe since we
8516          * pinned all of it above.
8517          */
8518         while (1) {
8519                 struct btrfs_block_group_cache *cache;
8520
8521                 cache = btrfs_lookup_first_block_group(fs_info, start);
8522                 if (!cache)
8523                         break;
8524                 start = cache->key.objectid + cache->key.offset;
8525                 ret = btrfs_insert_item(trans, fs_info->extent_root,
8526                                         &cache->key, &cache->item,
8527                                         sizeof(cache->item));
8528                 if (ret) {
8529                         fprintf(stderr, "Error adding block group\n");
8530                         return ret;
8531                 }
8532                 btrfs_extent_post_op(trans, fs_info->extent_root);
8533         }
8534
8535         ret = reset_balance(trans, fs_info);
8536         if (ret)
8537                 fprintf(stderr, "error reseting the pending balance\n");
8538
8539         return ret;
8540 }
8541
8542 static int recow_extent_buffer(struct btrfs_root *root, struct extent_buffer *eb)
8543 {
8544         struct btrfs_path *path;
8545         struct btrfs_trans_handle *trans;
8546         struct btrfs_key key;
8547         int ret;
8548
8549         printf("Recowing metadata block %llu\n", eb->start);
8550         key.objectid = btrfs_header_owner(eb);
8551         key.type = BTRFS_ROOT_ITEM_KEY;
8552         key.offset = (u64)-1;
8553
8554         root = btrfs_read_fs_root(root->fs_info, &key);
8555         if (IS_ERR(root)) {
8556                 fprintf(stderr, "Couldn't find owner root %llu\n",
8557                         key.objectid);
8558                 return PTR_ERR(root);
8559         }
8560
8561         path = btrfs_alloc_path();
8562         if (!path)
8563                 return -ENOMEM;
8564
8565         trans = btrfs_start_transaction(root, 1);
8566         if (IS_ERR(trans)) {
8567                 btrfs_free_path(path);
8568                 return PTR_ERR(trans);
8569         }
8570
8571         path->lowest_level = btrfs_header_level(eb);
8572         if (path->lowest_level)
8573                 btrfs_node_key_to_cpu(eb, &key, 0);
8574         else
8575                 btrfs_item_key_to_cpu(eb, &key, 0);
8576
8577         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
8578         btrfs_commit_transaction(trans, root);
8579         btrfs_free_path(path);
8580         return ret;
8581 }
8582
8583 static int delete_bad_item(struct btrfs_root *root, struct bad_item *bad)
8584 {
8585         struct btrfs_path *path;
8586         struct btrfs_trans_handle *trans;
8587         struct btrfs_key key;
8588         int ret;
8589
8590         printf("Deleting bad item [%llu,%u,%llu]\n", bad->key.objectid,
8591                bad->key.type, bad->key.offset);
8592         key.objectid = bad->root_id;
8593         key.type = BTRFS_ROOT_ITEM_KEY;
8594         key.offset = (u64)-1;
8595
8596         root = btrfs_read_fs_root(root->fs_info, &key);
8597         if (IS_ERR(root)) {
8598                 fprintf(stderr, "Couldn't find owner root %llu\n",
8599                         key.objectid);
8600                 return PTR_ERR(root);
8601         }
8602
8603         path = btrfs_alloc_path();
8604         if (!path)
8605                 return -ENOMEM;
8606
8607         trans = btrfs_start_transaction(root, 1);
8608         if (IS_ERR(trans)) {
8609                 btrfs_free_path(path);
8610                 return PTR_ERR(trans);
8611         }
8612
8613         ret = btrfs_search_slot(trans, root, &bad->key, path, -1, 1);
8614         if (ret) {
8615                 if (ret > 0)
8616                         ret = 0;
8617                 goto out;
8618         }
8619         ret = btrfs_del_item(trans, root, path);
8620 out:
8621         btrfs_commit_transaction(trans, root);
8622         btrfs_free_path(path);
8623         return ret;
8624 }
8625
8626 static int zero_log_tree(struct btrfs_root *root)
8627 {
8628         struct btrfs_trans_handle *trans;
8629         int ret;
8630
8631         trans = btrfs_start_transaction(root, 1);
8632         if (IS_ERR(trans)) {
8633                 ret = PTR_ERR(trans);
8634                 return ret;
8635         }
8636         btrfs_set_super_log_root(root->fs_info->super_copy, 0);
8637         btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
8638         ret = btrfs_commit_transaction(trans, root);
8639         return ret;
8640 }
8641
8642 static int populate_csum(struct btrfs_trans_handle *trans,
8643                          struct btrfs_root *csum_root, char *buf, u64 start,
8644                          u64 len)
8645 {
8646         u64 offset = 0;
8647         u64 sectorsize;
8648         int ret = 0;
8649
8650         while (offset < len) {
8651                 sectorsize = csum_root->sectorsize;
8652                 ret = read_extent_data(csum_root, buf, start + offset,
8653                                        &sectorsize, 0);
8654                 if (ret)
8655                         break;
8656                 ret = btrfs_csum_file_block(trans, csum_root, start + len,
8657                                             start + offset, buf, sectorsize);
8658                 if (ret)
8659                         break;
8660                 offset += sectorsize;
8661         }
8662         return ret;
8663 }
8664
8665 static int fill_csum_tree_from_one_fs_root(struct btrfs_trans_handle *trans,
8666                                       struct btrfs_root *csum_root,
8667                                       struct btrfs_root *cur_root)
8668 {
8669         struct btrfs_path *path;
8670         struct btrfs_key key;
8671         struct extent_buffer *node;
8672         struct btrfs_file_extent_item *fi;
8673         char *buf = NULL;
8674         u64 start = 0;
8675         u64 len = 0;
8676         int slot = 0;
8677         int ret = 0;
8678
8679         path = btrfs_alloc_path();
8680         if (!path)
8681                 return -ENOMEM;
8682         buf = malloc(cur_root->fs_info->csum_root->sectorsize);
8683         if (!buf) {
8684                 ret = -ENOMEM;
8685                 goto out;
8686         }
8687
8688         key.objectid = 0;
8689         key.offset = 0;
8690         key.type = 0;
8691
8692         ret = btrfs_search_slot(NULL, cur_root, &key, path, 0, 0);
8693         if (ret < 0)
8694                 goto out;
8695         /* Iterate all regular file extents and fill its csum */
8696         while (1) {
8697                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
8698
8699                 if (key.type != BTRFS_EXTENT_DATA_KEY)
8700                         goto next;
8701                 node = path->nodes[0];
8702                 slot = path->slots[0];
8703                 fi = btrfs_item_ptr(node, slot, struct btrfs_file_extent_item);
8704                 if (btrfs_file_extent_type(node, fi) != BTRFS_FILE_EXTENT_REG)
8705                         goto next;
8706                 start = btrfs_file_extent_disk_bytenr(node, fi);
8707                 len = btrfs_file_extent_disk_num_bytes(node, fi);
8708
8709                 ret = populate_csum(trans, csum_root, buf, start, len);
8710                 if (ret == -EEXIST)
8711                         ret = 0;
8712                 if (ret < 0)
8713                         goto out;
8714 next:
8715                 /*
8716                  * TODO: if next leaf is corrupted, jump to nearest next valid
8717                  * leaf.
8718                  */
8719                 ret = btrfs_next_item(cur_root, path);
8720                 if (ret < 0)
8721                         goto out;
8722                 if (ret > 0) {
8723                         ret = 0;
8724                         goto out;
8725                 }
8726         }
8727
8728 out:
8729         btrfs_free_path(path);
8730         free(buf);
8731         return ret;
8732 }
8733
8734 static int fill_csum_tree_from_fs(struct btrfs_trans_handle *trans,
8735                                   struct btrfs_root *csum_root)
8736 {
8737         struct btrfs_fs_info *fs_info = csum_root->fs_info;
8738         struct btrfs_path *path;
8739         struct btrfs_root *tree_root = fs_info->tree_root;
8740         struct btrfs_root *cur_root;
8741         struct extent_buffer *node;
8742         struct btrfs_key key;
8743         int slot = 0;
8744         int ret = 0;
8745
8746         path = btrfs_alloc_path();
8747         if (!path)
8748                 return -ENOMEM;
8749
8750         key.objectid = BTRFS_FS_TREE_OBJECTID;
8751         key.offset = 0;
8752         key.type = BTRFS_ROOT_ITEM_KEY;
8753
8754         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
8755         if (ret < 0)
8756                 goto out;
8757         if (ret > 0) {
8758                 ret = -ENOENT;
8759                 goto out;
8760         }
8761
8762         while (1) {
8763                 node = path->nodes[0];
8764                 slot = path->slots[0];
8765                 btrfs_item_key_to_cpu(node, &key, slot);
8766                 if (key.objectid > BTRFS_LAST_FREE_OBJECTID)
8767                         goto out;
8768                 if (key.type != BTRFS_ROOT_ITEM_KEY)
8769                         goto next;
8770                 if (!is_fstree(key.objectid))
8771                         goto next;
8772                 key.offset = (u64)-1;
8773
8774                 cur_root = btrfs_read_fs_root(fs_info, &key);
8775                 if (IS_ERR(cur_root) || !cur_root) {
8776                         fprintf(stderr, "Fail to read fs/subvol tree: %lld\n",
8777                                 key.objectid);
8778                         goto out;
8779                 }
8780                 ret = fill_csum_tree_from_one_fs_root(trans, csum_root,
8781                                 cur_root);
8782                 if (ret < 0)
8783                         goto out;
8784 next:
8785                 ret = btrfs_next_item(tree_root, path);
8786                 if (ret > 0) {
8787                         ret = 0;
8788                         goto out;
8789                 }
8790                 if (ret < 0)
8791                         goto out;
8792         }
8793
8794 out:
8795         btrfs_free_path(path);
8796         return ret;
8797 }
8798
8799 static int fill_csum_tree_from_extent(struct btrfs_trans_handle *trans,
8800                                       struct btrfs_root *csum_root)
8801 {
8802         struct btrfs_root *extent_root = csum_root->fs_info->extent_root;
8803         struct btrfs_path *path;
8804         struct btrfs_extent_item *ei;
8805         struct extent_buffer *leaf;
8806         char *buf;
8807         struct btrfs_key key;
8808         int ret;
8809
8810         path = btrfs_alloc_path();
8811         if (!path)
8812                 return -ENOMEM;
8813
8814         key.objectid = 0;
8815         key.type = BTRFS_EXTENT_ITEM_KEY;
8816         key.offset = 0;
8817
8818         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
8819         if (ret < 0) {
8820                 btrfs_free_path(path);
8821                 return ret;
8822         }
8823
8824         buf = malloc(csum_root->sectorsize);
8825         if (!buf) {
8826                 btrfs_free_path(path);
8827                 return -ENOMEM;
8828         }
8829
8830         while (1) {
8831                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8832                         ret = btrfs_next_leaf(extent_root, path);
8833                         if (ret < 0)
8834                                 break;
8835                         if (ret) {
8836                                 ret = 0;
8837                                 break;
8838                         }
8839                 }
8840                 leaf = path->nodes[0];
8841
8842                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
8843                 if (key.type != BTRFS_EXTENT_ITEM_KEY) {
8844                         path->slots[0]++;
8845                         continue;
8846                 }
8847
8848                 ei = btrfs_item_ptr(leaf, path->slots[0],
8849                                     struct btrfs_extent_item);
8850                 if (!(btrfs_extent_flags(leaf, ei) &
8851                       BTRFS_EXTENT_FLAG_DATA)) {
8852                         path->slots[0]++;
8853                         continue;
8854                 }
8855
8856                 ret = populate_csum(trans, csum_root, buf, key.objectid,
8857                                     key.offset);
8858                 if (ret)
8859                         break;
8860                 path->slots[0]++;
8861         }
8862
8863         btrfs_free_path(path);
8864         free(buf);
8865         return ret;
8866 }
8867
/*
 * Recompute data checksums and store them in the csum tree.
 *
 * Re-initialising the extent tree wipes all extent information, so in that
 * situation the extent tree cannot serve as the source.  When
 * @search_fs_tree is non-zero the fs/subvolume trees are scanned instead;
 * otherwise the extent tree is used.
 */
static int fill_csum_tree(struct btrfs_trans_handle *trans,
                          struct btrfs_root *csum_root,
                          int search_fs_tree)
{
        if (!search_fs_tree)
                return fill_csum_tree_from_extent(trans, csum_root);

        return fill_csum_tree_from_fs(trans, csum_root);
}
8884
/*
 * One record of roots_info_cache.  build_roots_info_cache() creates a record
 * per root id, storing the id in cache_extent.start with a size of 1.
 */
struct root_item_info {
        /* level of the root; (u8)-1 is the initial value given to a new record */
        u8 level;
        /* number of nodes at this level, must be 1 for a root */
        int node_count;
        /* presumably the bytenr of the root node -- TODO confirm */
        u64 bytenr;
        /* presumably the generation of the root node -- TODO confirm */
        u64 gen;
        /* linkage into roots_info_cache; free_roots_info_cache() recovers the record from it */
        struct cache_extent cache_extent;
};
8894
/*
 * Cache of root_item_info records.  Allocated on demand by
 * build_roots_info_cache(); freed and reset to NULL by
 * free_roots_info_cache().
 */
static struct cache_tree *roots_info_cache = NULL;
8896
8897 static void free_roots_info_cache(void)
8898 {
8899         if (!roots_info_cache)
8900                 return;
8901
8902         while (!cache_tree_empty(roots_info_cache)) {
8903                 struct cache_extent *entry;
8904                 struct root_item_info *rii;
8905
8906                 entry = first_cache_extent(roots_info_cache);
8907                 if (!entry)
8908                         break;
8909                 remove_cache_extent(roots_info_cache, entry);
8910                 rii = container_of(entry, struct root_item_info, cache_extent);
8911                 free(rii);
8912         }
8913
8914         free(roots_info_cache);
8915         roots_info_cache = NULL;
8916 }
8917
8918 static int build_roots_info_cache(struct btrfs_fs_info *info)
8919 {
8920         int ret = 0;
8921         struct btrfs_key key;
8922         struct extent_buffer *leaf;
8923         struct btrfs_path *path;
8924
8925         if (!roots_info_cache) {
8926                 roots_info_cache = malloc(sizeof(*roots_info_cache));
8927                 if (!roots_info_cache)
8928                         return -ENOMEM;
8929                 cache_tree_init(roots_info_cache);
8930         }
8931
8932         path = btrfs_alloc_path();
8933         if (!path)
8934                 return -ENOMEM;
8935
8936         key.objectid = 0;
8937         key.type = BTRFS_EXTENT_ITEM_KEY;
8938         key.offset = 0;
8939
8940         ret = btrfs_search_slot(NULL, info->extent_root, &key, path, 0, 0);
8941         if (ret < 0)
8942                 goto out;
8943         leaf = path->nodes[0];
8944
8945         while (1) {
8946                 struct btrfs_key found_key;
8947                 struct btrfs_extent_item *ei;
8948                 struct btrfs_extent_inline_ref *iref;
8949                 int slot = path->slots[0];
8950                 int type;
8951                 u64 flags;
8952                 u64 root_id;
8953                 u8 level;
8954                 struct cache_extent *entry;
8955                 struct root_item_info *rii;
8956
8957                 if (slot >= btrfs_header_nritems(leaf)) {
8958                         ret = btrfs_next_leaf(info->extent_root, path);
8959                         if (ret < 0) {
8960                                 break;
8961                         } else if (ret) {
8962                                 ret = 0;
8963                                 break;
8964                         }
8965                         leaf = path->nodes[0];
8966                         slot = path->slots[0];
8967                 }
8968
8969                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8970
8971                 if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
8972                     found_key.type != BTRFS_METADATA_ITEM_KEY)
8973                         goto next;
8974
8975                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
8976                 flags = btrfs_extent_flags(leaf, ei);
8977
8978                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
8979                     !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
8980                         goto next;
8981
8982                 if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
8983                         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
8984                         level = found_key.offset;
8985                 } else {
8986                         struct btrfs_tree_block_info *info;
8987
8988                         info = (struct btrfs_tree_block_info *)(ei + 1);
8989                         iref = (struct btrfs_extent_inline_ref *)(info + 1);
8990                         level = btrfs_tree_block_level(leaf, info);
8991                 }
8992
8993                 /*
8994                  * For a root extent, it must be of the following type and the
8995                  * first (and only one) iref in the item.
8996                  */
8997                 type = btrfs_extent_inline_ref_type(leaf, iref);
8998                 if (type != BTRFS_TREE_BLOCK_REF_KEY)
8999                         goto next;
9000
9001                 root_id = btrfs_extent_inline_ref_offset(leaf, iref);
9002                 entry = lookup_cache_extent(roots_info_cache, root_id, 1);
9003                 if (!entry) {
9004                         rii = malloc(sizeof(struct root_item_info));
9005                         if (!rii) {
9006                                 ret = -ENOMEM;
9007                                 goto out;
9008                         }
9009                         rii->cache_extent.start = root_id;
9010                         rii->cache_extent.size = 1;
9011                         rii->level = (u8)-1;
9012                         entry = &rii->cache_extent;
9013                         ret = insert_cache_extent(roots_info_cache, entry);
9014                         ASSERT(ret == 0);
9015                 } else {
9016                         rii = container_of(entry, struct root_item_info,
9017                                            cache_extent);
9018                 }
9019
9020                 ASSERT(rii->cache_extent.start == root_id);
9021                 ASSERT(rii->cache_extent.size == 1);
9022
9023                 if (level > rii->level || rii->level == (u8)-1) {
9024                         rii->level = level;
9025                         rii->bytenr = found_key.objectid;
9026                         rii->gen = btrfs_extent_generation(leaf, ei);
9027                         rii->node_count = 1;
9028                 } else if (level == rii->level) {
9029                         rii->node_count++;
9030                 }
9031 next:
9032                 path->slots[0]++;
9033         }
9034
9035 out:
9036         btrfs_free_path(path);
9037
9038         return ret;
9039 }
9040
9041 static int maybe_repair_root_item(struct btrfs_fs_info *info,
9042                                   struct btrfs_path *path,
9043                                   const struct btrfs_key *root_key,
9044                                   const int read_only_mode)
9045 {
9046         const u64 root_id = root_key->objectid;
9047         struct cache_extent *entry;
9048         struct root_item_info *rii;
9049         struct btrfs_root_item ri;
9050         unsigned long offset;
9051
9052         entry = lookup_cache_extent(roots_info_cache, root_id, 1);
9053         if (!entry) {
9054                 fprintf(stderr,
9055                         "Error: could not find extent items for root %llu\n",
9056                         root_key->objectid);
9057                 return -ENOENT;
9058         }
9059
9060         rii = container_of(entry, struct root_item_info, cache_extent);
9061         ASSERT(rii->cache_extent.start == root_id);
9062         ASSERT(rii->cache_extent.size == 1);
9063
9064         if (rii->node_count != 1) {
9065                 fprintf(stderr,
9066                         "Error: could not find btree root extent for root %llu\n",
9067                         root_id);
9068                 return -ENOENT;
9069         }
9070
9071         offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
9072         read_extent_buffer(path->nodes[0], &ri, offset, sizeof(ri));
9073
9074         if (btrfs_root_bytenr(&ri) != rii->bytenr ||
9075             btrfs_root_level(&ri) != rii->level ||
9076             btrfs_root_generation(&ri) != rii->gen) {
9077
9078                 /*
9079                  * If we're in repair mode but our caller told us to not update
9080                  * the root item, i.e. just check if it needs to be updated, don't
9081                  * print this message, since the caller will call us again shortly
9082                  * for the same root item without read only mode (the caller will
9083                  * open a transaction first).
9084                  */
9085                 if (!(read_only_mode && repair))
9086                         fprintf(stderr,
9087                                 "%sroot item for root %llu,"
9088                                 " current bytenr %llu, current gen %llu, current level %u,"
9089                                 " new bytenr %llu, new gen %llu, new level %u\n",
9090                                 (read_only_mode ? "" : "fixing "),
9091                                 root_id,
9092                                 btrfs_root_bytenr(&ri), btrfs_root_generation(&ri),
9093                                 btrfs_root_level(&ri),
9094                                 rii->bytenr, rii->gen, rii->level);
9095
9096                 if (btrfs_root_generation(&ri) > rii->gen) {
9097                         fprintf(stderr,
9098                                 "root %llu has a root item with a more recent gen (%llu) compared to the found root node (%llu)\n",
9099                                 root_id, btrfs_root_generation(&ri), rii->gen);
9100                         return -EINVAL;
9101                 }
9102
9103                 if (!read_only_mode) {
9104                         btrfs_set_root_bytenr(&ri, rii->bytenr);
9105                         btrfs_set_root_level(&ri, rii->level);
9106                         btrfs_set_root_generation(&ri, rii->gen);
9107                         write_extent_buffer(path->nodes[0], &ri,
9108                                             offset, sizeof(ri));
9109                 }
9110
9111                 return 1;
9112         }
9113
9114         return 0;
9115 }
9116
9117 /*
9118  * A regression introduced in the 3.17 kernel (more specifically in 3.17-rc2),
9119  * caused read-only snapshots to be corrupted if they were created at a moment
9120  * when the source subvolume/snapshot had orphan items. The issue was that the
9121  * on-disk root items became incorrect, referring to the pre orphan cleanup root
9122  * node instead of the post orphan cleanup root node.
9123  * So this function, and its callees, just detects and fixes those cases. Even
9124  * though the regression was for read-only snapshots, this function applies to
9125  * any snapshot/subvolume root.
9126  * This must be run before any other repair code - not doing it so, makes other
9127  * repair code delete or modify backrefs in the extent tree for example, which
9128  * will result in an inconsistent fs after repairing the root items.
9129  */
9130 static int repair_root_items(struct btrfs_fs_info *info)
9131 {
9132         struct btrfs_path *path = NULL;
9133         struct btrfs_key key;
9134         struct extent_buffer *leaf;
9135         struct btrfs_trans_handle *trans = NULL;
9136         int ret = 0;
9137         int bad_roots = 0;
9138         int need_trans = 0;
9139
9140         ret = build_roots_info_cache(info);
9141         if (ret)
9142                 goto out;
9143
9144         path = btrfs_alloc_path();
9145         if (!path) {
9146                 ret = -ENOMEM;
9147                 goto out;
9148         }
9149
9150         key.objectid = BTRFS_FIRST_FREE_OBJECTID;
9151         key.type = BTRFS_ROOT_ITEM_KEY;
9152         key.offset = 0;
9153
9154 again:
9155         /*
9156          * Avoid opening and committing transactions if a leaf doesn't have
9157          * any root items that need to be fixed, so that we avoid rotating
9158          * backup roots unnecessarily.
9159          */
9160         if (need_trans) {
9161                 trans = btrfs_start_transaction(info->tree_root, 1);
9162                 if (IS_ERR(trans)) {
9163                         ret = PTR_ERR(trans);
9164                         goto out;
9165                 }
9166         }
9167
9168         ret = btrfs_search_slot(trans, info->tree_root, &key, path,
9169                                 0, trans ? 1 : 0);
9170         if (ret < 0)
9171                 goto out;
9172         leaf = path->nodes[0];
9173
9174         while (1) {
9175                 struct btrfs_key found_key;
9176
9177                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
9178                         int no_more_keys = find_next_key(path, &key);
9179
9180                         btrfs_release_path(path);
9181                         if (trans) {
9182                                 ret = btrfs_commit_transaction(trans,
9183                                                                info->tree_root);
9184                                 trans = NULL;
9185                                 if (ret < 0)
9186                                         goto out;
9187                         }
9188                         need_trans = 0;
9189                         if (no_more_keys)
9190                                 break;
9191                         goto again;
9192                 }
9193
9194                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9195
9196                 if (found_key.type != BTRFS_ROOT_ITEM_KEY)
9197                         goto next;
9198                 if (found_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
9199                         goto next;
9200
9201                 ret = maybe_repair_root_item(info, path, &found_key,
9202                                              trans ? 0 : 1);
9203                 if (ret < 0)
9204                         goto out;
9205                 if (ret) {
9206                         if (!trans && repair) {
9207                                 need_trans = 1;
9208                                 key = found_key;
9209                                 btrfs_release_path(path);
9210                                 goto again;
9211                         }
9212                         bad_roots++;
9213                 }
9214 next:
9215                 path->slots[0]++;
9216         }
9217         ret = 0;
9218 out:
9219         free_roots_info_cache();
9220         if (path)
9221                 btrfs_free_path(path);
9222         if (trans)
9223                 btrfs_commit_transaction(trans, info->tree_root);
9224         if (ret < 0)
9225                 return ret;
9226
9227         return bad_roots;
9228 }
9229
/*
 * Usage text for "btrfs check", NULL terminated. The first two entries are
 * the synopsis and a one-line description; the entries after the empty
 * string describe the options accepted by cmd_check().
 */
const char * const cmd_check_usage[] = {
        "btrfs check [options] <device>",
        "Check an unmounted btrfs filesystem.",
        "",
        "-s|--super <superblock>     use this superblock copy",
        "-b|--backup                 use the backup root copy",
        "--repair                    try to repair the filesystem",
        "--readonly                  run in read-only mode (default)",
        "--init-csum-tree            create a new CRC tree",
        "--init-extent-tree          create a new extent tree",
        "--check-data-csum           verify checksums of data blocks",
        "--qgroup-report             print a report on qgroup consistency",
        "--subvol-extents <subvolid> print subvolume extents and sharing state",
        "--tree-root <bytenr>        use the given bytenr for the tree root",
        NULL
};
9245
9246 int cmd_check(int argc, char **argv)
9247 {
9248         struct cache_tree root_cache;
9249         struct btrfs_root *root;
9250         struct btrfs_fs_info *info;
9251         u64 bytenr = 0;
9252         u64 subvolid = 0;
9253         u64 tree_root_bytenr = 0;
9254         char uuidbuf[BTRFS_UUID_UNPARSED_SIZE];
9255         int ret;
9256         u64 num;
9257         int init_csum_tree = 0;
9258         int readonly = 0;
9259         int qgroup_report = 0;
9260         enum btrfs_open_ctree_flags ctree_flags = OPEN_CTREE_EXCLUSIVE;
9261
9262         while(1) {
9263                 int c;
9264                 enum { OPT_REPAIR = 257, OPT_INIT_CSUM, OPT_INIT_EXTENT,
9265                         OPT_CHECK_CSUM, OPT_READONLY };
9266                 static const struct option long_options[] = {
9267                         { "super", required_argument, NULL, 's' },
9268                         { "repair", no_argument, NULL, OPT_REPAIR },
9269                         { "readonly", no_argument, NULL, OPT_READONLY },
9270                         { "init-csum-tree", no_argument, NULL, OPT_INIT_CSUM },
9271                         { "init-extent-tree", no_argument, NULL, OPT_INIT_EXTENT },
9272                         { "check-data-csum", no_argument, NULL, OPT_CHECK_CSUM },
9273                         { "backup", no_argument, NULL, 'b' },
9274                         { "subvol-extents", required_argument, NULL, 'E' },
9275                         { "qgroup-report", no_argument, NULL, 'Q' },
9276                         { "tree-root", required_argument, NULL, 'r' },
9277                         { NULL, 0, NULL, 0}
9278                 };
9279
9280                 c = getopt_long(argc, argv, "as:br:", long_options, NULL);
9281                 if (c < 0)
9282                         break;
9283                 switch(c) {
9284                         case 'a': /* ignored */ break;
9285                         case 'b':
9286                                 ctree_flags |= OPEN_CTREE_BACKUP_ROOT;
9287                                 break;
9288                         case 's':
9289                                 num = arg_strtou64(optarg);
9290                                 if (num >= BTRFS_SUPER_MIRROR_MAX) {
9291                                         fprintf(stderr,
9292                                                 "ERROR: super mirror should be less than: %d\n",
9293                                                 BTRFS_SUPER_MIRROR_MAX);
9294                                         exit(1);
9295                                 }
9296                                 bytenr = btrfs_sb_offset(((int)num));
9297                                 printf("using SB copy %llu, bytenr %llu\n", num,
9298                                        (unsigned long long)bytenr);
9299                                 break;
9300                         case 'Q':
9301                                 qgroup_report = 1;
9302                                 break;
9303                         case 'E':
9304                                 subvolid = arg_strtou64(optarg);
9305                                 break;
9306                         case 'r':
9307                                 tree_root_bytenr = arg_strtou64(optarg);
9308                                 break;
9309                         case '?':
9310                         case 'h':
9311                                 usage(cmd_check_usage);
9312                         case OPT_REPAIR:
9313                                 printf("enabling repair mode\n");
9314                                 repair = 1;
9315                                 ctree_flags |= OPEN_CTREE_WRITES;
9316                                 break;
9317                         case OPT_READONLY:
9318                                 readonly = 1;
9319                                 break;
9320                         case OPT_INIT_CSUM:
9321                                 printf("Creating a new CRC tree\n");
9322                                 init_csum_tree = 1;
9323                                 repair = 1;
9324                                 ctree_flags |= OPEN_CTREE_WRITES;
9325                                 break;
9326                         case OPT_INIT_EXTENT:
9327                                 init_extent_tree = 1;
9328                                 ctree_flags |= (OPEN_CTREE_WRITES |
9329                                                 OPEN_CTREE_NO_BLOCK_GROUPS);
9330                                 repair = 1;
9331                                 break;
9332                         case OPT_CHECK_CSUM:
9333                                 check_data_csum = 1;
9334                                 break;
9335                 }
9336         }
9337         argc = argc - optind;
9338
9339         if (check_argc_exact(argc, 1))
9340                 usage(cmd_check_usage);
9341
9342         /* This check is the only reason for --readonly to exist */
9343         if (readonly && repair) {
9344                 fprintf(stderr, "Repair options are not compatible with --readonly\n");
9345                 exit(1);
9346         }
9347
9348         radix_tree_init();
9349         cache_tree_init(&root_cache);
9350
9351         if((ret = check_mounted(argv[optind])) < 0) {
9352                 fprintf(stderr, "Could not check mount status: %s\n", strerror(-ret));
9353                 goto err_out;
9354         } else if(ret) {
9355                 fprintf(stderr, "%s is currently mounted. Aborting.\n", argv[optind]);
9356                 ret = -EBUSY;
9357                 goto err_out;
9358         }
9359
9360         /* only allow partial opening under repair mode */
9361         if (repair)
9362                 ctree_flags |= OPEN_CTREE_PARTIAL;
9363
9364         info = open_ctree_fs_info(argv[optind], bytenr, tree_root_bytenr,
9365                                   ctree_flags);
9366         if (!info) {
9367                 fprintf(stderr, "Couldn't open file system\n");
9368                 ret = -EIO;
9369                 goto err_out;
9370         }
9371
9372         root = info->fs_root;
9373
9374         /*
9375          * repair mode will force us to commit transaction which
9376          * will make us fail to load log tree when mounting.
9377          */
9378         if (repair && btrfs_super_log_root(info->super_copy)) {
9379                 ret = ask_user("repair mode will force to clear out log tree, Are you sure?");
9380                 if (!ret) {
9381                         ret = 1;
9382                         goto close_out;
9383                 }
9384                 ret = zero_log_tree(root);
9385                 if (ret) {
9386                         fprintf(stderr, "fail to zero log tree\n");
9387                         goto close_out;
9388                 }
9389         }
9390
9391         uuid_unparse(info->super_copy->fsid, uuidbuf);
9392         if (qgroup_report) {
9393                 printf("Print quota groups for %s\nUUID: %s\n", argv[optind],
9394                        uuidbuf);
9395                 ret = qgroup_verify_all(info);
9396                 if (ret == 0)
9397                         print_qgroup_report(1);
9398                 goto close_out;
9399         }
9400         if (subvolid) {
9401                 printf("Print extent state for subvolume %llu on %s\nUUID: %s\n",
9402                        subvolid, argv[optind], uuidbuf);
9403                 ret = print_extent_state(info, subvolid);
9404                 goto close_out;
9405         }
9406         printf("Checking filesystem on %s\nUUID: %s\n", argv[optind], uuidbuf);
9407
9408         if (!extent_buffer_uptodate(info->tree_root->node) ||
9409             !extent_buffer_uptodate(info->dev_root->node) ||
9410             !extent_buffer_uptodate(info->chunk_root->node)) {
9411                 fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
9412                 ret = -EIO;
9413                 goto close_out;
9414         }
9415
9416         if (init_extent_tree || init_csum_tree) {
9417                 struct btrfs_trans_handle *trans;
9418
9419                 trans = btrfs_start_transaction(info->extent_root, 0);
9420                 if (IS_ERR(trans)) {
9421                         fprintf(stderr, "Error starting transaction\n");
9422                         ret = PTR_ERR(trans);
9423                         goto close_out;
9424                 }
9425
9426                 if (init_extent_tree) {
9427                         printf("Creating a new extent tree\n");
9428                         ret = reinit_extent_tree(trans, info);
9429                         if (ret)
9430                                 goto close_out;
9431                 }
9432
9433                 if (init_csum_tree) {
9434                         fprintf(stderr, "Reinit crc root\n");
9435                         ret = btrfs_fsck_reinit_root(trans, info->csum_root, 0);
9436                         if (ret) {
9437                                 fprintf(stderr, "crc root initialization failed\n");
9438                                 ret = -EIO;
9439                                 goto close_out;
9440                         }
9441
9442                         ret = fill_csum_tree(trans, info->csum_root,
9443                                              init_extent_tree);
9444                         if (ret) {
9445                                 fprintf(stderr, "crc refilling failed\n");
9446                                 return -EIO;
9447                         }
9448                 }
9449                 /*
9450                  * Ok now we commit and run the normal fsck, which will add
9451                  * extent entries for all of the items it finds.
9452                  */
9453                 ret = btrfs_commit_transaction(trans, info->extent_root);
9454                 if (ret)
9455                         goto close_out;
9456         }
9457         if (!extent_buffer_uptodate(info->extent_root->node)) {
9458                 fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
9459                 ret = -EIO;
9460                 goto close_out;
9461         }
9462         if (!extent_buffer_uptodate(info->csum_root->node)) {
9463                 fprintf(stderr, "Checksum root corrupted, rerun with --init-csum-tree option\n");
9464                 ret = -EIO;
9465                 goto close_out;
9466         }
9467
9468         fprintf(stderr, "checking extents\n");
9469         ret = check_chunks_and_extents(root);
9470         if (ret)
9471                 fprintf(stderr, "Errors found in extent allocation tree or chunk allocation\n");
9472
9473         ret = repair_root_items(info);
9474         if (ret < 0)
9475                 goto close_out;
9476         if (repair) {
9477                 fprintf(stderr, "Fixed %d roots.\n", ret);
9478                 ret = 0;
9479         } else if (ret > 0) {
9480                 fprintf(stderr,
9481                        "Found %d roots with an outdated root item.\n",
9482                        ret);
9483                 fprintf(stderr,
9484                         "Please run a filesystem check with the option --repair to fix them.\n");
9485                 ret = 1;
9486                 goto close_out;
9487         }
9488
9489         fprintf(stderr, "checking free space cache\n");
9490         ret = check_space_cache(root);
9491         if (ret)
9492                 goto out;
9493
9494         /*
9495          * We used to have to have these hole extents in between our real
9496          * extents so if we don't have this flag set we need to make sure there
9497          * are no gaps in the file extents for inodes, otherwise we can just
9498          * ignore it when this happens.
9499          */
9500         no_holes = btrfs_fs_incompat(root->fs_info,
9501                                      BTRFS_FEATURE_INCOMPAT_NO_HOLES);
9502         fprintf(stderr, "checking fs roots\n");
9503         ret = check_fs_roots(root, &root_cache);
9504         if (ret)
9505                 goto out;
9506
9507         fprintf(stderr, "checking csums\n");
9508         ret = check_csums(root);
9509         if (ret)
9510                 goto out;
9511
9512         fprintf(stderr, "checking root refs\n");
9513         ret = check_root_refs(root, &root_cache);
9514         if (ret)
9515                 goto out;
9516
9517         while (repair && !list_empty(&root->fs_info->recow_ebs)) {
9518                 struct extent_buffer *eb;
9519
9520                 eb = list_first_entry(&root->fs_info->recow_ebs,
9521                                       struct extent_buffer, recow);
9522                 list_del_init(&eb->recow);
9523                 ret = recow_extent_buffer(root, eb);
9524                 if (ret)
9525                         break;
9526         }
9527
9528         while (!list_empty(&delete_items)) {
9529                 struct bad_item *bad;
9530
9531                 bad = list_first_entry(&delete_items, struct bad_item, list);
9532                 list_del_init(&bad->list);
9533                 if (repair)
9534                         ret = delete_bad_item(root, bad);
9535                 free(bad);
9536         }
9537
9538         if (info->quota_enabled) {
9539                 int err;
9540                 fprintf(stderr, "checking quota groups\n");
9541                 err = qgroup_verify_all(info);
9542                 if (err)
9543                         goto out;
9544         }
9545
9546         if (!list_empty(&root->fs_info->recow_ebs)) {
9547                 fprintf(stderr, "Transid errors in file system\n");
9548                 ret = 1;
9549         }
9550 out:
9551         print_qgroup_report(0);
9552         if (found_old_backref) { /*
9553                  * there was a disk format change when mixed
9554                  * backref was in testing tree. The old format
9555                  * existed about one week.
9556                  */
9557                 printf("\n * Found old mixed backref format. "
9558                        "The old format is not supported! *"
9559                        "\n * Please mount the FS in readonly mode, "
9560                        "backup data and re-format the FS. *\n\n");
9561                 ret = 1;
9562         }
9563         printf("found %llu bytes used err is %d\n",
9564                (unsigned long long)bytes_used, ret);
9565         printf("total csum bytes: %llu\n",(unsigned long long)total_csum_bytes);
9566         printf("total tree bytes: %llu\n",
9567                (unsigned long long)total_btree_bytes);
9568         printf("total fs tree bytes: %llu\n",
9569                (unsigned long long)total_fs_tree_bytes);
9570         printf("total extent tree bytes: %llu\n",
9571                (unsigned long long)total_extent_tree_bytes);
9572         printf("btree space waste bytes: %llu\n",
9573                (unsigned long long)btree_space_waste);
9574         printf("file data blocks allocated: %llu\n referenced %llu\n",
9575                 (unsigned long long)data_bytes_allocated,
9576                 (unsigned long long)data_bytes_referenced);
9577         printf("%s\n", PACKAGE_STRING);
9578
9579         free_root_recs_tree(&root_cache);
9580 close_out:
9581         close_ctree(root);
9582 err_out:
9583         return ret;
9584 }