btrfs-progs: Recover btree by dropping corrupted leaf/node.
[platform/upstream/btrfs-progs.git] / cmds-check.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #define _XOPEN_SOURCE 500
20 #define _GNU_SOURCE 1
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <fcntl.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <unistd.h>
28 #include <getopt.h>
29 #include <uuid/uuid.h>
30 #include "ctree.h"
31 #include "volumes.h"
32 #include "repair.h"
33 #include "disk-io.h"
34 #include "print-tree.h"
35 #include "transaction.h"
36 #include "version.h"
37 #include "utils.h"
38 #include "commands.h"
39 #include "free-space-cache.h"
40 #include "btrfsck.h"
41 #include "qgroup-verify.h"
42 #include "rbtree-utils.h"
43 #include "backref.h"
44 #include "ulist.h"
45
46 static u64 bytes_used = 0;
47 static u64 total_csum_bytes = 0;
48 static u64 total_btree_bytes = 0;
49 static u64 total_fs_tree_bytes = 0;
50 static u64 total_extent_tree_bytes = 0;
51 static u64 btree_space_waste = 0;
52 static u64 data_bytes_allocated = 0;
53 static u64 data_bytes_referenced = 0;
54 static int found_old_backref = 0;
55 static LIST_HEAD(duplicate_extents);
56 static LIST_HEAD(delete_items);
57 static int repair = 0;
58 static int no_holes = 0;
59 static int init_extent_tree = 0;
60 static int check_data_csum = 0;
61
62 struct extent_backref {
63         struct list_head list;
64         unsigned int is_data:1;
65         unsigned int found_extent_tree:1;
66         unsigned int full_backref:1;
67         unsigned int found_ref:1;
68         unsigned int broken:1;
69 };
70
71 struct data_backref {
72         struct extent_backref node;
73         union {
74                 u64 parent;
75                 u64 root;
76         };
77         u64 owner;
78         u64 offset;
79         u64 disk_bytenr;
80         u64 bytes;
81         u64 ram_bytes;
82         u32 num_refs;
83         u32 found_ref;
84 };
85
86 struct tree_backref {
87         struct extent_backref node;
88         union {
89                 u64 parent;
90                 u64 root;
91         };
92 };
93
94 struct extent_record {
95         struct list_head backrefs;
96         struct list_head dups;
97         struct list_head list;
98         struct cache_extent cache;
99         struct btrfs_disk_key parent_key;
100         u64 start;
101         u64 max_size;
102         u64 nr;
103         u64 refs;
104         u64 extent_item_refs;
105         u64 generation;
106         u64 parent_generation;
107         u64 info_objectid;
108         u32 num_duplicates;
109         u8 info_level;
110         unsigned int found_rec:1;
111         unsigned int content_checked:1;
112         unsigned int owner_ref_checked:1;
113         unsigned int is_root:1;
114         unsigned int metadata:1;
115 };
116
117 struct inode_backref {
118         struct list_head list;
119         unsigned int found_dir_item:1;
120         unsigned int found_dir_index:1;
121         unsigned int found_inode_ref:1;
122         unsigned int filetype:8;
123         int errors;
124         unsigned int ref_type;
125         u64 dir;
126         u64 index;
127         u16 namelen;
128         char name[0];
129 };
130
131 struct dropping_root_item_record {
132         struct list_head list;
133         struct btrfs_root_item ri;
134         struct btrfs_key found_key;
135 };
136
/*
 * Bit flags stored in inode_backref.errors and root_backref.errors;
 * print_ref_error() renders them as text.
 */
#define REF_ERR_NO_DIR_ITEM             (1 << 0)
#define REF_ERR_NO_DIR_INDEX            (1 << 1)
#define REF_ERR_NO_INODE_REF            (1 << 2)
#define REF_ERR_DUP_DIR_ITEM            (1 << 3)
#define REF_ERR_DUP_DIR_INDEX           (1 << 4)
#define REF_ERR_DUP_INODE_REF           (1 << 5)
#define REF_ERR_INDEX_UNMATCH           (1 << 6)
#define REF_ERR_FILETYPE_UNMATCH        (1 << 7)
#define REF_ERR_NAME_TOO_LONG           (1 << 8)        /* 0x100 */
#define REF_ERR_NO_ROOT_REF             (1 << 9)
#define REF_ERR_NO_ROOT_BACKREF         (1 << 10)
#define REF_ERR_DUP_ROOT_REF            (1 << 11)
#define REF_ERR_DUP_ROOT_BACKREF        (1 << 12)
150
151 struct inode_record {
152         struct list_head backrefs;
153         unsigned int checked:1;
154         unsigned int merging:1;
155         unsigned int found_inode_item:1;
156         unsigned int found_dir_item:1;
157         unsigned int found_file_extent:1;
158         unsigned int found_csum_item:1;
159         unsigned int some_csum_missing:1;
160         unsigned int nodatasum:1;
161         int errors;
162
163         u64 ino;
164         u32 nlink;
165         u32 imode;
166         u64 isize;
167         u64 nbytes;
168
169         u32 found_link;
170         u64 found_size;
171         u64 extent_start;
172         u64 extent_end;
173         u64 first_extent_gap;
174
175         u32 refs;
176 };
177
/*
 * Bit flags stored in inode_record.errors; print_inode_error() renders
 * them as text.
 */
#define I_ERR_NO_INODE_ITEM             (1 << 0)
#define I_ERR_NO_ORPHAN_ITEM            (1 << 1)
#define I_ERR_DUP_INODE_ITEM            (1 << 2)
#define I_ERR_DUP_DIR_INDEX             (1 << 3)
#define I_ERR_ODD_DIR_ITEM              (1 << 4)
#define I_ERR_ODD_FILE_EXTENT           (1 << 5)
#define I_ERR_BAD_FILE_EXTENT           (1 << 6)
#define I_ERR_FILE_EXTENT_OVERLAP       (1 << 7)
#define I_ERR_FILE_EXTENT_DISCOUNT      (1 << 8)        /* 0x100 */
#define I_ERR_DIR_ISIZE_WRONG           (1 << 9)
#define I_ERR_FILE_NBYTES_WRONG         (1 << 10)       /* 0x400 */
#define I_ERR_ODD_CSUM_ITEM             (1 << 11)
#define I_ERR_SOME_CSUM_MISSING         (1 << 12)
#define I_ERR_LINK_COUNT_WRONG          (1 << 13)
192
193 struct root_backref {
194         struct list_head list;
195         unsigned int found_dir_item:1;
196         unsigned int found_dir_index:1;
197         unsigned int found_back_ref:1;
198         unsigned int found_forward_ref:1;
199         unsigned int reachable:1;
200         int errors;
201         u64 ref_root;
202         u64 dir;
203         u64 index;
204         u16 namelen;
205         char name[0];
206 };
207
208 struct root_record {
209         struct list_head backrefs;
210         struct cache_extent cache;
211         unsigned int found_root_item:1;
212         u64 objectid;
213         u32 found_ref;
214 };
215
216 struct ptr_node {
217         struct cache_extent cache;
218         void *data;
219 };
220
221 struct shared_node {
222         struct cache_extent cache;
223         struct cache_tree root_cache;
224         struct cache_tree inode_cache;
225         struct inode_record *current;
226         u32 refs;
227 };
228
229 struct block_info {
230         u64 start;
231         u32 size;
232 };
233
234 struct walk_control {
235         struct cache_tree shared;
236         struct shared_node *nodes[BTRFS_MAX_LEVEL];
237         int active_node;
238         int root_level;
239 };
240
241 struct bad_item {
242         struct btrfs_key key;
243         u64 root_id;
244         struct list_head list;
245 };
246
/* Forward declaration; the definition lives elsewhere in this file. */
static void reset_cached_block_groups(struct btrfs_fs_info *fs_info);
248
249 static void record_root_in_trans(struct btrfs_trans_handle *trans,
250                                  struct btrfs_root *root)
251 {
252         if (root->last_trans != trans->transid) {
253                 root->track_dirty = 1;
254                 root->last_trans = trans->transid;
255                 root->commit_root = root->node;
256                 extent_buffer_get(root->node);
257         }
258 }
259
260 static u8 imode_to_type(u32 imode)
261 {
262 #define S_SHIFT 12
263         static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
264                 [S_IFREG >> S_SHIFT]    = BTRFS_FT_REG_FILE,
265                 [S_IFDIR >> S_SHIFT]    = BTRFS_FT_DIR,
266                 [S_IFCHR >> S_SHIFT]    = BTRFS_FT_CHRDEV,
267                 [S_IFBLK >> S_SHIFT]    = BTRFS_FT_BLKDEV,
268                 [S_IFIFO >> S_SHIFT]    = BTRFS_FT_FIFO,
269                 [S_IFSOCK >> S_SHIFT]   = BTRFS_FT_SOCK,
270                 [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
271         };
272
273         return btrfs_type_by_mode[(imode & S_IFMT) >> S_SHIFT];
274 #undef S_SHIFT
275 }
276
277 static int device_record_compare(struct rb_node *node1, struct rb_node *node2)
278 {
279         struct device_record *rec1;
280         struct device_record *rec2;
281
282         rec1 = rb_entry(node1, struct device_record, node);
283         rec2 = rb_entry(node2, struct device_record, node);
284         if (rec1->devid > rec2->devid)
285                 return -1;
286         else if (rec1->devid < rec2->devid)
287                 return 1;
288         else
289                 return 0;
290 }
291
292 static struct inode_record *clone_inode_rec(struct inode_record *orig_rec)
293 {
294         struct inode_record *rec;
295         struct inode_backref *backref;
296         struct inode_backref *orig;
297         size_t size;
298
299         rec = malloc(sizeof(*rec));
300         memcpy(rec, orig_rec, sizeof(*rec));
301         rec->refs = 1;
302         INIT_LIST_HEAD(&rec->backrefs);
303
304         list_for_each_entry(orig, &orig_rec->backrefs, list) {
305                 size = sizeof(*orig) + orig->namelen + 1;
306                 backref = malloc(size);
307                 memcpy(backref, orig, size);
308                 list_add_tail(&backref->list, &rec->backrefs);
309         }
310         return rec;
311 }
312
313 static void print_inode_error(struct btrfs_root *root, struct inode_record *rec)
314 {
315         u64 root_objectid = root->root_key.objectid;
316         int errors = rec->errors;
317
318         if (!errors)
319                 return;
320         /* reloc root errors, we print its corresponding fs root objectid*/
321         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
322                 root_objectid = root->root_key.offset;
323                 fprintf(stderr, "reloc");
324         }
325         fprintf(stderr, "root %llu inode %llu errors %x",
326                 (unsigned long long) root_objectid,
327                 (unsigned long long) rec->ino, rec->errors);
328
329         if (errors & I_ERR_NO_INODE_ITEM)
330                 fprintf(stderr, ", no inode item");
331         if (errors & I_ERR_NO_ORPHAN_ITEM)
332                 fprintf(stderr, ", no orphan item");
333         if (errors & I_ERR_DUP_INODE_ITEM)
334                 fprintf(stderr, ", dup inode item");
335         if (errors & I_ERR_DUP_DIR_INDEX)
336                 fprintf(stderr, ", dup dir index");
337         if (errors & I_ERR_ODD_DIR_ITEM)
338                 fprintf(stderr, ", odd dir item");
339         if (errors & I_ERR_ODD_FILE_EXTENT)
340                 fprintf(stderr, ", odd file extent");
341         if (errors & I_ERR_BAD_FILE_EXTENT)
342                 fprintf(stderr, ", bad file extent");
343         if (errors & I_ERR_FILE_EXTENT_OVERLAP)
344                 fprintf(stderr, ", file extent overlap");
345         if (errors & I_ERR_FILE_EXTENT_DISCOUNT)
346                 fprintf(stderr, ", file extent discount");
347         if (errors & I_ERR_DIR_ISIZE_WRONG)
348                 fprintf(stderr, ", dir isize wrong");
349         if (errors & I_ERR_FILE_NBYTES_WRONG)
350                 fprintf(stderr, ", nbytes wrong");
351         if (errors & I_ERR_ODD_CSUM_ITEM)
352                 fprintf(stderr, ", odd csum item");
353         if (errors & I_ERR_SOME_CSUM_MISSING)
354                 fprintf(stderr, ", some csum missing");
355         if (errors & I_ERR_LINK_COUNT_WRONG)
356                 fprintf(stderr, ", link count wrong");
357         fprintf(stderr, "\n");
358 }
359
360 static void print_ref_error(int errors)
361 {
362         if (errors & REF_ERR_NO_DIR_ITEM)
363                 fprintf(stderr, ", no dir item");
364         if (errors & REF_ERR_NO_DIR_INDEX)
365                 fprintf(stderr, ", no dir index");
366         if (errors & REF_ERR_NO_INODE_REF)
367                 fprintf(stderr, ", no inode ref");
368         if (errors & REF_ERR_DUP_DIR_ITEM)
369                 fprintf(stderr, ", dup dir item");
370         if (errors & REF_ERR_DUP_DIR_INDEX)
371                 fprintf(stderr, ", dup dir index");
372         if (errors & REF_ERR_DUP_INODE_REF)
373                 fprintf(stderr, ", dup inode ref");
374         if (errors & REF_ERR_INDEX_UNMATCH)
375                 fprintf(stderr, ", index unmatch");
376         if (errors & REF_ERR_FILETYPE_UNMATCH)
377                 fprintf(stderr, ", filetype unmatch");
378         if (errors & REF_ERR_NAME_TOO_LONG)
379                 fprintf(stderr, ", name too long");
380         if (errors & REF_ERR_NO_ROOT_REF)
381                 fprintf(stderr, ", no root ref");
382         if (errors & REF_ERR_NO_ROOT_BACKREF)
383                 fprintf(stderr, ", no root backref");
384         if (errors & REF_ERR_DUP_ROOT_REF)
385                 fprintf(stderr, ", dup root ref");
386         if (errors & REF_ERR_DUP_ROOT_BACKREF)
387                 fprintf(stderr, ", dup root backref");
388         fprintf(stderr, "\n");
389 }
390
391 static struct inode_record *get_inode_rec(struct cache_tree *inode_cache,
392                                           u64 ino, int mod)
393 {
394         struct ptr_node *node;
395         struct cache_extent *cache;
396         struct inode_record *rec = NULL;
397         int ret;
398
399         cache = lookup_cache_extent(inode_cache, ino, 1);
400         if (cache) {
401                 node = container_of(cache, struct ptr_node, cache);
402                 rec = node->data;
403                 if (mod && rec->refs > 1) {
404                         node->data = clone_inode_rec(rec);
405                         rec->refs--;
406                         rec = node->data;
407                 }
408         } else if (mod) {
409                 rec = calloc(1, sizeof(*rec));
410                 rec->ino = ino;
411                 rec->extent_start = (u64)-1;
412                 rec->first_extent_gap = (u64)-1;
413                 rec->refs = 1;
414                 INIT_LIST_HEAD(&rec->backrefs);
415
416                 node = malloc(sizeof(*node));
417                 node->cache.start = ino;
418                 node->cache.size = 1;
419                 node->data = rec;
420
421                 if (ino == BTRFS_FREE_INO_OBJECTID)
422                         rec->found_link = 1;
423
424                 ret = insert_cache_extent(inode_cache, &node->cache);
425                 BUG_ON(ret);
426         }
427         return rec;
428 }
429
430 static void free_inode_rec(struct inode_record *rec)
431 {
432         struct inode_backref *backref;
433
434         if (--rec->refs > 0)
435                 return;
436
437         while (!list_empty(&rec->backrefs)) {
438                 backref = list_entry(rec->backrefs.next,
439                                      struct inode_backref, list);
440                 list_del(&backref->list);
441                 free(backref);
442         }
443         free(rec);
444 }
445
446 static int can_free_inode_rec(struct inode_record *rec)
447 {
448         if (!rec->errors && rec->checked && rec->found_inode_item &&
449             rec->nlink == rec->found_link && list_empty(&rec->backrefs))
450                 return 1;
451         return 0;
452 }
453
454 static void maybe_free_inode_rec(struct cache_tree *inode_cache,
455                                  struct inode_record *rec)
456 {
457         struct cache_extent *cache;
458         struct inode_backref *tmp, *backref;
459         struct ptr_node *node;
460         unsigned char filetype;
461
462         if (!rec->found_inode_item)
463                 return;
464
465         filetype = imode_to_type(rec->imode);
466         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
467                 if (backref->found_dir_item && backref->found_dir_index) {
468                         if (backref->filetype != filetype)
469                                 backref->errors |= REF_ERR_FILETYPE_UNMATCH;
470                         if (!backref->errors && backref->found_inode_ref) {
471                                 list_del(&backref->list);
472                                 free(backref);
473                         }
474                 }
475         }
476
477         if (!rec->checked || rec->merging)
478                 return;
479
480         if (S_ISDIR(rec->imode)) {
481                 if (rec->found_size != rec->isize)
482                         rec->errors |= I_ERR_DIR_ISIZE_WRONG;
483                 if (rec->found_file_extent)
484                         rec->errors |= I_ERR_ODD_FILE_EXTENT;
485         } else if (S_ISREG(rec->imode) || S_ISLNK(rec->imode)) {
486                 if (rec->found_dir_item)
487                         rec->errors |= I_ERR_ODD_DIR_ITEM;
488                 if (rec->found_size != rec->nbytes)
489                         rec->errors |= I_ERR_FILE_NBYTES_WRONG;
490                 if (rec->extent_start == (u64)-1 || rec->extent_start > 0)
491                         rec->first_extent_gap = 0;
492                 if (rec->nlink > 0 && !no_holes &&
493                     (rec->extent_end < rec->isize ||
494                      rec->first_extent_gap < rec->isize))
495                         rec->errors |= I_ERR_FILE_EXTENT_DISCOUNT;
496         }
497
498         if (S_ISREG(rec->imode) || S_ISLNK(rec->imode)) {
499                 if (rec->found_csum_item && rec->nodatasum)
500                         rec->errors |= I_ERR_ODD_CSUM_ITEM;
501                 if (rec->some_csum_missing && !rec->nodatasum)
502                         rec->errors |= I_ERR_SOME_CSUM_MISSING;
503         }
504
505         BUG_ON(rec->refs != 1);
506         if (can_free_inode_rec(rec)) {
507                 cache = lookup_cache_extent(inode_cache, rec->ino, 1);
508                 node = container_of(cache, struct ptr_node, cache);
509                 BUG_ON(node->data != rec);
510                 remove_cache_extent(inode_cache, &node->cache);
511                 free(node);
512                 free_inode_rec(rec);
513         }
514 }
515
516 static int check_orphan_item(struct btrfs_root *root, u64 ino)
517 {
518         struct btrfs_path path;
519         struct btrfs_key key;
520         int ret;
521
522         key.objectid = BTRFS_ORPHAN_OBJECTID;
523         key.type = BTRFS_ORPHAN_ITEM_KEY;
524         key.offset = ino;
525
526         btrfs_init_path(&path);
527         ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
528         btrfs_release_path(&path);
529         if (ret > 0)
530                 ret = -ENOENT;
531         return ret;
532 }
533
534 static int process_inode_item(struct extent_buffer *eb,
535                               int slot, struct btrfs_key *key,
536                               struct shared_node *active_node)
537 {
538         struct inode_record *rec;
539         struct btrfs_inode_item *item;
540
541         rec = active_node->current;
542         BUG_ON(rec->ino != key->objectid || rec->refs > 1);
543         if (rec->found_inode_item) {
544                 rec->errors |= I_ERR_DUP_INODE_ITEM;
545                 return 1;
546         }
547         item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
548         rec->nlink = btrfs_inode_nlink(eb, item);
549         rec->isize = btrfs_inode_size(eb, item);
550         rec->nbytes = btrfs_inode_nbytes(eb, item);
551         rec->imode = btrfs_inode_mode(eb, item);
552         if (btrfs_inode_flags(eb, item) & BTRFS_INODE_NODATASUM)
553                 rec->nodatasum = 1;
554         rec->found_inode_item = 1;
555         if (rec->nlink == 0)
556                 rec->errors |= I_ERR_NO_ORPHAN_ITEM;
557         maybe_free_inode_rec(&active_node->inode_cache, rec);
558         return 0;
559 }
560
561 static struct inode_backref *get_inode_backref(struct inode_record *rec,
562                                                 const char *name,
563                                                 int namelen, u64 dir)
564 {
565         struct inode_backref *backref;
566
567         list_for_each_entry(backref, &rec->backrefs, list) {
568                 if (rec->ino == BTRFS_MULTIPLE_OBJECTIDS)
569                         break;
570                 if (backref->dir != dir || backref->namelen != namelen)
571                         continue;
572                 if (memcmp(name, backref->name, namelen))
573                         continue;
574                 return backref;
575         }
576
577         backref = malloc(sizeof(*backref) + namelen + 1);
578         memset(backref, 0, sizeof(*backref));
579         backref->dir = dir;
580         backref->namelen = namelen;
581         memcpy(backref->name, name, namelen);
582         backref->name[namelen] = '\0';
583         list_add_tail(&backref->list, &rec->backrefs);
584         return backref;
585 }
586
587 static int add_inode_backref(struct cache_tree *inode_cache,
588                              u64 ino, u64 dir, u64 index,
589                              const char *name, int namelen,
590                              int filetype, int itemtype, int errors)
591 {
592         struct inode_record *rec;
593         struct inode_backref *backref;
594
595         rec = get_inode_rec(inode_cache, ino, 1);
596         backref = get_inode_backref(rec, name, namelen, dir);
597         if (errors)
598                 backref->errors |= errors;
599         if (itemtype == BTRFS_DIR_INDEX_KEY) {
600                 if (backref->found_dir_index)
601                         backref->errors |= REF_ERR_DUP_DIR_INDEX;
602                 if (backref->found_inode_ref && backref->index != index)
603                         backref->errors |= REF_ERR_INDEX_UNMATCH;
604                 if (backref->found_dir_item && backref->filetype != filetype)
605                         backref->errors |= REF_ERR_FILETYPE_UNMATCH;
606
607                 backref->index = index;
608                 backref->filetype = filetype;
609                 backref->found_dir_index = 1;
610         } else if (itemtype == BTRFS_DIR_ITEM_KEY) {
611                 rec->found_link++;
612                 if (backref->found_dir_item)
613                         backref->errors |= REF_ERR_DUP_DIR_ITEM;
614                 if (backref->found_dir_index && backref->filetype != filetype)
615                         backref->errors |= REF_ERR_FILETYPE_UNMATCH;
616
617                 backref->filetype = filetype;
618                 backref->found_dir_item = 1;
619         } else if ((itemtype == BTRFS_INODE_REF_KEY) ||
620                    (itemtype == BTRFS_INODE_EXTREF_KEY)) {
621                 if (backref->found_inode_ref)
622                         backref->errors |= REF_ERR_DUP_INODE_REF;
623                 if (backref->found_dir_index && backref->index != index)
624                         backref->errors |= REF_ERR_INDEX_UNMATCH;
625                 else
626                         backref->index = index;
627
628                 backref->ref_type = itemtype;
629                 backref->found_inode_ref = 1;
630         } else {
631                 BUG_ON(1);
632         }
633
634         maybe_free_inode_rec(inode_cache, rec);
635         return 0;
636 }
637
638 static int merge_inode_recs(struct inode_record *src, struct inode_record *dst,
639                             struct cache_tree *dst_cache)
640 {
641         struct inode_backref *backref;
642         u32 dir_count = 0;
643
644         dst->merging = 1;
645         list_for_each_entry(backref, &src->backrefs, list) {
646                 if (backref->found_dir_index) {
647                         add_inode_backref(dst_cache, dst->ino, backref->dir,
648                                         backref->index, backref->name,
649                                         backref->namelen, backref->filetype,
650                                         BTRFS_DIR_INDEX_KEY, backref->errors);
651                 }
652                 if (backref->found_dir_item) {
653                         dir_count++;
654                         add_inode_backref(dst_cache, dst->ino,
655                                         backref->dir, 0, backref->name,
656                                         backref->namelen, backref->filetype,
657                                         BTRFS_DIR_ITEM_KEY, backref->errors);
658                 }
659                 if (backref->found_inode_ref) {
660                         add_inode_backref(dst_cache, dst->ino,
661                                         backref->dir, backref->index,
662                                         backref->name, backref->namelen, 0,
663                                         backref->ref_type, backref->errors);
664                 }
665         }
666
667         if (src->found_dir_item)
668                 dst->found_dir_item = 1;
669         if (src->found_file_extent)
670                 dst->found_file_extent = 1;
671         if (src->found_csum_item)
672                 dst->found_csum_item = 1;
673         if (src->some_csum_missing)
674                 dst->some_csum_missing = 1;
675         if (dst->first_extent_gap > src->first_extent_gap)
676                 dst->first_extent_gap = src->first_extent_gap;
677
678         BUG_ON(src->found_link < dir_count);
679         dst->found_link += src->found_link - dir_count;
680         dst->found_size += src->found_size;
681         if (src->extent_start != (u64)-1) {
682                 if (dst->extent_start == (u64)-1) {
683                         dst->extent_start = src->extent_start;
684                         dst->extent_end = src->extent_end;
685                 } else {
686                         if (dst->extent_end > src->extent_start)
687                                 dst->errors |= I_ERR_FILE_EXTENT_OVERLAP;
688                         else if (dst->extent_end < src->extent_start &&
689                                  dst->extent_end < dst->first_extent_gap)
690                                 dst->first_extent_gap = dst->extent_end;
691                         if (dst->extent_end < src->extent_end)
692                                 dst->extent_end = src->extent_end;
693                 }
694         }
695
696         dst->errors |= src->errors;
697         if (src->found_inode_item) {
698                 if (!dst->found_inode_item) {
699                         dst->nlink = src->nlink;
700                         dst->isize = src->isize;
701                         dst->nbytes = src->nbytes;
702                         dst->imode = src->imode;
703                         dst->nodatasum = src->nodatasum;
704                         dst->found_inode_item = 1;
705                 } else {
706                         dst->errors |= I_ERR_DUP_INODE_ITEM;
707                 }
708         }
709         dst->merging = 0;
710
711         return 0;
712 }
713
714 static int splice_shared_node(struct shared_node *src_node,
715                               struct shared_node *dst_node)
716 {
717         struct cache_extent *cache;
718         struct ptr_node *node, *ins;
719         struct cache_tree *src, *dst;
720         struct inode_record *rec, *conflict;
721         u64 current_ino = 0;
722         int splice = 0;
723         int ret;
724
725         if (--src_node->refs == 0)
726                 splice = 1;
727         if (src_node->current)
728                 current_ino = src_node->current->ino;
729
730         src = &src_node->root_cache;
731         dst = &dst_node->root_cache;
732 again:
733         cache = search_cache_extent(src, 0);
734         while (cache) {
735                 node = container_of(cache, struct ptr_node, cache);
736                 rec = node->data;
737                 cache = next_cache_extent(cache);
738
739                 if (splice) {
740                         remove_cache_extent(src, &node->cache);
741                         ins = node;
742                 } else {
743                         ins = malloc(sizeof(*ins));
744                         ins->cache.start = node->cache.start;
745                         ins->cache.size = node->cache.size;
746                         ins->data = rec;
747                         rec->refs++;
748                 }
749                 ret = insert_cache_extent(dst, &ins->cache);
750                 if (ret == -EEXIST) {
751                         conflict = get_inode_rec(dst, rec->ino, 1);
752                         merge_inode_recs(rec, conflict, dst);
753                         if (rec->checked) {
754                                 conflict->checked = 1;
755                                 if (dst_node->current == conflict)
756                                         dst_node->current = NULL;
757                         }
758                         maybe_free_inode_rec(dst, conflict);
759                         free_inode_rec(rec);
760                         free(ins);
761                 } else {
762                         BUG_ON(ret);
763                 }
764         }
765
766         if (src == &src_node->root_cache) {
767                 src = &src_node->inode_cache;
768                 dst = &dst_node->inode_cache;
769                 goto again;
770         }
771
772         if (current_ino > 0 && (!dst_node->current ||
773             current_ino > dst_node->current->ino)) {
774                 if (dst_node->current) {
775                         dst_node->current->checked = 1;
776                         maybe_free_inode_rec(dst, dst_node->current);
777                 }
778                 dst_node->current = get_inode_rec(dst, current_ino, 1);
779         }
780         return 0;
781 }
782
783 static void free_inode_ptr(struct cache_extent *cache)
784 {
785         struct ptr_node *node;
786         struct inode_record *rec;
787
788         node = container_of(cache, struct ptr_node, cache);
789         rec = node->data;
790         free_inode_rec(rec);
791         free(node);
792 }
793
794 FREE_EXTENT_CACHE_BASED_TREE(inode_recs, free_inode_ptr);
795
796 static struct shared_node *find_shared_node(struct cache_tree *shared,
797                                             u64 bytenr)
798 {
799         struct cache_extent *cache;
800         struct shared_node *node;
801
802         cache = lookup_cache_extent(shared, bytenr, 1);
803         if (cache) {
804                 node = container_of(cache, struct shared_node, cache);
805                 return node;
806         }
807         return NULL;
808 }
809
810 static int add_shared_node(struct cache_tree *shared, u64 bytenr, u32 refs)
811 {
812         int ret;
813         struct shared_node *node;
814
815         node = calloc(1, sizeof(*node));
816         node->cache.start = bytenr;
817         node->cache.size = 1;
818         cache_tree_init(&node->root_cache);
819         cache_tree_init(&node->inode_cache);
820         node->refs = refs;
821
822         ret = insert_cache_extent(shared, &node->cache);
823         BUG_ON(ret);
824         return 0;
825 }
826
827 static int enter_shared_node(struct btrfs_root *root, u64 bytenr, u32 refs,
828                              struct walk_control *wc, int level)
829 {
830         struct shared_node *node;
831         struct shared_node *dest;
832
833         if (level == wc->active_node)
834                 return 0;
835
836         BUG_ON(wc->active_node <= level);
837         node = find_shared_node(&wc->shared, bytenr);
838         if (!node) {
839                 add_shared_node(&wc->shared, bytenr, refs);
840                 node = find_shared_node(&wc->shared, bytenr);
841                 wc->nodes[level] = node;
842                 wc->active_node = level;
843                 return 0;
844         }
845
846         if (wc->root_level == wc->active_node &&
847             btrfs_root_refs(&root->root_item) == 0) {
848                 if (--node->refs == 0) {
849                         free_inode_recs_tree(&node->root_cache);
850                         free_inode_recs_tree(&node->inode_cache);
851                         remove_cache_extent(&wc->shared, &node->cache);
852                         free(node);
853                 }
854                 return 1;
855         }
856
857         dest = wc->nodes[wc->active_node];
858         splice_shared_node(node, dest);
859         if (node->refs == 0) {
860                 remove_cache_extent(&wc->shared, &node->cache);
861                 free(node);
862         }
863         return 1;
864 }
865
866 static int leave_shared_node(struct btrfs_root *root,
867                              struct walk_control *wc, int level)
868 {
869         struct shared_node *node;
870         struct shared_node *dest;
871         int i;
872
873         if (level == wc->root_level)
874                 return 0;
875
876         for (i = level + 1; i < BTRFS_MAX_LEVEL; i++) {
877                 if (wc->nodes[i])
878                         break;
879         }
880         BUG_ON(i >= BTRFS_MAX_LEVEL);
881
882         node = wc->nodes[wc->active_node];
883         wc->nodes[wc->active_node] = NULL;
884         wc->active_node = i;
885
886         dest = wc->nodes[wc->active_node];
887         if (wc->active_node < wc->root_level ||
888             btrfs_root_refs(&root->root_item) > 0) {
889                 BUG_ON(node->refs <= 1);
890                 splice_shared_node(node, dest);
891         } else {
892                 BUG_ON(node->refs < 2);
893                 node->refs--;
894         }
895         return 0;
896 }
897
898 /*
899  * Returns:
900  * < 0 - on error
901  * 1   - if the root with id child_root_id is a child of root parent_root_id
902  * 0   - if the root child_root_id isn't a child of the root parent_root_id but
903  *       has other root(s) as parent(s)
904  * 2   - if the root child_root_id doesn't have any parent roots
905  */
906 static int is_child_root(struct btrfs_root *root, u64 parent_root_id,
907                          u64 child_root_id)
908 {
909         struct btrfs_path path;
910         struct btrfs_key key;
911         struct extent_buffer *leaf;
912         int has_parent = 0;
913         int ret;
914
915         btrfs_init_path(&path);
916
917         key.objectid = parent_root_id;
918         key.type = BTRFS_ROOT_REF_KEY;
919         key.offset = child_root_id;
920         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
921                                 0, 0);
922         if (ret < 0)
923                 return ret;
924         btrfs_release_path(&path);
925         if (!ret)
926                 return 1;
927
928         key.objectid = child_root_id;
929         key.type = BTRFS_ROOT_BACKREF_KEY;
930         key.offset = 0;
931         ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, &path,
932                                 0, 0);
933         if (ret < 0)
934                 goto out;
935
936         while (1) {
937                 leaf = path.nodes[0];
938                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
939                         ret = btrfs_next_leaf(root->fs_info->tree_root, &path);
940                         if (ret)
941                                 break;
942                         leaf = path.nodes[0];
943                 }
944
945                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
946                 if (key.objectid != child_root_id ||
947                     key.type != BTRFS_ROOT_BACKREF_KEY)
948                         break;
949
950                 has_parent = 1;
951
952                 if (key.offset == parent_root_id) {
953                         btrfs_release_path(&path);
954                         return 1;
955                 }
956
957                 path.slots[0]++;
958         }
959 out:
960         btrfs_release_path(&path);
961         if (ret < 0)
962                 return ret;
963         return has_parent ? 0 : 2;
964 }
965
966 static int process_dir_item(struct btrfs_root *root,
967                             struct extent_buffer *eb,
968                             int slot, struct btrfs_key *key,
969                             struct shared_node *active_node)
970 {
971         u32 total;
972         u32 cur = 0;
973         u32 len;
974         u32 name_len;
975         u32 data_len;
976         int error;
977         int nritems = 0;
978         int filetype;
979         struct btrfs_dir_item *di;
980         struct inode_record *rec;
981         struct cache_tree *root_cache;
982         struct cache_tree *inode_cache;
983         struct btrfs_key location;
984         char namebuf[BTRFS_NAME_LEN];
985
986         root_cache = &active_node->root_cache;
987         inode_cache = &active_node->inode_cache;
988         rec = active_node->current;
989         rec->found_dir_item = 1;
990
991         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
992         total = btrfs_item_size_nr(eb, slot);
993         while (cur < total) {
994                 nritems++;
995                 btrfs_dir_item_key_to_cpu(eb, di, &location);
996                 name_len = btrfs_dir_name_len(eb, di);
997                 data_len = btrfs_dir_data_len(eb, di);
998                 filetype = btrfs_dir_type(eb, di);
999
1000                 rec->found_size += name_len;
1001                 if (name_len <= BTRFS_NAME_LEN) {
1002                         len = name_len;
1003                         error = 0;
1004                 } else {
1005                         len = BTRFS_NAME_LEN;
1006                         error = REF_ERR_NAME_TOO_LONG;
1007                 }
1008                 read_extent_buffer(eb, namebuf, (unsigned long)(di + 1), len);
1009
1010                 if (location.type == BTRFS_INODE_ITEM_KEY) {
1011                         add_inode_backref(inode_cache, location.objectid,
1012                                           key->objectid, key->offset, namebuf,
1013                                           len, filetype, key->type, error);
1014                 } else if (location.type == BTRFS_ROOT_ITEM_KEY) {
1015                         add_inode_backref(root_cache, location.objectid,
1016                                           key->objectid, key->offset,
1017                                           namebuf, len, filetype,
1018                                           key->type, error);
1019                 } else {
1020                         fprintf(stderr, "invalid location in dir item %u\n",
1021                                 location.type);
1022                         add_inode_backref(inode_cache, BTRFS_MULTIPLE_OBJECTIDS,
1023                                           key->objectid, key->offset, namebuf,
1024                                           len, filetype, key->type, error);
1025                 }
1026
1027                 len = sizeof(*di) + name_len + data_len;
1028                 di = (struct btrfs_dir_item *)((char *)di + len);
1029                 cur += len;
1030         }
1031         if (key->type == BTRFS_DIR_INDEX_KEY && nritems > 1)
1032                 rec->errors |= I_ERR_DUP_DIR_INDEX;
1033
1034         return 0;
1035 }
1036
1037 static int process_inode_ref(struct extent_buffer *eb,
1038                              int slot, struct btrfs_key *key,
1039                              struct shared_node *active_node)
1040 {
1041         u32 total;
1042         u32 cur = 0;
1043         u32 len;
1044         u32 name_len;
1045         u64 index;
1046         int error;
1047         struct cache_tree *inode_cache;
1048         struct btrfs_inode_ref *ref;
1049         char namebuf[BTRFS_NAME_LEN];
1050
1051         inode_cache = &active_node->inode_cache;
1052
1053         ref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
1054         total = btrfs_item_size_nr(eb, slot);
1055         while (cur < total) {
1056                 name_len = btrfs_inode_ref_name_len(eb, ref);
1057                 index = btrfs_inode_ref_index(eb, ref);
1058                 if (name_len <= BTRFS_NAME_LEN) {
1059                         len = name_len;
1060                         error = 0;
1061                 } else {
1062                         len = BTRFS_NAME_LEN;
1063                         error = REF_ERR_NAME_TOO_LONG;
1064                 }
1065                 read_extent_buffer(eb, namebuf, (unsigned long)(ref + 1), len);
1066                 add_inode_backref(inode_cache, key->objectid, key->offset,
1067                                   index, namebuf, len, 0, key->type, error);
1068
1069                 len = sizeof(*ref) + name_len;
1070                 ref = (struct btrfs_inode_ref *)((char *)ref + len);
1071                 cur += len;
1072         }
1073         return 0;
1074 }
1075
1076 static int process_inode_extref(struct extent_buffer *eb,
1077                                 int slot, struct btrfs_key *key,
1078                                 struct shared_node *active_node)
1079 {
1080         u32 total;
1081         u32 cur = 0;
1082         u32 len;
1083         u32 name_len;
1084         u64 index;
1085         u64 parent;
1086         int error;
1087         struct cache_tree *inode_cache;
1088         struct btrfs_inode_extref *extref;
1089         char namebuf[BTRFS_NAME_LEN];
1090
1091         inode_cache = &active_node->inode_cache;
1092
1093         extref = btrfs_item_ptr(eb, slot, struct btrfs_inode_extref);
1094         total = btrfs_item_size_nr(eb, slot);
1095         while (cur < total) {
1096                 name_len = btrfs_inode_extref_name_len(eb, extref);
1097                 index = btrfs_inode_extref_index(eb, extref);
1098                 parent = btrfs_inode_extref_parent(eb, extref);
1099                 if (name_len <= BTRFS_NAME_LEN) {
1100                         len = name_len;
1101                         error = 0;
1102                 } else {
1103                         len = BTRFS_NAME_LEN;
1104                         error = REF_ERR_NAME_TOO_LONG;
1105                 }
1106                 read_extent_buffer(eb, namebuf,
1107                                    (unsigned long)(extref + 1), len);
1108                 add_inode_backref(inode_cache, key->objectid, parent,
1109                                   index, namebuf, len, 0, key->type, error);
1110
1111                 len = sizeof(*extref) + name_len;
1112                 extref = (struct btrfs_inode_extref *)((char *)extref + len);
1113                 cur += len;
1114         }
1115         return 0;
1116
1117 }
1118
1119 static int count_csum_range(struct btrfs_root *root, u64 start,
1120                             u64 len, u64 *found)
1121 {
1122         struct btrfs_key key;
1123         struct btrfs_path path;
1124         struct extent_buffer *leaf;
1125         int ret;
1126         size_t size;
1127         *found = 0;
1128         u64 csum_end;
1129         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
1130
1131         btrfs_init_path(&path);
1132
1133         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1134         key.offset = start;
1135         key.type = BTRFS_EXTENT_CSUM_KEY;
1136
1137         ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
1138                                 &key, &path, 0, 0);
1139         if (ret < 0)
1140                 goto out;
1141         if (ret > 0 && path.slots[0] > 0) {
1142                 leaf = path.nodes[0];
1143                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0] - 1);
1144                 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
1145                     key.type == BTRFS_EXTENT_CSUM_KEY)
1146                         path.slots[0]--;
1147         }
1148
1149         while (len > 0) {
1150                 leaf = path.nodes[0];
1151                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
1152                         ret = btrfs_next_leaf(root->fs_info->csum_root, &path);
1153                         if (ret > 0)
1154                                 break;
1155                         else if (ret < 0)
1156                                 goto out;
1157                         leaf = path.nodes[0];
1158                 }
1159
1160                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1161                 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
1162                     key.type != BTRFS_EXTENT_CSUM_KEY)
1163                         break;
1164
1165                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
1166                 if (key.offset >= start + len)
1167                         break;
1168
1169                 if (key.offset > start)
1170                         start = key.offset;
1171
1172                 size = btrfs_item_size_nr(leaf, path.slots[0]);
1173                 csum_end = key.offset + (size / csum_size) * root->sectorsize;
1174                 if (csum_end > start) {
1175                         size = min(csum_end - start, len);
1176                         len -= size;
1177                         start += size;
1178                         *found += size;
1179                 }
1180
1181                 path.slots[0]++;
1182         }
1183 out:
1184         if (ret < 0)
1185                 return ret;
1186         btrfs_release_path(&path);
1187         return 0;
1188 }
1189
1190 static int process_file_extent(struct btrfs_root *root,
1191                                 struct extent_buffer *eb,
1192                                 int slot, struct btrfs_key *key,
1193                                 struct shared_node *active_node)
1194 {
1195         struct inode_record *rec;
1196         struct btrfs_file_extent_item *fi;
1197         u64 num_bytes = 0;
1198         u64 disk_bytenr = 0;
1199         u64 extent_offset = 0;
1200         u64 mask = root->sectorsize - 1;
1201         int extent_type;
1202         int ret;
1203
1204         rec = active_node->current;
1205         BUG_ON(rec->ino != key->objectid || rec->refs > 1);
1206         rec->found_file_extent = 1;
1207
1208         if (rec->extent_start == (u64)-1) {
1209                 rec->extent_start = key->offset;
1210                 rec->extent_end = key->offset;
1211         }
1212
1213         if (rec->extent_end > key->offset)
1214                 rec->errors |= I_ERR_FILE_EXTENT_OVERLAP;
1215         else if (rec->extent_end < key->offset &&
1216                  rec->extent_end < rec->first_extent_gap)
1217                 rec->first_extent_gap = rec->extent_end;
1218
1219         fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
1220         extent_type = btrfs_file_extent_type(eb, fi);
1221
1222         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1223                 num_bytes = btrfs_file_extent_inline_len(eb, slot, fi);
1224                 if (num_bytes == 0)
1225                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1226                 rec->found_size += num_bytes;
1227                 num_bytes = (num_bytes + mask) & ~mask;
1228         } else if (extent_type == BTRFS_FILE_EXTENT_REG ||
1229                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1230                 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1231                 disk_bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1232                 extent_offset = btrfs_file_extent_offset(eb, fi);
1233                 if (num_bytes == 0 || (num_bytes & mask))
1234                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1235                 if (num_bytes + extent_offset >
1236                     btrfs_file_extent_ram_bytes(eb, fi))
1237                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1238                 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC &&
1239                     (btrfs_file_extent_compression(eb, fi) ||
1240                      btrfs_file_extent_encryption(eb, fi) ||
1241                      btrfs_file_extent_other_encoding(eb, fi)))
1242                         rec->errors |= I_ERR_BAD_FILE_EXTENT;
1243                 if (disk_bytenr > 0)
1244                         rec->found_size += num_bytes;
1245         } else {
1246                 rec->errors |= I_ERR_BAD_FILE_EXTENT;
1247         }
1248         rec->extent_end = key->offset + num_bytes;
1249
1250         if (disk_bytenr > 0) {
1251                 u64 found;
1252                 if (btrfs_file_extent_compression(eb, fi))
1253                         num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1254                 else
1255                         disk_bytenr += extent_offset;
1256
1257                 ret = count_csum_range(root, disk_bytenr, num_bytes, &found);
1258                 if (ret < 0)
1259                         return ret;
1260                 if (extent_type == BTRFS_FILE_EXTENT_REG) {
1261                         if (found > 0)
1262                                 rec->found_csum_item = 1;
1263                         if (found < num_bytes)
1264                                 rec->some_csum_missing = 1;
1265                 } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1266                         if (found > 0)
1267                                 rec->errors |= I_ERR_ODD_CSUM_ITEM;
1268                 }
1269         }
1270         return 0;
1271 }
1272
1273 static int process_one_leaf(struct btrfs_root *root, struct extent_buffer *eb,
1274                             struct walk_control *wc)
1275 {
1276         struct btrfs_key key;
1277         u32 nritems;
1278         int i;
1279         int ret = 0;
1280         struct cache_tree *inode_cache;
1281         struct shared_node *active_node;
1282
1283         if (wc->root_level == wc->active_node &&
1284             btrfs_root_refs(&root->root_item) == 0)
1285                 return 0;
1286
1287         active_node = wc->nodes[wc->active_node];
1288         inode_cache = &active_node->inode_cache;
1289         nritems = btrfs_header_nritems(eb);
1290         for (i = 0; i < nritems; i++) {
1291                 btrfs_item_key_to_cpu(eb, &key, i);
1292
1293                 if (key.objectid == BTRFS_FREE_SPACE_OBJECTID)
1294                         continue;
1295                 if (key.type == BTRFS_ORPHAN_ITEM_KEY)
1296                         continue;
1297
1298                 if (active_node->current == NULL ||
1299                     active_node->current->ino < key.objectid) {
1300                         if (active_node->current) {
1301                                 active_node->current->checked = 1;
1302                                 maybe_free_inode_rec(inode_cache,
1303                                                      active_node->current);
1304                         }
1305                         active_node->current = get_inode_rec(inode_cache,
1306                                                              key.objectid, 1);
1307                 }
1308                 switch (key.type) {
1309                 case BTRFS_DIR_ITEM_KEY:
1310                 case BTRFS_DIR_INDEX_KEY:
1311                         ret = process_dir_item(root, eb, i, &key, active_node);
1312                         break;
1313                 case BTRFS_INODE_REF_KEY:
1314                         ret = process_inode_ref(eb, i, &key, active_node);
1315                         break;
1316                 case BTRFS_INODE_EXTREF_KEY:
1317                         ret = process_inode_extref(eb, i, &key, active_node);
1318                         break;
1319                 case BTRFS_INODE_ITEM_KEY:
1320                         ret = process_inode_item(eb, i, &key, active_node);
1321                         break;
1322                 case BTRFS_EXTENT_DATA_KEY:
1323                         ret = process_file_extent(root, eb, i, &key,
1324                                                   active_node);
1325                         break;
1326                 default:
1327                         break;
1328                 };
1329         }
1330         return ret;
1331 }
1332
1333 static void reada_walk_down(struct btrfs_root *root,
1334                             struct extent_buffer *node, int slot)
1335 {
1336         u64 bytenr;
1337         u64 ptr_gen;
1338         u32 nritems;
1339         u32 blocksize;
1340         int i;
1341         int level;
1342
1343         level = btrfs_header_level(node);
1344         if (level != 1)
1345                 return;
1346
1347         nritems = btrfs_header_nritems(node);
1348         blocksize = btrfs_level_size(root, level - 1);
1349         for (i = slot; i < nritems; i++) {
1350                 bytenr = btrfs_node_blockptr(node, i);
1351                 ptr_gen = btrfs_node_ptr_generation(node, i);
1352                 readahead_tree_block(root, bytenr, blocksize, ptr_gen);
1353         }
1354 }
1355
1356 /*
1357  * Check the child node/leaf by the following condition:
1358  * 1. the first item key of the node/leaf should be the same with the one
1359  *    in parent.
1360  * 2. block in parent node should match the child node/leaf.
1361  * 3. generation of parent node and child's header should be consistent.
1362  *
1363  * Or the child node/leaf pointed by the key in parent is not valid.
1364  *
1365  * We hope to check leaf owner too, but since subvol may share leaves,
1366  * which makes leaf owner check not so strong, key check should be
1367  * sufficient enough for that case.
1368  */
1369 static int check_child_node(struct btrfs_root *root,
1370                             struct extent_buffer *parent, int slot,
1371                             struct extent_buffer *child)
1372 {
1373         struct btrfs_key parent_key;
1374         struct btrfs_key child_key;
1375         int ret = 0;
1376
1377         btrfs_node_key_to_cpu(parent, &parent_key, slot);
1378         if (btrfs_header_level(child) == 0)
1379                 btrfs_item_key_to_cpu(child, &child_key, 0);
1380         else
1381                 btrfs_node_key_to_cpu(child, &child_key, 0);
1382
1383         if (memcmp(&parent_key, &child_key, sizeof(parent_key))) {
1384                 ret = -EINVAL;
1385                 fprintf(stderr,
1386                         "Wrong key of child node/leaf, wanted: (%llu, %u, %llu), have: (%llu, %u, %llu)\n",
1387                         parent_key.objectid, parent_key.type, parent_key.offset,
1388                         child_key.objectid, child_key.type, child_key.offset);
1389         }
1390         if (btrfs_header_bytenr(child) != btrfs_node_blockptr(parent, slot)) {
1391                 ret = -EINVAL;
1392                 fprintf(stderr, "Wrong block of child node/leaf, wanted: %llu, have: %llu\n",
1393                         btrfs_node_blockptr(parent, slot),
1394                         btrfs_header_bytenr(child));
1395         }
1396         if (btrfs_node_ptr_generation(parent, slot) !=
1397             btrfs_header_generation(child)) {
1398                 ret = -EINVAL;
1399                 fprintf(stderr, "Wrong generation of child node/leaf, wanted: %llu, have: %llu\n",
1400                         btrfs_header_generation(child),
1401                         btrfs_node_ptr_generation(parent, slot));
1402         }
1403         return ret;
1404 }
1405
1406 static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
1407                           struct walk_control *wc, int *level)
1408 {
1409         enum btrfs_tree_block_status status;
1410         u64 bytenr;
1411         u64 ptr_gen;
1412         struct extent_buffer *next;
1413         struct extent_buffer *cur;
1414         u32 blocksize;
1415         int ret, err = 0;
1416         u64 refs;
1417
1418         WARN_ON(*level < 0);
1419         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1420         ret = btrfs_lookup_extent_info(NULL, root,
1421                                        path->nodes[*level]->start,
1422                                        *level, 1, &refs, NULL);
1423         if (ret < 0) {
1424                 err = ret;
1425                 goto out;
1426         }
1427
1428         if (refs > 1) {
1429                 ret = enter_shared_node(root, path->nodes[*level]->start,
1430                                         refs, wc, *level);
1431                 if (ret > 0) {
1432                         err = ret;
1433                         goto out;
1434                 }
1435         }
1436
1437         while (*level >= 0) {
1438                 WARN_ON(*level < 0);
1439                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1440                 cur = path->nodes[*level];
1441
1442                 if (btrfs_header_level(cur) != *level)
1443                         WARN_ON(1);
1444
1445                 if (path->slots[*level] >= btrfs_header_nritems(cur))
1446                         break;
1447                 if (*level == 0) {
1448                         ret = process_one_leaf(root, cur, wc);
1449                         if (ret < 0)
1450                                 err = ret;
1451                         break;
1452                 }
1453                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1454                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1455                 blocksize = btrfs_level_size(root, *level - 1);
1456                 ret = btrfs_lookup_extent_info(NULL, root, bytenr, *level - 1,
1457                                                1, &refs, NULL);
1458                 if (ret < 0)
1459                         refs = 0;
1460
1461                 if (refs > 1) {
1462                         ret = enter_shared_node(root, bytenr, refs,
1463                                                 wc, *level - 1);
1464                         if (ret > 0) {
1465                                 path->slots[*level]++;
1466                                 continue;
1467                         }
1468                 }
1469
1470                 next = btrfs_find_tree_block(root, bytenr, blocksize);
1471                 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
1472                         free_extent_buffer(next);
1473                         reada_walk_down(root, cur, path->slots[*level]);
1474                         next = read_tree_block(root, bytenr, blocksize,
1475                                                ptr_gen);
1476                         if (!next) {
1477                                 struct btrfs_key node_key;
1478
1479                                 btrfs_node_key_to_cpu(path->nodes[*level],
1480                                                       &node_key,
1481                                                       path->slots[*level]);
1482                                 btrfs_add_corrupt_extent_record(root->fs_info,
1483                                                 &node_key,
1484                                                 path->nodes[*level]->start,
1485                                                 root->leafsize, *level);
1486                                 err = -EIO;
1487                                 goto out;
1488                         }
1489                 }
1490
1491                 ret = check_child_node(root, cur, path->slots[*level], next);
1492                 if (ret) {
1493                         err = ret;
1494                         goto out;
1495                 }
1496
1497                 if (btrfs_is_leaf(next))
1498                         status = btrfs_check_leaf(root, NULL, next);
1499                 else
1500                         status = btrfs_check_node(root, NULL, next);
1501                 if (status != BTRFS_TREE_BLOCK_CLEAN) {
1502                         free_extent_buffer(next);
1503                         err = -EIO;
1504                         goto out;
1505                 }
1506
1507                 *level = *level - 1;
1508                 free_extent_buffer(path->nodes[*level]);
1509                 path->nodes[*level] = next;
1510                 path->slots[*level] = 0;
1511         }
1512 out:
1513         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1514         return err;
1515 }
1516
1517 static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
1518                         struct walk_control *wc, int *level)
1519 {
1520         int i;
1521         struct extent_buffer *leaf;
1522
1523         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1524                 leaf = path->nodes[i];
1525                 if (path->slots[i] + 1 < btrfs_header_nritems(leaf)) {
1526                         path->slots[i]++;
1527                         *level = i;
1528                         return 0;
1529                 } else {
1530                         free_extent_buffer(path->nodes[*level]);
1531                         path->nodes[*level] = NULL;
1532                         BUG_ON(*level > wc->active_node);
1533                         if (*level == wc->active_node)
1534                                 leave_shared_node(root, wc, *level);
1535                         *level = i + 1;
1536                 }
1537         }
1538         return 1;
1539 }
1540
1541 static int check_root_dir(struct inode_record *rec)
1542 {
1543         struct inode_backref *backref;
1544         int ret = -1;
1545
1546         if (!rec->found_inode_item || rec->errors)
1547                 goto out;
1548         if (rec->nlink != 1 || rec->found_link != 0)
1549                 goto out;
1550         if (list_empty(&rec->backrefs))
1551                 goto out;
1552         backref = list_entry(rec->backrefs.next, struct inode_backref, list);
1553         if (!backref->found_inode_ref)
1554                 goto out;
1555         if (backref->index != 0 || backref->namelen != 2 ||
1556             memcmp(backref->name, "..", 2))
1557                 goto out;
1558         if (backref->found_dir_index || backref->found_dir_item)
1559                 goto out;
1560         ret = 0;
1561 out:
1562         return ret;
1563 }
1564
1565 static int repair_inode_isize(struct btrfs_trans_handle *trans,
1566                               struct btrfs_root *root, struct btrfs_path *path,
1567                               struct inode_record *rec)
1568 {
1569         struct btrfs_inode_item *ei;
1570         struct btrfs_key key;
1571         int ret;
1572
1573         key.objectid = rec->ino;
1574         key.type = BTRFS_INODE_ITEM_KEY;
1575         key.offset = (u64)-1;
1576
1577         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1578         if (ret < 0)
1579                 goto out;
1580         if (ret) {
1581                 if (!path->slots[0]) {
1582                         ret = -ENOENT;
1583                         goto out;
1584                 }
1585                 path->slots[0]--;
1586                 ret = 0;
1587         }
1588         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1589         if (key.objectid != rec->ino) {
1590                 ret = -ENOENT;
1591                 goto out;
1592         }
1593
1594         ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1595                             struct btrfs_inode_item);
1596         btrfs_set_inode_size(path->nodes[0], ei, rec->found_size);
1597         btrfs_mark_buffer_dirty(path->nodes[0]);
1598         rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
1599         printf("reset isize for dir %Lu root %Lu\n", rec->ino,
1600                root->root_key.objectid);
1601 out:
1602         btrfs_release_path(path);
1603         return ret;
1604 }
1605
1606 static int repair_inode_orphan_item(struct btrfs_trans_handle *trans,
1607                                     struct btrfs_root *root,
1608                                     struct btrfs_path *path,
1609                                     struct inode_record *rec)
1610 {
1611         int ret;
1612
1613         ret = btrfs_add_orphan_item(trans, root, path, rec->ino);
1614         btrfs_release_path(path);
1615         if (!ret)
1616                 rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
1617         return ret;
1618 }
1619
1620 static int add_missing_dir_index(struct btrfs_root *root,
1621                                  struct cache_tree *inode_cache,
1622                                  struct inode_record *rec,
1623                                  struct inode_backref *backref)
1624 {
1625         struct btrfs_path *path;
1626         struct btrfs_trans_handle *trans;
1627         struct btrfs_dir_item *dir_item;
1628         struct extent_buffer *leaf;
1629         struct btrfs_key key;
1630         struct btrfs_disk_key disk_key;
1631         struct inode_record *dir_rec;
1632         unsigned long name_ptr;
1633         u32 data_size = sizeof(*dir_item) + backref->namelen;
1634         int ret;
1635
1636         path = btrfs_alloc_path();
1637         if (!path)
1638                 return -ENOMEM;
1639
1640         trans = btrfs_start_transaction(root, 1);
1641         if (IS_ERR(trans)) {
1642                 btrfs_free_path(path);
1643                 return PTR_ERR(trans);
1644         }
1645
1646         fprintf(stderr, "repairing missing dir index item for inode %llu\n",
1647                 (unsigned long long)rec->ino);
1648         key.objectid = backref->dir;
1649         key.type = BTRFS_DIR_INDEX_KEY;
1650         key.offset = backref->index;
1651
1652         ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
1653         BUG_ON(ret);
1654
1655         leaf = path->nodes[0];
1656         dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
1657
1658         disk_key.objectid = cpu_to_le64(rec->ino);
1659         disk_key.type = BTRFS_INODE_ITEM_KEY;
1660         disk_key.offset = 0;
1661
1662         btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
1663         btrfs_set_dir_type(leaf, dir_item, imode_to_type(rec->imode));
1664         btrfs_set_dir_data_len(leaf, dir_item, 0);
1665         btrfs_set_dir_name_len(leaf, dir_item, backref->namelen);
1666         name_ptr = (unsigned long)(dir_item + 1);
1667         write_extent_buffer(leaf, backref->name, name_ptr, backref->namelen);
1668         btrfs_mark_buffer_dirty(leaf);
1669         btrfs_free_path(path);
1670         btrfs_commit_transaction(trans, root);
1671
1672         backref->found_dir_index = 1;
1673         dir_rec = get_inode_rec(inode_cache, backref->dir, 0);
1674         if (!dir_rec)
1675                 return 0;
1676         dir_rec->found_size += backref->namelen;
1677         if (dir_rec->found_size == dir_rec->isize &&
1678             (dir_rec->errors & I_ERR_DIR_ISIZE_WRONG))
1679                 dir_rec->errors &= ~I_ERR_DIR_ISIZE_WRONG;
1680         if (dir_rec->found_size != dir_rec->isize)
1681                 dir_rec->errors |= I_ERR_DIR_ISIZE_WRONG;
1682
1683         return 0;
1684 }
1685
1686 static int delete_dir_index(struct btrfs_root *root,
1687                             struct cache_tree *inode_cache,
1688                             struct inode_record *rec,
1689                             struct inode_backref *backref)
1690 {
1691         struct btrfs_trans_handle *trans;
1692         struct btrfs_dir_item *di;
1693         struct btrfs_path *path;
1694         int ret = 0;
1695
1696         path = btrfs_alloc_path();
1697         if (!path)
1698                 return -ENOMEM;
1699
1700         trans = btrfs_start_transaction(root, 1);
1701         if (IS_ERR(trans)) {
1702                 btrfs_free_path(path);
1703                 return PTR_ERR(trans);
1704         }
1705
1706
1707         fprintf(stderr, "Deleting bad dir index [%llu,%u,%llu] root %llu\n",
1708                 (unsigned long long)backref->dir,
1709                 BTRFS_DIR_INDEX_KEY, (unsigned long long)backref->index,
1710                 (unsigned long long)root->objectid);
1711
1712         di = btrfs_lookup_dir_index(trans, root, path, backref->dir,
1713                                     backref->name, backref->namelen,
1714                                     backref->index, -1);
1715         if (IS_ERR(di)) {
1716                 ret = PTR_ERR(di);
1717                 btrfs_free_path(path);
1718                 btrfs_commit_transaction(trans, root);
1719                 if (ret == -ENOENT)
1720                         return 0;
1721                 return ret;
1722         }
1723
1724         if (!di)
1725                 ret = btrfs_del_item(trans, root, path);
1726         else
1727                 ret = btrfs_delete_one_dir_name(trans, root, path, di);
1728         BUG_ON(ret);
1729         btrfs_free_path(path);
1730         btrfs_commit_transaction(trans, root);
1731         return ret;
1732 }
1733
1734 static int create_inode_item(struct btrfs_root *root,
1735                              struct inode_record *rec,
1736                              struct inode_backref *backref, int root_dir)
1737 {
1738         struct btrfs_trans_handle *trans;
1739         struct btrfs_inode_item inode_item;
1740         time_t now = time(NULL);
1741         int ret;
1742
1743         trans = btrfs_start_transaction(root, 1);
1744         if (IS_ERR(trans)) {
1745                 ret = PTR_ERR(trans);
1746                 return ret;
1747         }
1748
1749         fprintf(stderr, "root %llu inode %llu recreating inode item, this may "
1750                 "be incomplete, please check permissions and content after "
1751                 "the fsck completes.\n", (unsigned long long)root->objectid,
1752                 (unsigned long long)rec->ino);
1753
1754         memset(&inode_item, 0, sizeof(inode_item));
1755         btrfs_set_stack_inode_generation(&inode_item, trans->transid);
1756         if (root_dir)
1757                 btrfs_set_stack_inode_nlink(&inode_item, 1);
1758         else
1759                 btrfs_set_stack_inode_nlink(&inode_item, rec->found_link);
1760         btrfs_set_stack_inode_nbytes(&inode_item, rec->found_size);
1761         if (rec->found_dir_item) {
1762                 if (rec->found_file_extent)
1763                         fprintf(stderr, "root %llu inode %llu has both a dir "
1764                                 "item and extents, unsure if it is a dir or a "
1765                                 "regular file so setting it as a directory\n",
1766                                 (unsigned long long)root->objectid,
1767                                 (unsigned long long)rec->ino);
1768                 btrfs_set_stack_inode_mode(&inode_item, S_IFDIR | 0755);
1769                 btrfs_set_stack_inode_size(&inode_item, rec->found_size);
1770         } else if (!rec->found_dir_item) {
1771                 btrfs_set_stack_inode_size(&inode_item, rec->extent_end);
1772                 btrfs_set_stack_inode_mode(&inode_item, S_IFREG | 0755);
1773         }
1774         btrfs_set_stack_timespec_sec(&inode_item.atime, now);
1775         btrfs_set_stack_timespec_nsec(&inode_item.atime, 0);
1776         btrfs_set_stack_timespec_sec(&inode_item.ctime, now);
1777         btrfs_set_stack_timespec_nsec(&inode_item.ctime, 0);
1778         btrfs_set_stack_timespec_sec(&inode_item.mtime, now);
1779         btrfs_set_stack_timespec_nsec(&inode_item.mtime, 0);
1780         btrfs_set_stack_timespec_sec(&inode_item.otime, 0);
1781         btrfs_set_stack_timespec_nsec(&inode_item.otime, 0);
1782
1783         ret = btrfs_insert_inode(trans, root, rec->ino, &inode_item);
1784         BUG_ON(ret);
1785         btrfs_commit_transaction(trans, root);
1786         return 0;
1787 }
1788
1789 static int repair_inode_backrefs(struct btrfs_root *root,
1790                                  struct inode_record *rec,
1791                                  struct cache_tree *inode_cache,
1792                                  int delete)
1793 {
1794         struct inode_backref *tmp, *backref;
1795         u64 root_dirid = btrfs_root_dirid(&root->root_item);
1796         int ret = 0;
1797         int repaired = 0;
1798
1799         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
1800                 if (!delete && rec->ino == root_dirid) {
1801                         if (!rec->found_inode_item) {
1802                                 ret = create_inode_item(root, rec, backref, 1);
1803                                 if (ret)
1804                                         break;
1805                                 repaired++;
1806                         }
1807                 }
1808
1809                 /* Index 0 for root dir's are special, don't mess with it */
1810                 if (rec->ino == root_dirid && backref->index == 0)
1811                         continue;
1812
1813                 if (delete &&
1814                     ((backref->found_dir_index && !backref->found_inode_ref) ||
1815                      (backref->found_dir_index && backref->found_inode_ref &&
1816                       (backref->errors & REF_ERR_INDEX_UNMATCH)))) {
1817                         ret = delete_dir_index(root, inode_cache, rec, backref);
1818                         if (ret)
1819                                 break;
1820                         repaired++;
1821                         list_del(&backref->list);
1822                         free(backref);
1823                 }
1824
1825                 if (!delete && !backref->found_dir_index &&
1826                     backref->found_dir_item && backref->found_inode_ref) {
1827                         ret = add_missing_dir_index(root, inode_cache, rec,
1828                                                     backref);
1829                         if (ret)
1830                                 break;
1831                         repaired++;
1832                         if (backref->found_dir_item &&
1833                             backref->found_dir_index &&
1834                             backref->found_dir_index) {
1835                                 if (!backref->errors &&
1836                                     backref->found_inode_ref) {
1837                                         list_del(&backref->list);
1838                                         free(backref);
1839                                 }
1840                         }
1841                 }
1842
1843                 if (!delete && (!backref->found_dir_index &&
1844                                 !backref->found_dir_item &&
1845                                 backref->found_inode_ref)) {
1846                         struct btrfs_trans_handle *trans;
1847                         struct btrfs_key location;
1848
1849                         ret = check_dir_conflict(root, backref->name,
1850                                                  backref->namelen,
1851                                                  backref->dir,
1852                                                  backref->index);
1853                         if (ret) {
1854                                 /*
1855                                  * let nlink fixing routine to handle it,
1856                                  * which can do it better.
1857                                  */
1858                                 ret = 0;
1859                                 break;
1860                         }
1861                         location.objectid = rec->ino;
1862                         location.type = BTRFS_INODE_ITEM_KEY;
1863                         location.offset = 0;
1864
1865                         trans = btrfs_start_transaction(root, 1);
1866                         if (IS_ERR(trans)) {
1867                                 ret = PTR_ERR(trans);
1868                                 break;
1869                         }
1870                         fprintf(stderr, "adding missing dir index/item pair "
1871                                 "for inode %llu\n",
1872                                 (unsigned long long)rec->ino);
1873                         ret = btrfs_insert_dir_item(trans, root, backref->name,
1874                                                     backref->namelen,
1875                                                     backref->dir, &location,
1876                                                     imode_to_type(rec->imode),
1877                                                     backref->index);
1878                         BUG_ON(ret);
1879                         btrfs_commit_transaction(trans, root);
1880                         repaired++;
1881                 }
1882
1883                 if (!delete && (backref->found_inode_ref &&
1884                                 backref->found_dir_index &&
1885                                 backref->found_dir_item &&
1886                                 !(backref->errors & REF_ERR_INDEX_UNMATCH) &&
1887                                 !rec->found_inode_item)) {
1888                         ret = create_inode_item(root, rec, backref, 0);
1889                         if (ret)
1890                                 break;
1891                         repaired++;
1892                 }
1893
1894         }
1895         return ret ? ret : repaired;
1896 }
1897
1898 /*
1899  * To determine the file type for nlink/inode_item repair
1900  *
1901  * Return 0 if file type is found and BTRFS_FT_* is stored into type.
1902  * Return -ENOENT if file type is not found.
1903  */
1904 static int find_file_type(struct inode_record *rec, u8 *type)
1905 {
1906         struct inode_backref *backref;
1907
1908         list_for_each_entry(backref, &rec->backrefs, list) {
1909                 if (backref->found_dir_index || backref->found_dir_item) {
1910                         *type = backref->filetype;
1911                         return 0;
1912                 }
1913         }
1914         return -ENOENT;
1915 }
1916
1917 /*
1918  * To determine the file name for nlink repair
1919  *
1920  * Return 0 if file name is found, set name and namelen.
1921  * Return -ENOENT if file name is not found.
1922  */
1923 static int find_file_name(struct inode_record *rec,
1924                           char *name, int *namelen)
1925 {
1926         struct inode_backref *backref;
1927
1928         list_for_each_entry(backref, &rec->backrefs, list) {
1929                 if (backref->found_dir_index || backref->found_dir_item ||
1930                     backref->found_inode_ref) {
1931                         memcpy(name, backref->name, backref->namelen);
1932                         *namelen = backref->namelen;
1933                         return 0;
1934                 }
1935         }
1936         return -ENOENT;
1937 }
1938
1939 /* Reset the nlink of the inode to the correct one */
1940 static int reset_nlink(struct btrfs_trans_handle *trans,
1941                        struct btrfs_root *root,
1942                        struct btrfs_path *path,
1943                        struct inode_record *rec)
1944 {
1945         struct inode_backref *backref;
1946         struct inode_backref *tmp;
1947         struct btrfs_key key;
1948         struct btrfs_inode_item *inode_item;
1949         int ret = 0;
1950
1951         /* Remove all backref including the valid ones */
1952         list_for_each_entry_safe(backref, tmp, &rec->backrefs, list) {
1953                 ret = btrfs_unlink(trans, root, rec->ino, backref->dir,
1954                                    backref->index, backref->name,
1955                                    backref->namelen, 0);
1956                 if (ret < 0)
1957                         goto out;
1958
1959                 /* remove invalid backref, so it won't be added back */
1960                 if (!(backref->found_dir_index &&
1961                       backref->found_dir_item &&
1962                       backref->found_inode_ref)) {
1963                         list_del(&backref->list);
1964                         free(backref);
1965                 }
1966         }
1967
1968         /* Set nlink to 0 */
1969         key.objectid = rec->ino;
1970         key.type = BTRFS_INODE_ITEM_KEY;
1971         key.offset = 0;
1972         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1973         if (ret < 0)
1974                 goto out;
1975         if (ret > 0) {
1976                 ret = -ENOENT;
1977                 goto out;
1978         }
1979         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1980                                     struct btrfs_inode_item);
1981         btrfs_set_inode_nlink(path->nodes[0], inode_item, 0);
1982         btrfs_mark_buffer_dirty(path->nodes[0]);
1983         btrfs_release_path(path);
1984
1985         /*
1986          * Add back valid inode_ref/dir_item/dir_index,
1987          * add_link() will handle the nlink inc, so new nlink must be correct
1988          */
1989         list_for_each_entry(backref, &rec->backrefs, list) {
1990                 ret = btrfs_add_link(trans, root, rec->ino, backref->dir,
1991                                      backref->name, backref->namelen,
1992                                      backref->ref_type, &backref->index, 1);
1993                 if (ret < 0)
1994                         goto out;
1995         }
1996 out:
1997         btrfs_release_path(path);
1998         return ret;
1999 }
2000
2001 static int repair_inode_nlinks(struct btrfs_trans_handle *trans,
2002                                struct btrfs_root *root,
2003                                struct btrfs_path *path,
2004                                struct inode_record *rec)
2005 {
2006         char *dir_name = "lost+found";
2007         char namebuf[BTRFS_NAME_LEN] = {0};
2008         u64 lost_found_ino;
2009         u32 mode = 0700;
2010         u8 type = 0;
2011         int namelen = 0;
2012         int name_recovered = 0;
2013         int type_recovered = 0;
2014         int ret = 0;
2015
2016         /*
2017          * Get file name and type first before these invalid inode ref
2018          * are deleted by remove_all_invalid_backref()
2019          */
2020         name_recovered = !find_file_name(rec, namebuf, &namelen);
2021         type_recovered = !find_file_type(rec, &type);
2022
2023         if (!name_recovered) {
2024                 printf("Can't get file name for inode %llu, using '%llu' as fallback\n",
2025                        rec->ino, rec->ino);
2026                 namelen = count_digits(rec->ino);
2027                 sprintf(namebuf, "%llu", rec->ino);
2028                 name_recovered = 1;
2029         }
2030         if (!type_recovered) {
2031                 printf("Can't get file type for inode %llu, using FILE as fallback\n",
2032                        rec->ino);
2033                 type = BTRFS_FT_REG_FILE;
2034                 type_recovered = 1;
2035         }
2036
2037         ret = reset_nlink(trans, root, path, rec);
2038         if (ret < 0) {
2039                 fprintf(stderr,
2040                         "Failed to reset nlink for inode %llu: %s\n",
2041                         rec->ino, strerror(-ret));
2042                 goto out;
2043         }
2044
2045         if (rec->found_link == 0) {
2046                 lost_found_ino = root->highest_inode;
2047                 if (lost_found_ino >= BTRFS_LAST_FREE_OBJECTID) {
2048                         ret = -EOVERFLOW;
2049                         goto out;
2050                 }
2051                 lost_found_ino++;
2052                 ret = btrfs_mkdir(trans, root, dir_name, strlen(dir_name),
2053                                   BTRFS_FIRST_FREE_OBJECTID, &lost_found_ino,
2054                                   mode);
2055                 if (ret < 0) {
2056                         fprintf(stderr, "Failed to create '%s' dir: %s",
2057                                 dir_name, strerror(-ret));
2058                         goto out;
2059                 }
2060                 ret = btrfs_add_link(trans, root, rec->ino, lost_found_ino,
2061                                      namebuf, namelen, type, NULL, 1);
2062                 if (ret == -EEXIST) {
2063                         /*
2064                          * Conflicting file name, add ".INO" as suffix * +1 for '.'
2065                          */
2066                         if (namelen + count_digits(rec->ino) + 1 >
2067                             BTRFS_NAME_LEN) {
2068                                 ret = -EFBIG;
2069                                 goto out;
2070                         }
2071                         snprintf(namebuf + namelen, BTRFS_NAME_LEN - namelen,
2072                                  ".%llu", rec->ino);
2073                         namelen += count_digits(rec->ino) + 1;
2074                         ret = btrfs_add_link(trans, root, rec->ino,
2075                                              lost_found_ino, namebuf,
2076                                              namelen, type, NULL, 1);
2077                 }
2078                 if (ret < 0) {
2079                         fprintf(stderr,
2080                                 "Failed to link the inode %llu to %s dir: %s",
2081                                 rec->ino, dir_name, strerror(-ret));
2082                         goto out;
2083                 }
2084                 /*
2085                  * Just increase the found_link, don't actually add the
2086                  * backref. This will make things easier and this inode
2087                  * record will be freed after the repair is done.
2088                  * So fsck will not report problem about this inode.
2089                  */
2090                 rec->found_link++;
2091                 printf("Moving file '%.*s' to '%s' dir since it has no valid backref\n",
2092                        namelen, namebuf, dir_name);
2093         }
2094         rec->errors &= ~I_ERR_LINK_COUNT_WRONG;
2095         printf("Fixed the nlink of inode %llu\n", rec->ino);
2096 out:
2097         btrfs_release_path(path);
2098         return ret;
2099 }
2100
2101 static int try_repair_inode(struct btrfs_root *root, struct inode_record *rec)
2102 {
2103         struct btrfs_trans_handle *trans;
2104         struct btrfs_path *path;
2105         int ret = 0;
2106
2107         if (!(rec->errors & (I_ERR_DIR_ISIZE_WRONG |
2108                              I_ERR_NO_ORPHAN_ITEM |
2109                              I_ERR_LINK_COUNT_WRONG)))
2110                 return rec->errors;
2111
2112         path = btrfs_alloc_path();
2113         if (!path)
2114                 return -ENOMEM;
2115
2116         /*
2117          * For nlink repair, it may create a dir and add link, so
2118          * 2 for parent(256)'s dir_index and dir_item
2119          * 2 for lost+found dir's inode_item and inode_ref
2120          * 1 for the new inode_ref of the file
2121          * 2 for lost+found dir's dir_index and dir_item for the file
2122          */
2123         trans = btrfs_start_transaction(root, 7);
2124         if (IS_ERR(trans)) {
2125                 btrfs_free_path(path);
2126                 return PTR_ERR(trans);
2127         }
2128
2129         if (rec->errors & I_ERR_DIR_ISIZE_WRONG)
2130                 ret = repair_inode_isize(trans, root, path, rec);
2131         if (!ret && rec->errors & I_ERR_NO_ORPHAN_ITEM)
2132                 ret = repair_inode_orphan_item(trans, root, path, rec);
2133         if (!ret && rec->errors & I_ERR_LINK_COUNT_WRONG)
2134                 ret = repair_inode_nlinks(trans, root, path, rec);
2135         btrfs_commit_transaction(trans, root);
2136         btrfs_free_path(path);
2137         return ret;
2138 }
2139
2140 static int check_inode_recs(struct btrfs_root *root,
2141                             struct cache_tree *inode_cache)
2142 {
2143         struct cache_extent *cache;
2144         struct ptr_node *node;
2145         struct inode_record *rec;
2146         struct inode_backref *backref;
2147         int stage = 0;
2148         int ret;
2149         int err = 0;
2150         u64 error = 0;
2151         u64 root_dirid = btrfs_root_dirid(&root->root_item);
2152
2153         if (btrfs_root_refs(&root->root_item) == 0) {
2154                 if (!cache_tree_empty(inode_cache))
2155                         fprintf(stderr, "warning line %d\n", __LINE__);
2156                 return 0;
2157         }
2158
2159         /*
2160          * We need to record the highest inode number for later 'lost+found'
2161          * dir creation.
2162          * We must select a ino not used/refered by any existing inode, or
2163          * 'lost+found' ino may be a missing ino in a corrupted leaf,
2164          * this may cause 'lost+found' dir has wrong nlinks.
2165          */
2166         cache = last_cache_extent(inode_cache);
2167         if (cache) {
2168                 node = container_of(cache, struct ptr_node, cache);
2169                 rec = node->data;
2170                 if (rec->ino > root->highest_inode)
2171                         root->highest_inode = rec->ino;
2172         }
2173
2174         /*
2175          * We need to repair backrefs first because we could change some of the
2176          * errors in the inode recs.
2177          *
2178          * We also need to go through and delete invalid backrefs first and then
2179          * add the correct ones second.  We do this because we may get EEXIST
2180          * when adding back the correct index because we hadn't yet deleted the
2181          * invalid index.
2182          *
2183          * For example, if we were missing a dir index then the directories
2184          * isize would be wrong, so if we fixed the isize to what we thought it
2185          * would be and then fixed the backref we'd still have a invalid fs, so
2186          * we need to add back the dir index and then check to see if the isize
2187          * is still wrong.
2188          */
2189         while (stage < 3) {
2190                 stage++;
2191                 if (stage == 3 && !err)
2192                         break;
2193
2194                 cache = search_cache_extent(inode_cache, 0);
2195                 while (repair && cache) {
2196                         node = container_of(cache, struct ptr_node, cache);
2197                         rec = node->data;
2198                         cache = next_cache_extent(cache);
2199
2200                         /* Need to free everything up and rescan */
2201                         if (stage == 3) {
2202                                 remove_cache_extent(inode_cache, &node->cache);
2203                                 free(node);
2204                                 free_inode_rec(rec);
2205                                 continue;
2206                         }
2207
2208                         if (list_empty(&rec->backrefs))
2209                                 continue;
2210
2211                         ret = repair_inode_backrefs(root, rec, inode_cache,
2212                                                     stage == 1);
2213                         if (ret < 0) {
2214                                 err = ret;
2215                                 stage = 2;
2216                                 break;
2217                         } if (ret > 0) {
2218                                 err = -EAGAIN;
2219                         }
2220                 }
2221         }
2222         if (err)
2223                 return err;
2224
2225         rec = get_inode_rec(inode_cache, root_dirid, 0);
2226         if (rec) {
2227                 ret = check_root_dir(rec);
2228                 if (ret) {
2229                         fprintf(stderr, "root %llu root dir %llu error\n",
2230                                 (unsigned long long)root->root_key.objectid,
2231                                 (unsigned long long)root_dirid);
2232                         print_inode_error(root, rec);
2233                         error++;
2234                 }
2235         } else {
2236                 if (repair) {
2237                         struct btrfs_trans_handle *trans;
2238
2239                         trans = btrfs_start_transaction(root, 1);
2240                         if (IS_ERR(trans)) {
2241                                 err = PTR_ERR(trans);
2242                                 return err;
2243                         }
2244
2245                         fprintf(stderr,
2246                                 "root %llu missing its root dir, recreating\n",
2247                                 (unsigned long long)root->objectid);
2248
2249                         ret = btrfs_make_root_dir(trans, root, root_dirid);
2250                         BUG_ON(ret);
2251
2252                         btrfs_commit_transaction(trans, root);
2253                         return -EAGAIN;
2254                 }
2255
2256                 fprintf(stderr, "root %llu root dir %llu not found\n",
2257                         (unsigned long long)root->root_key.objectid,
2258                         (unsigned long long)root_dirid);
2259         }
2260
2261         while (1) {
2262                 cache = search_cache_extent(inode_cache, 0);
2263                 if (!cache)
2264                         break;
2265                 node = container_of(cache, struct ptr_node, cache);
2266                 rec = node->data;
2267                 remove_cache_extent(inode_cache, &node->cache);
2268                 free(node);
2269                 if (rec->ino == root_dirid ||
2270                     rec->ino == BTRFS_ORPHAN_OBJECTID) {
2271                         free_inode_rec(rec);
2272                         continue;
2273                 }
2274
2275                 if (rec->errors & I_ERR_NO_ORPHAN_ITEM) {
2276                         ret = check_orphan_item(root, rec->ino);
2277                         if (ret == 0)
2278                                 rec->errors &= ~I_ERR_NO_ORPHAN_ITEM;
2279                         if (can_free_inode_rec(rec)) {
2280                                 free_inode_rec(rec);
2281                                 continue;
2282                         }
2283                 }
2284
2285                 if (rec->found_link != rec->nlink)
2286                         rec->errors |= I_ERR_LINK_COUNT_WRONG;
2287                 if (repair) {
2288                         ret = try_repair_inode(root, rec);
2289                         if (ret == 0 && can_free_inode_rec(rec)) {
2290                                 free_inode_rec(rec);
2291                                 continue;
2292                         }
2293                         ret = 0;
2294                 }
2295
2296                 error++;
2297                 if (!rec->found_inode_item)
2298                         rec->errors |= I_ERR_NO_INODE_ITEM;
2299                 print_inode_error(root, rec);
2300                 list_for_each_entry(backref, &rec->backrefs, list) {
2301                         if (!backref->found_dir_item)
2302                                 backref->errors |= REF_ERR_NO_DIR_ITEM;
2303                         if (!backref->found_dir_index)
2304                                 backref->errors |= REF_ERR_NO_DIR_INDEX;
2305                         if (!backref->found_inode_ref)
2306                                 backref->errors |= REF_ERR_NO_INODE_REF;
2307                         fprintf(stderr, "\tunresolved ref dir %llu index %llu"
2308                                 " namelen %u name %s filetype %d errors %x",
2309                                 (unsigned long long)backref->dir,
2310                                 (unsigned long long)backref->index,
2311                                 backref->namelen, backref->name,
2312                                 backref->filetype, backref->errors);
2313                         print_ref_error(backref->errors);
2314                 }
2315                 free_inode_rec(rec);
2316         }
2317         return (error > 0) ? -1 : 0;
2318 }
2319
2320 static struct root_record *get_root_rec(struct cache_tree *root_cache,
2321                                         u64 objectid)
2322 {
2323         struct cache_extent *cache;
2324         struct root_record *rec = NULL;
2325         int ret;
2326
2327         cache = lookup_cache_extent(root_cache, objectid, 1);
2328         if (cache) {
2329                 rec = container_of(cache, struct root_record, cache);
2330         } else {
2331                 rec = calloc(1, sizeof(*rec));
2332                 rec->objectid = objectid;
2333                 INIT_LIST_HEAD(&rec->backrefs);
2334                 rec->cache.start = objectid;
2335                 rec->cache.size = 1;
2336
2337                 ret = insert_cache_extent(root_cache, &rec->cache);
2338                 BUG_ON(ret);
2339         }
2340         return rec;
2341 }
2342
2343 static struct root_backref *get_root_backref(struct root_record *rec,
2344                                              u64 ref_root, u64 dir, u64 index,
2345                                              const char *name, int namelen)
2346 {
2347         struct root_backref *backref;
2348
2349         list_for_each_entry(backref, &rec->backrefs, list) {
2350                 if (backref->ref_root != ref_root || backref->dir != dir ||
2351                     backref->namelen != namelen)
2352                         continue;
2353                 if (memcmp(name, backref->name, namelen))
2354                         continue;
2355                 return backref;
2356         }
2357
2358         backref = malloc(sizeof(*backref) + namelen + 1);
2359         memset(backref, 0, sizeof(*backref));
2360         backref->ref_root = ref_root;
2361         backref->dir = dir;
2362         backref->index = index;
2363         backref->namelen = namelen;
2364         memcpy(backref->name, name, namelen);
2365         backref->name[namelen] = '\0';
2366         list_add_tail(&backref->list, &rec->backrefs);
2367         return backref;
2368 }
2369
2370 static void free_root_record(struct cache_extent *cache)
2371 {
2372         struct root_record *rec;
2373         struct root_backref *backref;
2374
2375         rec = container_of(cache, struct root_record, cache);
2376         while (!list_empty(&rec->backrefs)) {
2377                 backref = list_entry(rec->backrefs.next,
2378                                      struct root_backref, list);
2379                 list_del(&backref->list);
2380                 free(backref);
2381         }
2382
2383         kfree(rec);
2384 }
2385
2386 FREE_EXTENT_CACHE_BASED_TREE(root_recs, free_root_record);
2387
2388 static int add_root_backref(struct cache_tree *root_cache,
2389                             u64 root_id, u64 ref_root, u64 dir, u64 index,
2390                             const char *name, int namelen,
2391                             int item_type, int errors)
2392 {
2393         struct root_record *rec;
2394         struct root_backref *backref;
2395
2396         rec = get_root_rec(root_cache, root_id);
2397         backref = get_root_backref(rec, ref_root, dir, index, name, namelen);
2398
2399         backref->errors |= errors;
2400
2401         if (item_type != BTRFS_DIR_ITEM_KEY) {
2402                 if (backref->found_dir_index || backref->found_back_ref ||
2403                     backref->found_forward_ref) {
2404                         if (backref->index != index)
2405                                 backref->errors |= REF_ERR_INDEX_UNMATCH;
2406                 } else {
2407                         backref->index = index;
2408                 }
2409         }
2410
2411         if (item_type == BTRFS_DIR_ITEM_KEY) {
2412                 if (backref->found_forward_ref)
2413                         rec->found_ref++;
2414                 backref->found_dir_item = 1;
2415         } else if (item_type == BTRFS_DIR_INDEX_KEY) {
2416                 backref->found_dir_index = 1;
2417         } else if (item_type == BTRFS_ROOT_REF_KEY) {
2418                 if (backref->found_forward_ref)
2419                         backref->errors |= REF_ERR_DUP_ROOT_REF;
2420                 else if (backref->found_dir_item)
2421                         rec->found_ref++;
2422                 backref->found_forward_ref = 1;
2423         } else if (item_type == BTRFS_ROOT_BACKREF_KEY) {
2424                 if (backref->found_back_ref)
2425                         backref->errors |= REF_ERR_DUP_ROOT_BACKREF;
2426                 backref->found_back_ref = 1;
2427         } else {
2428                 BUG_ON(1);
2429         }
2430
2431         if (backref->found_forward_ref && backref->found_dir_item)
2432                 backref->reachable = 1;
2433         return 0;
2434 }
2435
2436 static int merge_root_recs(struct btrfs_root *root,
2437                            struct cache_tree *src_cache,
2438                            struct cache_tree *dst_cache)
2439 {
2440         struct cache_extent *cache;
2441         struct ptr_node *node;
2442         struct inode_record *rec;
2443         struct inode_backref *backref;
2444         int ret = 0;
2445
2446         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2447                 free_inode_recs_tree(src_cache);
2448                 return 0;
2449         }
2450
2451         while (1) {
2452                 cache = search_cache_extent(src_cache, 0);
2453                 if (!cache)
2454                         break;
2455                 node = container_of(cache, struct ptr_node, cache);
2456                 rec = node->data;
2457                 remove_cache_extent(src_cache, &node->cache);
2458                 free(node);
2459
2460                 ret = is_child_root(root, root->objectid, rec->ino);
2461                 if (ret < 0)
2462                         break;
2463                 else if (ret == 0)
2464                         goto skip;
2465
2466                 list_for_each_entry(backref, &rec->backrefs, list) {
2467                         BUG_ON(backref->found_inode_ref);
2468                         if (backref->found_dir_item)
2469                                 add_root_backref(dst_cache, rec->ino,
2470                                         root->root_key.objectid, backref->dir,
2471                                         backref->index, backref->name,
2472                                         backref->namelen, BTRFS_DIR_ITEM_KEY,
2473                                         backref->errors);
2474                         if (backref->found_dir_index)
2475                                 add_root_backref(dst_cache, rec->ino,
2476                                         root->root_key.objectid, backref->dir,
2477                                         backref->index, backref->name,
2478                                         backref->namelen, BTRFS_DIR_INDEX_KEY,
2479                                         backref->errors);
2480                 }
2481 skip:
2482                 free_inode_rec(rec);
2483         }
2484         if (ret < 0)
2485                 return ret;
2486         return 0;
2487 }
2488
2489 static int check_root_refs(struct btrfs_root *root,
2490                            struct cache_tree *root_cache)
2491 {
2492         struct root_record *rec;
2493         struct root_record *ref_root;
2494         struct root_backref *backref;
2495         struct cache_extent *cache;
2496         int loop = 1;
2497         int ret;
2498         int error;
2499         int errors = 0;
2500
2501         rec = get_root_rec(root_cache, BTRFS_FS_TREE_OBJECTID);
2502         rec->found_ref = 1;
2503
2504         /* fixme: this can not detect circular references */
2505         while (loop) {
2506                 loop = 0;
2507                 cache = search_cache_extent(root_cache, 0);
2508                 while (1) {
2509                         if (!cache)
2510                                 break;
2511                         rec = container_of(cache, struct root_record, cache);
2512                         cache = next_cache_extent(cache);
2513
2514                         if (rec->found_ref == 0)
2515                                 continue;
2516
2517                         list_for_each_entry(backref, &rec->backrefs, list) {
2518                                 if (!backref->reachable)
2519                                         continue;
2520
2521                                 ref_root = get_root_rec(root_cache,
2522                                                         backref->ref_root);
2523                                 if (ref_root->found_ref > 0)
2524                                         continue;
2525
2526                                 backref->reachable = 0;
2527                                 rec->found_ref--;
2528                                 if (rec->found_ref == 0)
2529                                         loop = 1;
2530                         }
2531                 }
2532         }
2533
2534         cache = search_cache_extent(root_cache, 0);
2535         while (1) {
2536                 if (!cache)
2537                         break;
2538                 rec = container_of(cache, struct root_record, cache);
2539                 cache = next_cache_extent(cache);
2540
2541                 if (rec->found_ref == 0 &&
2542                     rec->objectid >= BTRFS_FIRST_FREE_OBJECTID &&
2543                     rec->objectid <= BTRFS_LAST_FREE_OBJECTID) {
2544                         ret = check_orphan_item(root->fs_info->tree_root,
2545                                                 rec->objectid);
2546                         if (ret == 0)
2547                                 continue;
2548
2549                         /*
2550                          * If we don't have a root item then we likely just have
2551                          * a dir item in a snapshot for this root but no actual
2552                          * ref key or anything so it's meaningless.
2553                          */
2554                         if (!rec->found_root_item)
2555                                 continue;
2556                         errors++;
2557                         fprintf(stderr, "fs tree %llu not referenced\n",
2558                                 (unsigned long long)rec->objectid);
2559                 }
2560
2561                 error = 0;
2562                 if (rec->found_ref > 0 && !rec->found_root_item)
2563                         error = 1;
2564                 list_for_each_entry(backref, &rec->backrefs, list) {
2565                         if (!backref->found_dir_item)
2566                                 backref->errors |= REF_ERR_NO_DIR_ITEM;
2567                         if (!backref->found_dir_index)
2568                                 backref->errors |= REF_ERR_NO_DIR_INDEX;
2569                         if (!backref->found_back_ref)
2570                                 backref->errors |= REF_ERR_NO_ROOT_BACKREF;
2571                         if (!backref->found_forward_ref)
2572                                 backref->errors |= REF_ERR_NO_ROOT_REF;
2573                         if (backref->reachable && backref->errors)
2574                                 error = 1;
2575                 }
2576                 if (!error)
2577                         continue;
2578
2579                 errors++;
2580                 fprintf(stderr, "fs tree %llu refs %u %s\n",
2581                         (unsigned long long)rec->objectid, rec->found_ref,
2582                          rec->found_root_item ? "" : "not found");
2583
2584                 list_for_each_entry(backref, &rec->backrefs, list) {
2585                         if (!backref->reachable)
2586                                 continue;
2587                         if (!backref->errors && rec->found_root_item)
2588                                 continue;
2589                         fprintf(stderr, "\tunresolved ref root %llu dir %llu"
2590                                 " index %llu namelen %u name %s errors %x\n",
2591                                 (unsigned long long)backref->ref_root,
2592                                 (unsigned long long)backref->dir,
2593                                 (unsigned long long)backref->index,
2594                                 backref->namelen, backref->name,
2595                                 backref->errors);
2596                         print_ref_error(backref->errors);
2597                 }
2598         }
2599         return errors > 0 ? 1 : 0;
2600 }
2601
2602 static int process_root_ref(struct extent_buffer *eb, int slot,
2603                             struct btrfs_key *key,
2604                             struct cache_tree *root_cache)
2605 {
2606         u64 dirid;
2607         u64 index;
2608         u32 len;
2609         u32 name_len;
2610         struct btrfs_root_ref *ref;
2611         char namebuf[BTRFS_NAME_LEN];
2612         int error;
2613
2614         ref = btrfs_item_ptr(eb, slot, struct btrfs_root_ref);
2615
2616         dirid = btrfs_root_ref_dirid(eb, ref);
2617         index = btrfs_root_ref_sequence(eb, ref);
2618         name_len = btrfs_root_ref_name_len(eb, ref);
2619
2620         if (name_len <= BTRFS_NAME_LEN) {
2621                 len = name_len;
2622                 error = 0;
2623         } else {
2624                 len = BTRFS_NAME_LEN;
2625                 error = REF_ERR_NAME_TOO_LONG;
2626         }
2627         read_extent_buffer(eb, namebuf, (unsigned long)(ref + 1), len);
2628
2629         if (key->type == BTRFS_ROOT_REF_KEY) {
2630                 add_root_backref(root_cache, key->offset, key->objectid, dirid,
2631                                  index, namebuf, len, key->type, error);
2632         } else {
2633                 add_root_backref(root_cache, key->objectid, key->offset, dirid,
2634                                  index, namebuf, len, key->type, error);
2635         }
2636         return 0;
2637 }
2638
2639 static void free_corrupt_block(struct cache_extent *cache)
2640 {
2641         struct btrfs_corrupt_block *corrupt;
2642
2643         corrupt = container_of(cache, struct btrfs_corrupt_block, cache);
2644         free(corrupt);
2645 }
2646
2647 FREE_EXTENT_CACHE_BASED_TREE(corrupt_blocks, free_corrupt_block);
2648
2649 /*
2650  * Repair the btree of the given root.
2651  *
2652  * The fix is to remove the node key in corrupt_blocks cache_tree.
2653  * and rebalance the tree.
2654  * After the fix, the btree should be writeable.
2655  */
2656 static int repair_btree(struct btrfs_root *root,
2657                         struct cache_tree *corrupt_blocks)
2658 {
2659         struct btrfs_trans_handle *trans;
2660         struct btrfs_path *path;
2661         struct btrfs_corrupt_block *corrupt;
2662         struct cache_extent *cache;
2663         struct btrfs_key key;
2664         u64 offset;
2665         int level;
2666         int ret = 0;
2667
2668         if (cache_tree_empty(corrupt_blocks))
2669                 return 0;
2670
2671         path = btrfs_alloc_path();
2672         if (!path)
2673                 return -ENOMEM;
2674
2675         trans = btrfs_start_transaction(root, 1);
2676         if (IS_ERR(trans)) {
2677                 ret = PTR_ERR(trans);
2678                 fprintf(stderr, "Error starting transaction: %s\n",
2679                         strerror(-ret));
2680                 return ret;
2681         }
2682         cache = first_cache_extent(corrupt_blocks);
2683         while (cache) {
2684                 corrupt = container_of(cache, struct btrfs_corrupt_block,
2685                                        cache);
2686                 level = corrupt->level;
2687                 path->lowest_level = level;
2688                 key.objectid = corrupt->key.objectid;
2689                 key.type = corrupt->key.type;
2690                 key.offset = corrupt->key.offset;
2691
2692                 /*
2693                  * Here we don't want to do any tree balance, since it may
2694                  * cause a balance with corrupted brother leaf/node,
2695                  * so ins_len set to 0 here.
2696                  * Balance will be done after all corrupt node/leaf is deleted.
2697                  */
2698                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2699                 if (ret < 0)
2700                         goto out;
2701                 offset = btrfs_node_blockptr(path->nodes[level],
2702                                              path->slots[level]);
2703
2704                 /* Remove the ptr */
2705                 ret = btrfs_del_ptr(trans, root, path, level,
2706                                     path->slots[level]);
2707                 if (ret < 0)
2708                         goto out;
2709                 /*
2710                  * Remove the corresponding extent
2711                  * return value is not concerned.
2712                  */
2713                 btrfs_release_path(path);
2714                 ret = btrfs_free_extent(trans, root, offset, root->nodesize,
2715                                         0, root->root_key.objectid,
2716                                         level - 1, 0);
2717                 cache = next_cache_extent(cache);
2718         }
2719
2720         /* Balance the btree using btrfs_search_slot() */
2721         cache = first_cache_extent(corrupt_blocks);
2722         while (cache) {
2723                 corrupt = container_of(cache, struct btrfs_corrupt_block,
2724                                        cache);
2725                 memcpy(&key, &corrupt->key, sizeof(key));
2726                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2727                 if (ret < 0)
2728                         goto out;
2729                 /* return will always >0 since it won't find the item */
2730                 ret = 0;
2731                 btrfs_release_path(path);
2732                 cache = next_cache_extent(cache);
2733         }
2734 out:
2735         btrfs_free_path(path);
2736         btrfs_commit_transaction(trans, root);
2737         return ret;
2738 }
2739
2740 static int check_fs_root(struct btrfs_root *root,
2741                          struct cache_tree *root_cache,
2742                          struct walk_control *wc)
2743 {
2744         int ret = 0;
2745         int err = 0;
2746         int wret;
2747         int level;
2748         struct btrfs_path path;
2749         struct shared_node root_node;
2750         struct root_record *rec;
2751         struct btrfs_root_item *root_item = &root->root_item;
2752         struct cache_tree corrupt_blocks;
2753         enum btrfs_tree_block_status status;
2754
2755         /*
2756          * Reuse the corrupt_block cache tree to record corrupted tree block
2757          *
2758          * Unlike the usage in extent tree check, here we do it in a per
2759          * fs/subvol tree base.
2760          */
2761         cache_tree_init(&corrupt_blocks);
2762         root->fs_info->corrupt_blocks = &corrupt_blocks;
2763         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2764                 rec = get_root_rec(root_cache, root->root_key.objectid);
2765                 if (btrfs_root_refs(root_item) > 0)
2766                         rec->found_root_item = 1;
2767         }
2768
2769         btrfs_init_path(&path);
2770         memset(&root_node, 0, sizeof(root_node));
2771         cache_tree_init(&root_node.root_cache);
2772         cache_tree_init(&root_node.inode_cache);
2773
2774         level = btrfs_header_level(root->node);
2775         memset(wc->nodes, 0, sizeof(wc->nodes));
2776         wc->nodes[level] = &root_node;
2777         wc->active_node = level;
2778         wc->root_level = level;
2779
2780         /* We may not have checked the root block, lets do that now */
2781         if (btrfs_is_leaf(root->node))
2782                 status = btrfs_check_leaf(root, NULL, root->node);
2783         else
2784                 status = btrfs_check_node(root, NULL, root->node);
2785         if (status != BTRFS_TREE_BLOCK_CLEAN)
2786                 return -EIO;
2787
2788         if (btrfs_root_refs(root_item) > 0 ||
2789             btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2790                 path.nodes[level] = root->node;
2791                 extent_buffer_get(root->node);
2792                 path.slots[level] = 0;
2793         } else {
2794                 struct btrfs_key key;
2795                 struct btrfs_disk_key found_key;
2796
2797                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2798                 level = root_item->drop_level;
2799                 path.lowest_level = level;
2800                 wret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
2801                 if (wret < 0)
2802                         goto skip_walking;
2803                 btrfs_node_key(path.nodes[level], &found_key,
2804                                 path.slots[level]);
2805                 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
2806                                         sizeof(found_key)));
2807         }
2808
2809         while (1) {
2810                 wret = walk_down_tree(root, &path, wc, &level);
2811                 if (wret < 0)
2812                         ret = wret;
2813                 if (wret != 0)
2814                         break;
2815
2816                 wret = walk_up_tree(root, &path, wc, &level);
2817                 if (wret < 0)
2818                         ret = wret;
2819                 if (wret != 0)
2820                         break;
2821         }
2822 skip_walking:
2823         btrfs_release_path(&path);
2824
2825         if (!cache_tree_empty(&corrupt_blocks)) {
2826                 struct cache_extent *cache;
2827                 struct btrfs_corrupt_block *corrupt;
2828
2829                 printf("The following tree block(s) is corrupted in tree %llu:\n",
2830                        root->root_key.objectid);
2831                 cache = first_cache_extent(&corrupt_blocks);
2832                 while (cache) {
2833                         corrupt = container_of(cache,
2834                                                struct btrfs_corrupt_block,
2835                                                cache);
2836                         printf("\ttree block bytenr: %llu, level: %d, node key: (%llu, %u, %llu)\n",
2837                                cache->start, corrupt->level,
2838                                corrupt->key.objectid, corrupt->key.type,
2839                                corrupt->key.offset);
2840                         cache = next_cache_extent(cache);
2841                 }
2842                 if (repair) {
2843                         printf("Try to repair the btree for root %llu\n",
2844                                root->root_key.objectid);
2845                         ret = repair_btree(root, &corrupt_blocks);
2846                         if (ret < 0)
2847                                 fprintf(stderr, "Failed to repair btree: %s\n",
2848                                         strerror(-ret));
2849                         if (!ret)
2850                                 printf("Btree for root %llu is fixed\n",
2851                                        root->root_key.objectid);
2852                 }
2853         }
2854
2855         err = merge_root_recs(root, &root_node.root_cache, root_cache);
2856         if (err < 0)
2857                 ret = err;
2858
2859         if (root_node.current) {
2860                 root_node.current->checked = 1;
2861                 maybe_free_inode_rec(&root_node.inode_cache,
2862                                 root_node.current);
2863         }
2864
2865         err = check_inode_recs(root, &root_node.inode_cache);
2866         if (!ret)
2867                 ret = err;
2868
2869         free_corrupt_blocks_tree(&corrupt_blocks);
2870         root->fs_info->corrupt_blocks = NULL;
2871         return ret;
2872 }
2873
2874 static int fs_root_objectid(u64 objectid)
2875 {
2876         if (objectid == BTRFS_TREE_RELOC_OBJECTID ||
2877             objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2878                 return 1;
2879         return is_fstree(objectid);
2880 }
2881
2882 static int check_fs_roots(struct btrfs_root *root,
2883                           struct cache_tree *root_cache)
2884 {
2885         struct btrfs_path path;
2886         struct btrfs_key key;
2887         struct walk_control wc;
2888         struct extent_buffer *leaf, *tree_node;
2889         struct btrfs_root *tmp_root;
2890         struct btrfs_root *tree_root = root->fs_info->tree_root;
2891         int ret;
2892         int err = 0;
2893
2894         /*
2895          * Just in case we made any changes to the extent tree that weren't
2896          * reflected into the free space cache yet.
2897          */
2898         if (repair)
2899                 reset_cached_block_groups(root->fs_info);
2900         memset(&wc, 0, sizeof(wc));
2901         cache_tree_init(&wc.shared);
2902         btrfs_init_path(&path);
2903
2904 again:
2905         key.offset = 0;
2906         key.objectid = 0;
2907         key.type = BTRFS_ROOT_ITEM_KEY;
2908         ret = btrfs_search_slot(NULL, tree_root, &key, &path, 0, 0);
2909         if (ret < 0) {
2910                 err = 1;
2911                 goto out;
2912         }
2913         tree_node = tree_root->node;
2914         while (1) {
2915                 if (tree_node != tree_root->node) {
2916                         free_root_recs_tree(root_cache);
2917                         btrfs_release_path(&path);
2918                         goto again;
2919                 }
2920                 leaf = path.nodes[0];
2921                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
2922                         ret = btrfs_next_leaf(tree_root, &path);
2923                         if (ret) {
2924                                 if (ret < 0)
2925                                         err = 1;
2926                                 break;
2927                         }
2928                         leaf = path.nodes[0];
2929                 }
2930                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
2931                 if (key.type == BTRFS_ROOT_ITEM_KEY &&
2932                     fs_root_objectid(key.objectid)) {
2933                         if (key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2934                                 tmp_root = btrfs_read_fs_root_no_cache(
2935                                                 root->fs_info, &key);
2936                         } else {
2937                                 key.offset = (u64)-1;
2938                                 tmp_root = btrfs_read_fs_root(
2939                                                 root->fs_info, &key);
2940                         }
2941                         if (IS_ERR(tmp_root)) {
2942                                 err = 1;
2943                                 goto next;
2944                         }
2945                         ret = check_fs_root(tmp_root, root_cache, &wc);
2946                         if (ret == -EAGAIN) {
2947                                 free_root_recs_tree(root_cache);
2948                                 btrfs_release_path(&path);
2949                                 goto again;
2950                         }
2951                         if (ret)
2952                                 err = 1;
2953                         if (key.objectid == BTRFS_TREE_RELOC_OBJECTID)
2954                                 btrfs_free_fs_root(tmp_root);
2955                 } else if (key.type == BTRFS_ROOT_REF_KEY ||
2956                            key.type == BTRFS_ROOT_BACKREF_KEY) {
2957                         process_root_ref(leaf, path.slots[0], &key,
2958                                          root_cache);
2959                 }
2960 next:
2961                 path.slots[0]++;
2962         }
2963 out:
2964         btrfs_release_path(&path);
2965         if (err)
2966                 free_extent_cache_tree(&wc.shared);
2967         if (!cache_tree_empty(&wc.shared))
2968                 fprintf(stderr, "warning line %d\n", __LINE__);
2969
2970         return err;
2971 }
2972
2973 static int all_backpointers_checked(struct extent_record *rec, int print_errs)
2974 {
2975         struct list_head *cur = rec->backrefs.next;
2976         struct extent_backref *back;
2977         struct tree_backref *tback;
2978         struct data_backref *dback;
2979         u64 found = 0;
2980         int err = 0;
2981
2982         while(cur != &rec->backrefs) {
2983                 back = list_entry(cur, struct extent_backref, list);
2984                 cur = cur->next;
2985                 if (!back->found_extent_tree) {
2986                         err = 1;
2987                         if (!print_errs)
2988                                 goto out;
2989                         if (back->is_data) {
2990                                 dback = (struct data_backref *)back;
2991                                 fprintf(stderr, "Backref %llu %s %llu"
2992                                         " owner %llu offset %llu num_refs %lu"
2993                                         " not found in extent tree\n",
2994                                         (unsigned long long)rec->start,
2995                                         back->full_backref ?
2996                                         "parent" : "root",
2997                                         back->full_backref ?
2998                                         (unsigned long long)dback->parent:
2999                                         (unsigned long long)dback->root,
3000                                         (unsigned long long)dback->owner,
3001                                         (unsigned long long)dback->offset,
3002                                         (unsigned long)dback->num_refs);
3003                         } else {
3004                                 tback = (struct tree_backref *)back;
3005                                 fprintf(stderr, "Backref %llu parent %llu"
3006                                         " root %llu not found in extent tree\n",
3007                                         (unsigned long long)rec->start,
3008                                         (unsigned long long)tback->parent,
3009                                         (unsigned long long)tback->root);
3010                         }
3011                 }
3012                 if (!back->is_data && !back->found_ref) {
3013                         err = 1;
3014                         if (!print_errs)
3015                                 goto out;
3016                         tback = (struct tree_backref *)back;
3017                         fprintf(stderr, "Backref %llu %s %llu not referenced back %p\n",
3018                                 (unsigned long long)rec->start,
3019                                 back->full_backref ? "parent" : "root",
3020                                 back->full_backref ?
3021                                 (unsigned long long)tback->parent :
3022                                 (unsigned long long)tback->root, back);
3023                 }
3024                 if (back->is_data) {
3025                         dback = (struct data_backref *)back;
3026                         if (dback->found_ref != dback->num_refs) {
3027                                 err = 1;
3028                                 if (!print_errs)
3029                                         goto out;
3030                                 fprintf(stderr, "Incorrect local backref count"
3031                                         " on %llu %s %llu owner %llu"
3032                                         " offset %llu found %u wanted %u back %p\n",
3033                                         (unsigned long long)rec->start,
3034                                         back->full_backref ?
3035                                         "parent" : "root",
3036                                         back->full_backref ?
3037                                         (unsigned long long)dback->parent:
3038                                         (unsigned long long)dback->root,
3039                                         (unsigned long long)dback->owner,
3040                                         (unsigned long long)dback->offset,
3041                                         dback->found_ref, dback->num_refs, back);
3042                         }
3043                         if (dback->disk_bytenr != rec->start) {
3044                                 err = 1;
3045                                 if (!print_errs)
3046                                         goto out;
3047                                 fprintf(stderr, "Backref disk bytenr does not"
3048                                         " match extent record, bytenr=%llu, "
3049                                         "ref bytenr=%llu\n",
3050                                         (unsigned long long)rec->start,
3051                                         (unsigned long long)dback->disk_bytenr);
3052                         }
3053
3054                         if (dback->bytes != rec->nr) {
3055                                 err = 1;
3056                                 if (!print_errs)
3057                                         goto out;
3058                                 fprintf(stderr, "Backref bytes do not match "
3059                                         "extent backref, bytenr=%llu, ref "
3060                                         "bytes=%llu, backref bytes=%llu\n",
3061                                         (unsigned long long)rec->start,
3062                                         (unsigned long long)rec->nr,
3063                                         (unsigned long long)dback->bytes);
3064                         }
3065                 }
3066                 if (!back->is_data) {
3067                         found += 1;
3068                 } else {
3069                         dback = (struct data_backref *)back;
3070                         found += dback->found_ref;
3071                 }
3072         }
3073         if (found != rec->refs) {
3074                 err = 1;
3075                 if (!print_errs)
3076                         goto out;
3077                 fprintf(stderr, "Incorrect global backref count "
3078                         "on %llu found %llu wanted %llu\n",
3079                         (unsigned long long)rec->start,
3080                         (unsigned long long)found,
3081                         (unsigned long long)rec->refs);
3082         }
3083 out:
3084         return err;
3085 }
3086
3087 static int free_all_extent_backrefs(struct extent_record *rec)
3088 {
3089         struct extent_backref *back;
3090         struct list_head *cur;
3091         while (!list_empty(&rec->backrefs)) {
3092                 cur = rec->backrefs.next;
3093                 back = list_entry(cur, struct extent_backref, list);
3094                 list_del(cur);
3095                 free(back);
3096         }
3097         return 0;
3098 }
3099
3100 static void free_extent_record_cache(struct btrfs_fs_info *fs_info,
3101                                      struct cache_tree *extent_cache)
3102 {
3103         struct cache_extent *cache;
3104         struct extent_record *rec;
3105
3106         while (1) {
3107                 cache = first_cache_extent(extent_cache);
3108                 if (!cache)
3109                         break;
3110                 rec = container_of(cache, struct extent_record, cache);
3111                 btrfs_unpin_extent(fs_info, rec->start, rec->max_size);
3112                 remove_cache_extent(extent_cache, cache);
3113                 free_all_extent_backrefs(rec);
3114                 free(rec);
3115         }
3116 }
3117
3118 static int maybe_free_extent_rec(struct cache_tree *extent_cache,
3119                                  struct extent_record *rec)
3120 {
3121         if (rec->content_checked && rec->owner_ref_checked &&
3122             rec->extent_item_refs == rec->refs && rec->refs > 0 &&
3123             rec->num_duplicates == 0 && !all_backpointers_checked(rec, 0)) {
3124                 remove_cache_extent(extent_cache, &rec->cache);
3125                 free_all_extent_backrefs(rec);
3126                 list_del_init(&rec->list);
3127                 free(rec);
3128         }
3129         return 0;
3130 }
3131
3132 static int check_owner_ref(struct btrfs_root *root,
3133                             struct extent_record *rec,
3134                             struct extent_buffer *buf)
3135 {
3136         struct extent_backref *node;
3137         struct tree_backref *back;
3138         struct btrfs_root *ref_root;
3139         struct btrfs_key key;
3140         struct btrfs_path path;
3141         struct extent_buffer *parent;
3142         int level;
3143         int found = 0;
3144         int ret;
3145
3146         list_for_each_entry(node, &rec->backrefs, list) {
3147                 if (node->is_data)
3148                         continue;
3149                 if (!node->found_ref)
3150                         continue;
3151                 if (node->full_backref)
3152                         continue;
3153                 back = (struct tree_backref *)node;
3154                 if (btrfs_header_owner(buf) == back->root)
3155                         return 0;
3156         }
3157         BUG_ON(rec->is_root);
3158
3159         /* try to find the block by search corresponding fs tree */
3160         key.objectid = btrfs_header_owner(buf);
3161         key.type = BTRFS_ROOT_ITEM_KEY;
3162         key.offset = (u64)-1;
3163
3164         ref_root = btrfs_read_fs_root(root->fs_info, &key);
3165         if (IS_ERR(ref_root))
3166                 return 1;
3167
3168         level = btrfs_header_level(buf);
3169         if (level == 0)
3170                 btrfs_item_key_to_cpu(buf, &key, 0);
3171         else
3172                 btrfs_node_key_to_cpu(buf, &key, 0);
3173
3174         btrfs_init_path(&path);
3175         path.lowest_level = level + 1;
3176         ret = btrfs_search_slot(NULL, ref_root, &key, &path, 0, 0);
3177         if (ret < 0)
3178                 return 0;
3179
3180         parent = path.nodes[level + 1];
3181         if (parent && buf->start == btrfs_node_blockptr(parent,
3182                                                         path.slots[level + 1]))
3183                 found = 1;
3184
3185         btrfs_release_path(&path);
3186         return found ? 0 : 1;
3187 }
3188
3189 static int is_extent_tree_record(struct extent_record *rec)
3190 {
3191         struct list_head *cur = rec->backrefs.next;
3192         struct extent_backref *node;
3193         struct tree_backref *back;
3194         int is_extent = 0;
3195
3196         while(cur != &rec->backrefs) {
3197                 node = list_entry(cur, struct extent_backref, list);
3198                 cur = cur->next;
3199                 if (node->is_data)
3200                         return 0;
3201                 back = (struct tree_backref *)node;
3202                 if (node->full_backref)
3203                         return 0;
3204                 if (back->root == BTRFS_EXTENT_TREE_OBJECTID)
3205                         is_extent = 1;
3206         }
3207         return is_extent;
3208 }
3209
3210
3211 static int record_bad_block_io(struct btrfs_fs_info *info,
3212                                struct cache_tree *extent_cache,
3213                                u64 start, u64 len)
3214 {
3215         struct extent_record *rec;
3216         struct cache_extent *cache;
3217         struct btrfs_key key;
3218
3219         cache = lookup_cache_extent(extent_cache, start, len);
3220         if (!cache)
3221                 return 0;
3222
3223         rec = container_of(cache, struct extent_record, cache);
3224         if (!is_extent_tree_record(rec))
3225                 return 0;
3226
3227         btrfs_disk_key_to_cpu(&key, &rec->parent_key);
3228         return btrfs_add_corrupt_extent_record(info, &key, start, len, 0);
3229 }
3230
3231 static int swap_values(struct btrfs_root *root, struct btrfs_path *path,
3232                        struct extent_buffer *buf, int slot)
3233 {
3234         if (btrfs_header_level(buf)) {
3235                 struct btrfs_key_ptr ptr1, ptr2;
3236
3237                 read_extent_buffer(buf, &ptr1, btrfs_node_key_ptr_offset(slot),
3238                                    sizeof(struct btrfs_key_ptr));
3239                 read_extent_buffer(buf, &ptr2,
3240                                    btrfs_node_key_ptr_offset(slot + 1),
3241                                    sizeof(struct btrfs_key_ptr));
3242                 write_extent_buffer(buf, &ptr1,
3243                                     btrfs_node_key_ptr_offset(slot + 1),
3244                                     sizeof(struct btrfs_key_ptr));
3245                 write_extent_buffer(buf, &ptr2,
3246                                     btrfs_node_key_ptr_offset(slot),
3247                                     sizeof(struct btrfs_key_ptr));
3248                 if (slot == 0) {
3249                         struct btrfs_disk_key key;
3250                         btrfs_node_key(buf, &key, 0);
3251                         btrfs_fixup_low_keys(root, path, &key,
3252                                              btrfs_header_level(buf) + 1);
3253                 }
3254         } else {
3255                 struct btrfs_item *item1, *item2;
3256                 struct btrfs_key k1, k2;
3257                 char *item1_data, *item2_data;
3258                 u32 item1_offset, item2_offset, item1_size, item2_size;
3259
3260                 item1 = btrfs_item_nr(slot);
3261                 item2 = btrfs_item_nr(slot + 1);
3262                 btrfs_item_key_to_cpu(buf, &k1, slot);
3263                 btrfs_item_key_to_cpu(buf, &k2, slot + 1);
3264                 item1_offset = btrfs_item_offset(buf, item1);
3265                 item2_offset = btrfs_item_offset(buf, item2);
3266                 item1_size = btrfs_item_size(buf, item1);
3267                 item2_size = btrfs_item_size(buf, item2);
3268
3269                 item1_data = malloc(item1_size);
3270                 if (!item1_data)
3271                         return -ENOMEM;
3272                 item2_data = malloc(item2_size);
3273                 if (!item2_data) {
3274                         free(item1_data);
3275                         return -ENOMEM;
3276                 }
3277
3278                 read_extent_buffer(buf, item1_data, item1_offset, item1_size);
3279                 read_extent_buffer(buf, item2_data, item2_offset, item2_size);
3280
3281                 write_extent_buffer(buf, item1_data, item2_offset, item2_size);
3282                 write_extent_buffer(buf, item2_data, item1_offset, item1_size);
3283                 free(item1_data);
3284                 free(item2_data);
3285
3286                 btrfs_set_item_offset(buf, item1, item2_offset);
3287                 btrfs_set_item_offset(buf, item2, item1_offset);
3288                 btrfs_set_item_size(buf, item1, item2_size);
3289                 btrfs_set_item_size(buf, item2, item1_size);
3290
3291                 path->slots[0] = slot;
3292                 btrfs_set_item_key_unsafe(root, path, &k2);
3293                 path->slots[0] = slot + 1;
3294                 btrfs_set_item_key_unsafe(root, path, &k1);
3295         }
3296         return 0;
3297 }
3298
3299 static int fix_key_order(struct btrfs_trans_handle *trans,
3300                          struct btrfs_root *root,
3301                          struct btrfs_path *path)
3302 {
3303         struct extent_buffer *buf;
3304         struct btrfs_key k1, k2;
3305         int i;
3306         int level = path->lowest_level;
3307         int ret;
3308
3309         buf = path->nodes[level];
3310         for (i = 0; i < btrfs_header_nritems(buf) - 1; i++) {
3311                 if (level) {
3312                         btrfs_node_key_to_cpu(buf, &k1, i);
3313                         btrfs_node_key_to_cpu(buf, &k2, i + 1);
3314                 } else {
3315                         btrfs_item_key_to_cpu(buf, &k1, i);
3316                         btrfs_item_key_to_cpu(buf, &k2, i + 1);
3317                 }
3318                 if (btrfs_comp_cpu_keys(&k1, &k2) < 0)
3319                         continue;
3320                 ret = swap_values(root, path, buf, i);
3321                 if (ret)
3322                         break;
3323                 btrfs_mark_buffer_dirty(buf);
3324                 i = 0;
3325         }
3326         return ret;
3327 }
3328
3329 static int delete_bogus_item(struct btrfs_trans_handle *trans,
3330                              struct btrfs_root *root,
3331                              struct btrfs_path *path,
3332                              struct extent_buffer *buf, int slot)
3333 {
3334         struct btrfs_key key;
3335         int nritems = btrfs_header_nritems(buf);
3336
3337         btrfs_item_key_to_cpu(buf, &key, slot);
3338
3339         /* These are all the keys we can deal with missing. */
3340         if (key.type != BTRFS_DIR_INDEX_KEY &&
3341             key.type != BTRFS_EXTENT_ITEM_KEY &&
3342             key.type != BTRFS_METADATA_ITEM_KEY &&
3343             key.type != BTRFS_TREE_BLOCK_REF_KEY &&
3344             key.type != BTRFS_EXTENT_DATA_REF_KEY)
3345                 return -1;
3346
3347         printf("Deleting bogus item [%llu,%u,%llu] at slot %d on block %llu\n",
3348                (unsigned long long)key.objectid, key.type,
3349                (unsigned long long)key.offset, slot, buf->start);
3350         memmove_extent_buffer(buf, btrfs_item_nr_offset(slot),
3351                               btrfs_item_nr_offset(slot + 1),
3352                               sizeof(struct btrfs_item) *
3353                               (nritems - slot - 1));
3354         btrfs_set_header_nritems(buf, nritems - 1);
3355         if (slot == 0) {
3356                 struct btrfs_disk_key disk_key;
3357
3358                 btrfs_item_key(buf, &disk_key, 0);
3359                 btrfs_fixup_low_keys(root, path, &disk_key, 1);
3360         }
3361         btrfs_mark_buffer_dirty(buf);
3362         return 0;
3363 }
3364
3365 static int fix_item_offset(struct btrfs_trans_handle *trans,
3366                            struct btrfs_root *root,
3367                            struct btrfs_path *path)
3368 {
3369         struct extent_buffer *buf;
3370         int i;
3371         int ret = 0;
3372
3373         /* We should only get this for leaves */
3374         BUG_ON(path->lowest_level);
3375         buf = path->nodes[0];
3376 again:
3377         for (i = 0; i < btrfs_header_nritems(buf); i++) {
3378                 unsigned int shift = 0, offset;
3379
3380                 if (i == 0 && btrfs_item_end_nr(buf, i) !=
3381                     BTRFS_LEAF_DATA_SIZE(root)) {
3382                         if (btrfs_item_end_nr(buf, i) >
3383                             BTRFS_LEAF_DATA_SIZE(root)) {
3384                                 ret = delete_bogus_item(trans, root, path,
3385                                                         buf, i);
3386                                 if (!ret)
3387                                         goto again;
3388                                 fprintf(stderr, "item is off the end of the "
3389                                         "leaf, can't fix\n");
3390                                 ret = -EIO;
3391                                 break;
3392                         }
3393                         shift = BTRFS_LEAF_DATA_SIZE(root) -
3394                                 btrfs_item_end_nr(buf, i);
3395                 } else if (i > 0 && btrfs_item_end_nr(buf, i) !=
3396                            btrfs_item_offset_nr(buf, i - 1)) {
3397                         if (btrfs_item_end_nr(buf, i) >
3398                             btrfs_item_offset_nr(buf, i - 1)) {
3399                                 ret = delete_bogus_item(trans, root, path,
3400                                                         buf, i);
3401                                 if (!ret)
3402                                         goto again;
3403                                 fprintf(stderr, "items overlap, can't fix\n");
3404                                 ret = -EIO;
3405                                 break;
3406                         }
3407                         shift = btrfs_item_offset_nr(buf, i - 1) -
3408                                 btrfs_item_end_nr(buf, i);
3409                 }
3410                 if (!shift)
3411                         continue;
3412
3413                 printf("Shifting item nr %d by %u bytes in block %llu\n",
3414                        i, shift, (unsigned long long)buf->start);
3415                 offset = btrfs_item_offset_nr(buf, i);
3416                 memmove_extent_buffer(buf,
3417                                       btrfs_leaf_data(buf) + offset + shift,
3418                                       btrfs_leaf_data(buf) + offset,
3419                                       btrfs_item_size_nr(buf, i));
3420                 btrfs_set_item_offset(buf, btrfs_item_nr(i),
3421                                       offset + shift);
3422                 btrfs_mark_buffer_dirty(buf);
3423         }
3424
3425         /*
3426          * We may have moved things, in which case we want to exit so we don't
3427          * write those changes out.  Once we have proper abort functionality in
3428          * progs this can be changed to something nicer.
3429          */
3430         BUG_ON(ret);
3431         return ret;
3432 }
3433
3434 /*
3435  * Attempt to fix basic block failures.  If we can't fix it for whatever reason
3436  * then just return -EIO.
3437  */
3438 static int try_to_fix_bad_block(struct btrfs_trans_handle *trans,
3439                                 struct btrfs_root *root,
3440                                 struct extent_buffer *buf,
3441                                 enum btrfs_tree_block_status status)
3442 {
3443         struct ulist *roots;
3444         struct ulist_node *node;
3445         struct btrfs_root *search_root;
3446         struct btrfs_path *path;
3447         struct ulist_iterator iter;
3448         struct btrfs_key root_key, key;
3449         int ret;
3450
3451         if (status != BTRFS_TREE_BLOCK_BAD_KEY_ORDER &&
3452             status != BTRFS_TREE_BLOCK_INVALID_OFFSETS)
3453                 return -EIO;
3454
3455         path = btrfs_alloc_path();
3456         if (!path)
3457                 return -EIO;
3458
3459         ret = btrfs_find_all_roots(trans, root->fs_info, buf->start,
3460                                    0, &roots);
3461         if (ret) {
3462                 btrfs_free_path(path);
3463                 return -EIO;
3464         }
3465
3466         ULIST_ITER_INIT(&iter);
3467         while ((node = ulist_next(roots, &iter))) {
3468                 root_key.objectid = node->val;
3469                 root_key.type = BTRFS_ROOT_ITEM_KEY;
3470                 root_key.offset = (u64)-1;
3471
3472                 search_root = btrfs_read_fs_root(root->fs_info, &root_key);
3473                 if (IS_ERR(root)) {
3474                         ret = -EIO;
3475                         break;
3476                 }
3477
3478                 record_root_in_trans(trans, search_root);
3479
3480                 path->lowest_level = btrfs_header_level(buf);
3481                 path->skip_check_block = 1;
3482                 if (path->lowest_level)
3483                         btrfs_node_key_to_cpu(buf, &key, 0);
3484                 else
3485                         btrfs_item_key_to_cpu(buf, &key, 0);
3486                 ret = btrfs_search_slot(trans, search_root, &key, path, 0, 1);
3487                 if (ret) {
3488                         ret = -EIO;
3489                         break;
3490                 }
3491                 if (status == BTRFS_TREE_BLOCK_BAD_KEY_ORDER)
3492                         ret = fix_key_order(trans, search_root, path);
3493                 else if (status == BTRFS_TREE_BLOCK_INVALID_OFFSETS)
3494                         ret = fix_item_offset(trans, search_root, path);
3495                 if (ret)
3496                         break;
3497                 btrfs_release_path(path);
3498         }
3499         ulist_free(roots);
3500         btrfs_free_path(path);
3501         return ret;
3502 }
3503
3504 static int check_block(struct btrfs_trans_handle *trans,
3505                        struct btrfs_root *root,
3506                        struct cache_tree *extent_cache,
3507                        struct extent_buffer *buf, u64 flags)
3508 {
3509         struct extent_record *rec;
3510         struct cache_extent *cache;
3511         struct btrfs_key key;
3512         enum btrfs_tree_block_status status;
3513         int ret = 0;
3514         int level;
3515
3516         cache = lookup_cache_extent(extent_cache, buf->start, buf->len);
3517         if (!cache)
3518                 return 1;
3519         rec = container_of(cache, struct extent_record, cache);
3520         rec->generation = btrfs_header_generation(buf);
3521
3522         level = btrfs_header_level(buf);
3523         if (btrfs_header_nritems(buf) > 0) {
3524
3525                 if (level == 0)
3526                         btrfs_item_key_to_cpu(buf, &key, 0);
3527                 else
3528                         btrfs_node_key_to_cpu(buf, &key, 0);
3529
3530                 rec->info_objectid = key.objectid;
3531         }
3532         rec->info_level = level;
3533
3534         if (btrfs_is_leaf(buf))
3535                 status = btrfs_check_leaf(root, &rec->parent_key, buf);
3536         else
3537                 status = btrfs_check_node(root, &rec->parent_key, buf);
3538
3539         if (status != BTRFS_TREE_BLOCK_CLEAN) {
3540                 if (repair)
3541                         status = try_to_fix_bad_block(trans, root, buf,
3542                                                       status);
3543                 if (status != BTRFS_TREE_BLOCK_CLEAN) {
3544                         ret = -EIO;
3545                         fprintf(stderr, "bad block %llu\n",
3546                                 (unsigned long long)buf->start);
3547                 } else {
3548                         /*
3549                          * Signal to callers we need to start the scan over
3550                          * again since we'll have cow'ed blocks.
3551                          */
3552                         ret = -EAGAIN;
3553                 }
3554         } else {
3555                 rec->content_checked = 1;
3556                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3557                         rec->owner_ref_checked = 1;
3558                 else {
3559                         ret = check_owner_ref(root, rec, buf);
3560                         if (!ret)
3561                                 rec->owner_ref_checked = 1;
3562                 }
3563         }
3564         if (!ret)
3565                 maybe_free_extent_rec(extent_cache, rec);
3566         return ret;
3567 }
3568
3569 static struct tree_backref *find_tree_backref(struct extent_record *rec,
3570                                                 u64 parent, u64 root)
3571 {
3572         struct list_head *cur = rec->backrefs.next;
3573         struct extent_backref *node;
3574         struct tree_backref *back;
3575
3576         while(cur != &rec->backrefs) {
3577                 node = list_entry(cur, struct extent_backref, list);
3578                 cur = cur->next;
3579                 if (node->is_data)
3580                         continue;
3581                 back = (struct tree_backref *)node;
3582                 if (parent > 0) {
3583                         if (!node->full_backref)
3584                                 continue;
3585                         if (parent == back->parent)
3586                                 return back;
3587                 } else {
3588                         if (node->full_backref)
3589                                 continue;
3590                         if (back->root == root)
3591                                 return back;
3592                 }
3593         }
3594         return NULL;
3595 }
3596
3597 static struct tree_backref *alloc_tree_backref(struct extent_record *rec,
3598                                                 u64 parent, u64 root)
3599 {
3600         struct tree_backref *ref = malloc(sizeof(*ref));
3601         memset(&ref->node, 0, sizeof(ref->node));
3602         if (parent > 0) {
3603                 ref->parent = parent;
3604                 ref->node.full_backref = 1;
3605         } else {
3606                 ref->root = root;
3607                 ref->node.full_backref = 0;
3608         }
3609         list_add_tail(&ref->node.list, &rec->backrefs);
3610
3611         return ref;
3612 }
3613
3614 static struct data_backref *find_data_backref(struct extent_record *rec,
3615                                                 u64 parent, u64 root,
3616                                                 u64 owner, u64 offset,
3617                                                 int found_ref,
3618                                                 u64 disk_bytenr, u64 bytes)
3619 {
3620         struct list_head *cur = rec->backrefs.next;
3621         struct extent_backref *node;
3622         struct data_backref *back;
3623
3624         while(cur != &rec->backrefs) {
3625                 node = list_entry(cur, struct extent_backref, list);
3626                 cur = cur->next;
3627                 if (!node->is_data)
3628                         continue;
3629                 back = (struct data_backref *)node;
3630                 if (parent > 0) {
3631                         if (!node->full_backref)
3632                                 continue;
3633                         if (parent == back->parent)
3634                                 return back;
3635                 } else {
3636                         if (node->full_backref)
3637                                 continue;
3638                         if (back->root == root && back->owner == owner &&
3639                             back->offset == offset) {
3640                                 if (found_ref && node->found_ref &&
3641                                     (back->bytes != bytes ||
3642                                     back->disk_bytenr != disk_bytenr))
3643                                         continue;
3644                                 return back;
3645                         }
3646                 }
3647         }
3648         return NULL;
3649 }
3650
3651 static struct data_backref *alloc_data_backref(struct extent_record *rec,
3652                                                 u64 parent, u64 root,
3653                                                 u64 owner, u64 offset,
3654                                                 u64 max_size)
3655 {
3656         struct data_backref *ref = malloc(sizeof(*ref));
3657         memset(&ref->node, 0, sizeof(ref->node));
3658         ref->node.is_data = 1;
3659
3660         if (parent > 0) {
3661                 ref->parent = parent;
3662                 ref->owner = 0;
3663                 ref->offset = 0;
3664                 ref->node.full_backref = 1;
3665         } else {
3666                 ref->root = root;
3667                 ref->owner = owner;
3668                 ref->offset = offset;
3669                 ref->node.full_backref = 0;
3670         }
3671         ref->bytes = max_size;
3672         ref->found_ref = 0;
3673         ref->num_refs = 0;
3674         list_add_tail(&ref->node.list, &rec->backrefs);
3675         if (max_size > rec->max_size)
3676                 rec->max_size = max_size;
3677         return ref;
3678 }
3679
3680 static int add_extent_rec(struct cache_tree *extent_cache,
3681                           struct btrfs_key *parent_key, u64 parent_gen,
3682                           u64 start, u64 nr, u64 extent_item_refs,
3683                           int is_root, int inc_ref, int set_checked,
3684                           int metadata, int extent_rec, u64 max_size)
3685 {
3686         struct extent_record *rec;
3687         struct cache_extent *cache;
3688         int ret = 0;
3689         int dup = 0;
3690
3691         cache = lookup_cache_extent(extent_cache, start, nr);
3692         if (cache) {
3693                 rec = container_of(cache, struct extent_record, cache);
3694                 if (inc_ref)
3695                         rec->refs++;
3696                 if (rec->nr == 1)
3697                         rec->nr = max(nr, max_size);
3698
3699                 /*
3700                  * We need to make sure to reset nr to whatever the extent
3701                  * record says was the real size, this way we can compare it to
3702                  * the backrefs.
3703                  */
3704                 if (extent_rec) {
3705                         if (start != rec->start || rec->found_rec) {
3706                                 struct extent_record *tmp;
3707
3708                                 dup = 1;
3709                                 if (list_empty(&rec->list))
3710                                         list_add_tail(&rec->list,
3711                                                       &duplicate_extents);
3712
3713                                 /*
3714                                  * We have to do this song and dance in case we
3715                                  * find an extent record that falls inside of
3716                                  * our current extent record but does not have
3717                                  * the same objectid.
3718                                  */
3719                                 tmp = malloc(sizeof(*tmp));
3720                                 if (!tmp)
3721                                         return -ENOMEM;
3722                                 tmp->start = start;
3723                                 tmp->max_size = max_size;
3724                                 tmp->nr = nr;
3725                                 tmp->found_rec = 1;
3726                                 tmp->metadata = metadata;
3727                                 tmp->extent_item_refs = extent_item_refs;
3728                                 INIT_LIST_HEAD(&tmp->list);
3729                                 list_add_tail(&tmp->list, &rec->dups);
3730                                 rec->num_duplicates++;
3731                         } else {
3732                                 rec->nr = nr;
3733                                 rec->found_rec = 1;
3734                         }
3735                 }
3736
3737                 if (extent_item_refs && !dup) {
3738                         if (rec->extent_item_refs) {
3739                                 fprintf(stderr, "block %llu rec "
3740                                         "extent_item_refs %llu, passed %llu\n",
3741                                         (unsigned long long)start,
3742                                         (unsigned long long)
3743                                                         rec->extent_item_refs,
3744                                         (unsigned long long)extent_item_refs);
3745                         }
3746                         rec->extent_item_refs = extent_item_refs;
3747                 }
3748                 if (is_root)
3749                         rec->is_root = 1;
3750                 if (set_checked) {
3751                         rec->content_checked = 1;
3752                         rec->owner_ref_checked = 1;
3753                 }
3754
3755                 if (parent_key)
3756                         btrfs_cpu_key_to_disk(&rec->parent_key, parent_key);
3757                 if (parent_gen)
3758                         rec->parent_generation = parent_gen;
3759
3760                 if (rec->max_size < max_size)
3761                         rec->max_size = max_size;
3762
3763                 maybe_free_extent_rec(extent_cache, rec);
3764                 return ret;
3765         }
3766         rec = malloc(sizeof(*rec));
3767         rec->start = start;
3768         rec->max_size = max_size;
3769         rec->nr = max(nr, max_size);
3770         rec->found_rec = !!extent_rec;
3771         rec->content_checked = 0;
3772         rec->owner_ref_checked = 0;
3773         rec->num_duplicates = 0;
3774         rec->metadata = metadata;
3775         INIT_LIST_HEAD(&rec->backrefs);
3776         INIT_LIST_HEAD(&rec->dups);
3777         INIT_LIST_HEAD(&rec->list);
3778
3779         if (is_root)
3780                 rec->is_root = 1;
3781         else
3782                 rec->is_root = 0;
3783
3784         if (inc_ref)
3785                 rec->refs = 1;
3786         else
3787                 rec->refs = 0;
3788
3789         if (extent_item_refs)
3790                 rec->extent_item_refs = extent_item_refs;
3791         else
3792                 rec->extent_item_refs = 0;
3793
3794         if (parent_key)
3795                 btrfs_cpu_key_to_disk(&rec->parent_key, parent_key);
3796         else
3797                 memset(&rec->parent_key, 0, sizeof(*parent_key));
3798
3799         if (parent_gen)
3800                 rec->parent_generation = parent_gen;
3801         else
3802                 rec->parent_generation = 0;
3803
3804         rec->cache.start = start;
3805         rec->cache.size = nr;
3806         ret = insert_cache_extent(extent_cache, &rec->cache);
3807         BUG_ON(ret);
3808         bytes_used += nr;
3809         if (set_checked) {
3810                 rec->content_checked = 1;
3811                 rec->owner_ref_checked = 1;
3812         }
3813         return ret;
3814 }
3815
3816 static int add_tree_backref(struct cache_tree *extent_cache, u64 bytenr,
3817                             u64 parent, u64 root, int found_ref)
3818 {
3819         struct extent_record *rec;
3820         struct tree_backref *back;
3821         struct cache_extent *cache;
3822
3823         cache = lookup_cache_extent(extent_cache, bytenr, 1);
3824         if (!cache) {
3825                 add_extent_rec(extent_cache, NULL, 0, bytenr,
3826                                1, 0, 0, 0, 0, 1, 0, 0);
3827                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
3828                 if (!cache)
3829                         abort();
3830         }
3831
3832         rec = container_of(cache, struct extent_record, cache);
3833         if (rec->start != bytenr) {
3834                 abort();
3835         }
3836
3837         back = find_tree_backref(rec, parent, root);
3838         if (!back)
3839                 back = alloc_tree_backref(rec, parent, root);
3840
3841         if (found_ref) {
3842                 if (back->node.found_ref) {
3843                         fprintf(stderr, "Extent back ref already exists "
3844                                 "for %llu parent %llu root %llu \n",
3845                                 (unsigned long long)bytenr,
3846                                 (unsigned long long)parent,
3847                                 (unsigned long long)root);
3848                 }
3849                 back->node.found_ref = 1;
3850         } else {
3851                 if (back->node.found_extent_tree) {
3852                         fprintf(stderr, "Extent back ref already exists "
3853                                 "for %llu parent %llu root %llu \n",
3854                                 (unsigned long long)bytenr,
3855                                 (unsigned long long)parent,
3856                                 (unsigned long long)root);
3857                 }
3858                 back->node.found_extent_tree = 1;
3859         }
3860         maybe_free_extent_rec(extent_cache, rec);
3861         return 0;
3862 }
3863
3864 static int add_data_backref(struct cache_tree *extent_cache, u64 bytenr,
3865                             u64 parent, u64 root, u64 owner, u64 offset,
3866                             u32 num_refs, int found_ref, u64 max_size)
3867 {
3868         struct extent_record *rec;
3869         struct data_backref *back;
3870         struct cache_extent *cache;
3871
3872         cache = lookup_cache_extent(extent_cache, bytenr, 1);
3873         if (!cache) {
3874                 add_extent_rec(extent_cache, NULL, 0, bytenr, 1, 0, 0, 0, 0,
3875                                0, 0, max_size);
3876                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
3877                 if (!cache)
3878                         abort();
3879         }
3880
3881         rec = container_of(cache, struct extent_record, cache);
3882         if (rec->max_size < max_size)
3883                 rec->max_size = max_size;
3884
3885         /*
3886          * If found_ref is set then max_size is the real size and must match the
3887          * existing refs.  So if we have already found a ref then we need to
3888          * make sure that this ref matches the existing one, otherwise we need
3889          * to add a new backref so we can notice that the backrefs don't match
3890          * and we need to figure out who is telling the truth.  This is to
3891          * account for that awful fsync bug I introduced where we'd end up with
3892          * a btrfs_file_extent_item that would have its length include multiple
3893          * prealloc extents or point inside of a prealloc extent.
3894          */
3895         back = find_data_backref(rec, parent, root, owner, offset, found_ref,
3896                                  bytenr, max_size);
3897         if (!back)
3898                 back = alloc_data_backref(rec, parent, root, owner, offset,
3899                                           max_size);
3900
3901         if (found_ref) {
3902                 BUG_ON(num_refs != 1);
3903                 if (back->node.found_ref)
3904                         BUG_ON(back->bytes != max_size);
3905                 back->node.found_ref = 1;
3906                 back->found_ref += 1;
3907                 back->bytes = max_size;
3908                 back->disk_bytenr = bytenr;
3909                 rec->refs += 1;
3910                 rec->content_checked = 1;
3911                 rec->owner_ref_checked = 1;
3912         } else {
3913                 if (back->node.found_extent_tree) {
3914                         fprintf(stderr, "Extent back ref already exists "
3915                                 "for %llu parent %llu root %llu "
3916                                 "owner %llu offset %llu num_refs %lu\n",
3917                                 (unsigned long long)bytenr,
3918                                 (unsigned long long)parent,
3919                                 (unsigned long long)root,
3920                                 (unsigned long long)owner,
3921                                 (unsigned long long)offset,
3922                                 (unsigned long)num_refs);
3923                 }
3924                 back->num_refs = num_refs;
3925                 back->node.found_extent_tree = 1;
3926         }
3927         maybe_free_extent_rec(extent_cache, rec);
3928         return 0;
3929 }
3930
3931 static int add_pending(struct cache_tree *pending,
3932                        struct cache_tree *seen, u64 bytenr, u32 size)
3933 {
3934         int ret;
3935         ret = add_cache_extent(seen, bytenr, size);
3936         if (ret)
3937                 return ret;
3938         add_cache_extent(pending, bytenr, size);
3939         return 0;
3940 }
3941
3942 static int pick_next_pending(struct cache_tree *pending,
3943                         struct cache_tree *reada,
3944                         struct cache_tree *nodes,
3945                         u64 last, struct block_info *bits, int bits_nr,
3946                         int *reada_bits)
3947 {
3948         unsigned long node_start = last;
3949         struct cache_extent *cache;
3950         int ret;
3951
3952         cache = search_cache_extent(reada, 0);
3953         if (cache) {
3954                 bits[0].start = cache->start;
3955                 bits[0].size = cache->size;
3956                 *reada_bits = 1;
3957                 return 1;
3958         }
3959         *reada_bits = 0;
3960         if (node_start > 32768)
3961                 node_start -= 32768;
3962
3963         cache = search_cache_extent(nodes, node_start);
3964         if (!cache)
3965                 cache = search_cache_extent(nodes, 0);
3966
3967         if (!cache) {
3968                  cache = search_cache_extent(pending, 0);
3969                  if (!cache)
3970                          return 0;
3971                  ret = 0;
3972                  do {
3973                          bits[ret].start = cache->start;
3974                          bits[ret].size = cache->size;
3975                          cache = next_cache_extent(cache);
3976                          ret++;
3977                  } while (cache && ret < bits_nr);
3978                  return ret;
3979         }
3980
3981         ret = 0;
3982         do {
3983                 bits[ret].start = cache->start;
3984                 bits[ret].size = cache->size;
3985                 cache = next_cache_extent(cache);
3986                 ret++;
3987         } while (cache && ret < bits_nr);
3988
3989         if (bits_nr - ret > 8) {
3990                 u64 lookup = bits[0].start + bits[0].size;
3991                 struct cache_extent *next;
3992                 next = search_cache_extent(pending, lookup);
3993                 while(next) {
3994                         if (next->start - lookup > 32768)
3995                                 break;
3996                         bits[ret].start = next->start;
3997                         bits[ret].size = next->size;
3998                         lookup = next->start + next->size;
3999                         ret++;
4000                         if (ret == bits_nr)
4001                                 break;
4002                         next = next_cache_extent(next);
4003                         if (!next)
4004                                 break;
4005                 }
4006         }
4007         return ret;
4008 }
4009
4010 static void free_chunk_record(struct cache_extent *cache)
4011 {
4012         struct chunk_record *rec;
4013
4014         rec = container_of(cache, struct chunk_record, cache);
4015         list_del_init(&rec->list);
4016         list_del_init(&rec->dextents);
4017         free(rec);
4018 }
4019
4020 void free_chunk_cache_tree(struct cache_tree *chunk_cache)
4021 {
4022         cache_tree_free_extents(chunk_cache, free_chunk_record);
4023 }
4024
4025 static void free_device_record(struct rb_node *node)
4026 {
4027         struct device_record *rec;
4028
4029         rec = container_of(node, struct device_record, node);
4030         free(rec);
4031 }
4032
4033 FREE_RB_BASED_TREE(device_cache, free_device_record);
4034
4035 int insert_block_group_record(struct block_group_tree *tree,
4036                               struct block_group_record *bg_rec)
4037 {
4038         int ret;
4039
4040         ret = insert_cache_extent(&tree->tree, &bg_rec->cache);
4041         if (ret)
4042                 return ret;
4043
4044         list_add_tail(&bg_rec->list, &tree->block_groups);
4045         return 0;
4046 }
4047
4048 static void free_block_group_record(struct cache_extent *cache)
4049 {
4050         struct block_group_record *rec;
4051
4052         rec = container_of(cache, struct block_group_record, cache);
4053         list_del_init(&rec->list);
4054         free(rec);
4055 }
4056
4057 void free_block_group_tree(struct block_group_tree *tree)
4058 {
4059         cache_tree_free_extents(&tree->tree, free_block_group_record);
4060 }
4061
4062 int insert_device_extent_record(struct device_extent_tree *tree,
4063                                 struct device_extent_record *de_rec)
4064 {
4065         int ret;
4066
4067         /*
4068          * Device extent is a bit different from the other extents, because
4069          * the extents which belong to the different devices may have the
4070          * same start and size, so we need use the special extent cache
4071          * search/insert functions.
4072          */
4073         ret = insert_cache_extent2(&tree->tree, &de_rec->cache);
4074         if (ret)
4075                 return ret;
4076
4077         list_add_tail(&de_rec->chunk_list, &tree->no_chunk_orphans);
4078         list_add_tail(&de_rec->device_list, &tree->no_device_orphans);
4079         return 0;
4080 }
4081
4082 static void free_device_extent_record(struct cache_extent *cache)
4083 {
4084         struct device_extent_record *rec;
4085
4086         rec = container_of(cache, struct device_extent_record, cache);
4087         if (!list_empty(&rec->chunk_list))
4088                 list_del_init(&rec->chunk_list);
4089         if (!list_empty(&rec->device_list))
4090                 list_del_init(&rec->device_list);
4091         free(rec);
4092 }
4093
4094 void free_device_extent_tree(struct device_extent_tree *tree)
4095 {
4096         cache_tree_free_extents(&tree->tree, free_device_extent_record);
4097 }
4098
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Turn the v0 extent ref item at @slot of @leaf into a backref in
 * @extent_cache.
 *
 * A ref whose objectid is below BTRFS_FIRST_FREE_OBJECTID becomes a tree
 * backref, anything else a data backref carrying the item's ref count.  In
 * both cases the key's objectid is the extent start and the key's offset is
 * passed as the parent.  Always returns 0.
 */
static int process_extent_ref_v0(struct cache_tree *extent_cache,
				 struct extent_buffer *leaf, int slot)
{
	struct btrfs_key key;
	struct btrfs_extent_ref_v0 *ref0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	ref0 = btrfs_item_ptr(leaf, slot, struct btrfs_extent_ref_v0);

	if (btrfs_ref_objectid_v0(leaf, ref0) >= BTRFS_FIRST_FREE_OBJECTID) {
		add_data_backref(extent_cache, key.objectid, key.offset, 0,
				 0, 0, btrfs_ref_count_v0(leaf, ref0), 0, 0);
		return 0;
	}

	add_tree_backref(extent_cache, key.objectid, key.offset, 0, 0);
	return 0;
}
#endif
4117
4118 struct chunk_record *btrfs_new_chunk_record(struct extent_buffer *leaf,
4119                                             struct btrfs_key *key,
4120                                             int slot)
4121 {
4122         struct btrfs_chunk *ptr;
4123         struct chunk_record *rec;
4124         int num_stripes, i;
4125
4126         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4127         num_stripes = btrfs_chunk_num_stripes(leaf, ptr);
4128
4129         rec = malloc(btrfs_chunk_record_size(num_stripes));
4130         if (!rec) {
4131                 fprintf(stderr, "memory allocation failed\n");
4132                 exit(-1);
4133         }
4134
4135         memset(rec, 0, btrfs_chunk_record_size(num_stripes));
4136
4137         INIT_LIST_HEAD(&rec->list);
4138         INIT_LIST_HEAD(&rec->dextents);
4139         rec->bg_rec = NULL;
4140
4141         rec->cache.start = key->offset;
4142         rec->cache.size = btrfs_chunk_length(leaf, ptr);
4143
4144         rec->generation = btrfs_header_generation(leaf);
4145
4146         rec->objectid = key->objectid;
4147         rec->type = key->type;
4148         rec->offset = key->offset;
4149
4150         rec->length = rec->cache.size;
4151         rec->owner = btrfs_chunk_owner(leaf, ptr);
4152         rec->stripe_len = btrfs_chunk_stripe_len(leaf, ptr);
4153         rec->type_flags = btrfs_chunk_type(leaf, ptr);
4154         rec->io_width = btrfs_chunk_io_width(leaf, ptr);
4155         rec->io_align = btrfs_chunk_io_align(leaf, ptr);
4156         rec->sector_size = btrfs_chunk_sector_size(leaf, ptr);
4157         rec->num_stripes = num_stripes;
4158         rec->sub_stripes = btrfs_chunk_sub_stripes(leaf, ptr);
4159
4160         for (i = 0; i < rec->num_stripes; ++i) {
4161                 rec->stripes[i].devid =
4162                         btrfs_stripe_devid_nr(leaf, ptr, i);
4163                 rec->stripes[i].offset =
4164                         btrfs_stripe_offset_nr(leaf, ptr, i);
4165                 read_extent_buffer(leaf, rec->stripes[i].dev_uuid,
4166                                 (unsigned long)btrfs_stripe_dev_uuid_nr(ptr, i),
4167                                 BTRFS_UUID_SIZE);
4168         }
4169
4170         return rec;
4171 }
4172
4173 static int process_chunk_item(struct cache_tree *chunk_cache,
4174                               struct btrfs_key *key, struct extent_buffer *eb,
4175                               int slot)
4176 {
4177         struct chunk_record *rec;
4178         int ret = 0;
4179
4180         rec = btrfs_new_chunk_record(eb, key, slot);
4181         ret = insert_cache_extent(chunk_cache, &rec->cache);
4182         if (ret) {
4183                 fprintf(stderr, "Chunk[%llu, %llu] existed.\n",
4184                         rec->offset, rec->length);
4185                 free(rec);
4186         }
4187
4188         return ret;
4189 }
4190
4191 static int process_device_item(struct rb_root *dev_cache,
4192                 struct btrfs_key *key, struct extent_buffer *eb, int slot)
4193 {
4194         struct btrfs_dev_item *ptr;
4195         struct device_record *rec;
4196         int ret = 0;
4197
4198         ptr = btrfs_item_ptr(eb,
4199                 slot, struct btrfs_dev_item);
4200
4201         rec = malloc(sizeof(*rec));
4202         if (!rec) {
4203                 fprintf(stderr, "memory allocation failed\n");
4204                 return -ENOMEM;
4205         }
4206
4207         rec->devid = key->offset;
4208         rec->generation = btrfs_header_generation(eb);
4209
4210         rec->objectid = key->objectid;
4211         rec->type = key->type;
4212         rec->offset = key->offset;
4213
4214         rec->devid = btrfs_device_id(eb, ptr);
4215         rec->total_byte = btrfs_device_total_bytes(eb, ptr);
4216         rec->byte_used = btrfs_device_bytes_used(eb, ptr);
4217
4218         ret = rb_insert(dev_cache, &rec->node, device_record_compare);
4219         if (ret) {
4220                 fprintf(stderr, "Device[%llu] existed.\n", rec->devid);
4221                 free(rec);
4222         }
4223
4224         return ret;
4225 }
4226
4227 struct block_group_record *
4228 btrfs_new_block_group_record(struct extent_buffer *leaf, struct btrfs_key *key,
4229                              int slot)
4230 {
4231         struct btrfs_block_group_item *ptr;
4232         struct block_group_record *rec;
4233
4234         rec = malloc(sizeof(*rec));
4235         if (!rec) {
4236                 fprintf(stderr, "memory allocation failed\n");
4237                 exit(-1);
4238         }
4239         memset(rec, 0, sizeof(*rec));
4240
4241         rec->cache.start = key->objectid;
4242         rec->cache.size = key->offset;
4243
4244         rec->generation = btrfs_header_generation(leaf);
4245
4246         rec->objectid = key->objectid;
4247         rec->type = key->type;
4248         rec->offset = key->offset;
4249
4250         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_block_group_item);
4251         rec->flags = btrfs_disk_block_group_flags(leaf, ptr);
4252
4253         INIT_LIST_HEAD(&rec->list);
4254
4255         return rec;
4256 }
4257
4258 static int process_block_group_item(struct block_group_tree *block_group_cache,
4259                                     struct btrfs_key *key,
4260                                     struct extent_buffer *eb, int slot)
4261 {
4262         struct block_group_record *rec;
4263         int ret = 0;
4264
4265         rec = btrfs_new_block_group_record(eb, key, slot);
4266         ret = insert_block_group_record(block_group_cache, rec);
4267         if (ret) {
4268                 fprintf(stderr, "Block Group[%llu, %llu] existed.\n",
4269                         rec->objectid, rec->offset);
4270                 free(rec);
4271         }
4272
4273         return ret;
4274 }
4275
4276 struct device_extent_record *
4277 btrfs_new_device_extent_record(struct extent_buffer *leaf,
4278                                struct btrfs_key *key, int slot)
4279 {
4280         struct device_extent_record *rec;
4281         struct btrfs_dev_extent *ptr;
4282
4283         rec = malloc(sizeof(*rec));
4284         if (!rec) {
4285                 fprintf(stderr, "memory allocation failed\n");
4286                 exit(-1);
4287         }
4288         memset(rec, 0, sizeof(*rec));
4289
4290         rec->cache.objectid = key->objectid;
4291         rec->cache.start = key->offset;
4292
4293         rec->generation = btrfs_header_generation(leaf);
4294
4295         rec->objectid = key->objectid;
4296         rec->type = key->type;
4297         rec->offset = key->offset;
4298
4299         ptr = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
4300         rec->chunk_objecteid =
4301                 btrfs_dev_extent_chunk_objectid(leaf, ptr);
4302         rec->chunk_offset =
4303                 btrfs_dev_extent_chunk_offset(leaf, ptr);
4304         rec->length = btrfs_dev_extent_length(leaf, ptr);
4305         rec->cache.size = rec->length;
4306
4307         INIT_LIST_HEAD(&rec->chunk_list);
4308         INIT_LIST_HEAD(&rec->device_list);
4309
4310         return rec;
4311 }
4312
4313 static int
4314 process_device_extent_item(struct device_extent_tree *dev_extent_cache,
4315                            struct btrfs_key *key, struct extent_buffer *eb,
4316                            int slot)
4317 {
4318         struct device_extent_record *rec;
4319         int ret;
4320
4321         rec = btrfs_new_device_extent_record(eb, key, slot);
4322         ret = insert_device_extent_record(dev_extent_cache, rec);
4323         if (ret) {
4324                 fprintf(stderr,
4325                         "Device extent[%llu, %llu, %llu] existed.\n",
4326                         rec->objectid, rec->offset, rec->length);
4327                 free(rec);
4328         }
4329
4330         return ret;
4331 }
4332
4333 static int process_extent_item(struct btrfs_root *root,
4334                                struct cache_tree *extent_cache,
4335                                struct extent_buffer *eb, int slot)
4336 {
4337         struct btrfs_extent_item *ei;
4338         struct btrfs_extent_inline_ref *iref;
4339         struct btrfs_extent_data_ref *dref;
4340         struct btrfs_shared_data_ref *sref;
4341         struct btrfs_key key;
4342         unsigned long end;
4343         unsigned long ptr;
4344         int type;
4345         u32 item_size = btrfs_item_size_nr(eb, slot);
4346         u64 refs = 0;
4347         u64 offset;
4348         u64 num_bytes;
4349         int metadata = 0;
4350
4351         btrfs_item_key_to_cpu(eb, &key, slot);
4352
4353         if (key.type == BTRFS_METADATA_ITEM_KEY) {
4354                 metadata = 1;
4355                 num_bytes = root->leafsize;
4356         } else {
4357                 num_bytes = key.offset;
4358         }
4359
4360         if (item_size < sizeof(*ei)) {
4361 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4362                 struct btrfs_extent_item_v0 *ei0;
4363                 BUG_ON(item_size != sizeof(*ei0));
4364                 ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
4365                 refs = btrfs_extent_refs_v0(eb, ei0);
4366 #else
4367                 BUG();
4368 #endif
4369                 return add_extent_rec(extent_cache, NULL, 0, key.objectid,
4370                                       num_bytes, refs, 0, 0, 0, metadata, 1,
4371                                       num_bytes);
4372         }
4373
4374         ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
4375         refs = btrfs_extent_refs(eb, ei);
4376
4377         add_extent_rec(extent_cache, NULL, 0, key.objectid, num_bytes,
4378                        refs, 0, 0, 0, metadata, 1, num_bytes);
4379
4380         ptr = (unsigned long)(ei + 1);
4381         if (btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK &&
4382             key.type == BTRFS_EXTENT_ITEM_KEY)
4383                 ptr += sizeof(struct btrfs_tree_block_info);
4384
4385         end = (unsigned long)ei + item_size;
4386         while (ptr < end) {
4387                 iref = (struct btrfs_extent_inline_ref *)ptr;
4388                 type = btrfs_extent_inline_ref_type(eb, iref);
4389                 offset = btrfs_extent_inline_ref_offset(eb, iref);
4390                 switch (type) {
4391                 case BTRFS_TREE_BLOCK_REF_KEY:
4392                         add_tree_backref(extent_cache, key.objectid,
4393                                          0, offset, 0);
4394                         break;
4395                 case BTRFS_SHARED_BLOCK_REF_KEY:
4396                         add_tree_backref(extent_cache, key.objectid,
4397                                          offset, 0, 0);
4398                         break;
4399                 case BTRFS_EXTENT_DATA_REF_KEY:
4400                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
4401                         add_data_backref(extent_cache, key.objectid, 0,
4402                                         btrfs_extent_data_ref_root(eb, dref),
4403                                         btrfs_extent_data_ref_objectid(eb,
4404                                                                        dref),
4405                                         btrfs_extent_data_ref_offset(eb, dref),
4406                                         btrfs_extent_data_ref_count(eb, dref),
4407                                         0, num_bytes);
4408                         break;
4409                 case BTRFS_SHARED_DATA_REF_KEY:
4410                         sref = (struct btrfs_shared_data_ref *)(iref + 1);
4411                         add_data_backref(extent_cache, key.objectid, offset,
4412                                         0, 0, 0,
4413                                         btrfs_shared_data_ref_count(eb, sref),
4414                                         0, num_bytes);
4415                         break;
4416                 default:
4417                         fprintf(stderr, "corrupt extent record: key %Lu %u %Lu\n",
4418                                 key.objectid, key.type, num_bytes);
4419                         goto out;
4420                 }
4421                 ptr += btrfs_extent_inline_ref_size(type);
4422         }
4423         WARN_ON(ptr > end);
4424 out:
4425         return 0;
4426 }
4427
4428 static int check_cache_range(struct btrfs_root *root,
4429                              struct btrfs_block_group_cache *cache,
4430                              u64 offset, u64 bytes)
4431 {
4432         struct btrfs_free_space *entry;
4433         u64 *logical;
4434         u64 bytenr;
4435         int stripe_len;
4436         int i, nr, ret;
4437
4438         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4439                 bytenr = btrfs_sb_offset(i);
4440                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
4441                                        cache->key.objectid, bytenr, 0,
4442                                        &logical, &nr, &stripe_len);
4443                 if (ret)
4444                         return ret;
4445
4446                 while (nr--) {
4447                         if (logical[nr] + stripe_len <= offset)
4448                                 continue;
4449                         if (offset + bytes <= logical[nr])
4450                                 continue;
4451                         if (logical[nr] == offset) {
4452                                 if (stripe_len >= bytes) {
4453                                         kfree(logical);
4454                                         return 0;
4455                                 }
4456                                 bytes -= stripe_len;
4457                                 offset += stripe_len;
4458                         } else if (logical[nr] < offset) {
4459                                 if (logical[nr] + stripe_len >=
4460                                     offset + bytes) {
4461                                         kfree(logical);
4462                                         return 0;
4463                                 }
4464                                 bytes = (offset + bytes) -
4465                                         (logical[nr] + stripe_len);
4466                                 offset = logical[nr] + stripe_len;
4467                         } else {
4468                                 /*
4469                                  * Could be tricky, the super may land in the
4470                                  * middle of the area we're checking.  First
4471                                  * check the easiest case, it's at the end.
4472                                  */
4473                                 if (logical[nr] + stripe_len >=
4474                                     bytes + offset) {
4475                                         bytes = logical[nr] - offset;
4476                                         continue;
4477                                 }
4478
4479                                 /* Check the left side */
4480                                 ret = check_cache_range(root, cache,
4481                                                         offset,
4482                                                         logical[nr] - offset);
4483                                 if (ret) {
4484                                         kfree(logical);
4485                                         return ret;
4486                                 }
4487
4488                                 /* Now we continue with the right side */
4489                                 bytes = (offset + bytes) -
4490                                         (logical[nr] + stripe_len);
4491                                 offset = logical[nr] + stripe_len;
4492                         }
4493                 }
4494
4495                 kfree(logical);
4496         }
4497
4498         entry = btrfs_find_free_space(cache->free_space_ctl, offset, bytes);
4499         if (!entry) {
4500                 fprintf(stderr, "There is no free space entry for %Lu-%Lu\n",
4501                         offset, offset+bytes);
4502                 return -EINVAL;
4503         }
4504
4505         if (entry->offset != offset) {
4506                 fprintf(stderr, "Wanted offset %Lu, found %Lu\n", offset,
4507                         entry->offset);
4508                 return -EINVAL;
4509         }
4510
4511         if (entry->bytes != bytes) {
4512                 fprintf(stderr, "Wanted bytes %Lu, found %Lu for off %Lu\n",
4513                         bytes, entry->bytes, offset);
4514                 return -EINVAL;
4515         }
4516
4517         unlink_free_space(cache->free_space_ctl, entry);
4518         free(entry);
4519         return 0;
4520 }
4521
4522 static int verify_space_cache(struct btrfs_root *root,
4523                               struct btrfs_block_group_cache *cache)
4524 {
4525         struct btrfs_path *path;
4526         struct extent_buffer *leaf;
4527         struct btrfs_key key;
4528         u64 last;
4529         int ret = 0;
4530
4531         path = btrfs_alloc_path();
4532         if (!path)
4533                 return -ENOMEM;
4534
4535         root = root->fs_info->extent_root;
4536
4537         last = max_t(u64, cache->key.objectid, BTRFS_SUPER_INFO_OFFSET);
4538
4539         key.objectid = last;
4540         key.offset = 0;
4541         key.type = BTRFS_EXTENT_ITEM_KEY;
4542
4543         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4544         if (ret < 0)
4545                 goto out;
4546         ret = 0;
4547         while (1) {
4548                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4549                         ret = btrfs_next_leaf(root, path);
4550                         if (ret < 0)
4551                                 goto out;
4552                         if (ret > 0) {
4553                                 ret = 0;
4554                                 break;
4555                         }
4556                 }
4557                 leaf = path->nodes[0];
4558                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4559                 if (key.objectid >= cache->key.offset + cache->key.objectid)
4560                         break;
4561                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
4562                     key.type != BTRFS_METADATA_ITEM_KEY) {
4563                         path->slots[0]++;
4564                         continue;
4565                 }
4566
4567                 if (last == key.objectid) {
4568                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
4569                                 last = key.objectid + key.offset;
4570                         else
4571                                 last = key.objectid + root->leafsize;
4572                         path->slots[0]++;
4573                         continue;
4574                 }
4575
4576                 ret = check_cache_range(root, cache, last,
4577                                         key.objectid - last);
4578                 if (ret)
4579                         break;
4580                 if (key.type == BTRFS_EXTENT_ITEM_KEY)
4581                         last = key.objectid + key.offset;
4582                 else
4583                         last = key.objectid + root->leafsize;
4584                 path->slots[0]++;
4585         }
4586
4587         if (last < cache->key.objectid + cache->key.offset)
4588                 ret = check_cache_range(root, cache, last,
4589                                         cache->key.objectid +
4590                                         cache->key.offset - last);
4591
4592 out:
4593         btrfs_free_path(path);
4594
4595         if (!ret &&
4596             !RB_EMPTY_ROOT(&cache->free_space_ctl->free_space_offset)) {
4597                 fprintf(stderr, "There are still entries left in the space "
4598                         "cache\n");
4599                 ret = -EINVAL;
4600         }
4601
4602         return ret;
4603 }
4604
4605 static int check_space_cache(struct btrfs_root *root)
4606 {
4607         struct btrfs_block_group_cache *cache;
4608         u64 start = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
4609         int ret;
4610         int error = 0;
4611
4612         if (btrfs_super_cache_generation(root->fs_info->super_copy) != -1ULL &&
4613             btrfs_super_generation(root->fs_info->super_copy) !=
4614             btrfs_super_cache_generation(root->fs_info->super_copy)) {
4615                 printf("cache and super generation don't match, space cache "
4616                        "will be invalidated\n");
4617                 return 0;
4618         }
4619
4620         while (1) {
4621                 cache = btrfs_lookup_first_block_group(root->fs_info, start);
4622                 if (!cache)
4623                         break;
4624
4625                 start = cache->key.objectid + cache->key.offset;
4626                 if (!cache->free_space_ctl) {
4627                         if (btrfs_init_free_space_ctl(cache,
4628                                                       root->sectorsize)) {
4629                                 ret = -ENOMEM;
4630                                 break;
4631                         }
4632                 } else {
4633                         btrfs_remove_free_space_cache(cache);
4634                 }
4635
4636                 ret = load_free_space_cache(root->fs_info, cache);
4637                 if (!ret)
4638                         continue;
4639
4640                 ret = verify_space_cache(root, cache);
4641                 if (ret) {
4642                         fprintf(stderr, "cache appears valid but isnt %Lu\n",
4643                                 cache->key.objectid);
4644                         error++;
4645                 }
4646         }
4647
4648         return error ? -EINVAL : 0;
4649 }
4650
4651 static int read_extent_data(struct btrfs_root *root, char *data,
4652                         u64 logical, u64 *len, int mirror)
4653 {
4654         u64 offset = 0;
4655         struct btrfs_multi_bio *multi = NULL;
4656         struct btrfs_fs_info *info = root->fs_info;
4657         struct btrfs_device *device;
4658         int ret = 0;
4659         u64 max_len = *len;
4660
4661         ret = btrfs_map_block(&info->mapping_tree, READ, logical, len,
4662                               &multi, mirror, NULL);
4663         if (ret) {
4664                 fprintf(stderr, "Couldn't map the block %llu\n",
4665                                 logical + offset);
4666                 goto err;
4667         }
4668         device = multi->stripes[0].dev;
4669
4670         if (device->fd == 0)
4671                 goto err;
4672         if (*len > max_len)
4673                 *len = max_len;
4674
4675         ret = pread64(device->fd, data, *len, multi->stripes[0].physical);
4676         if (ret != *len)
4677                 ret = -EIO;
4678         else
4679                 ret = 0;
4680 err:
4681         kfree(multi);
4682         return ret;
4683 }
4684
4685 static int check_extent_csums(struct btrfs_root *root, u64 bytenr,
4686                         u64 num_bytes, unsigned long leaf_offset,
4687                         struct extent_buffer *eb) {
4688
4689         u64 offset = 0;
4690         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
4691         char *data;
4692         unsigned long csum_offset;
4693         u32 csum;
4694         u32 csum_expected;
4695         u64 read_len;
4696         u64 data_checked = 0;
4697         u64 tmp;
4698         int ret = 0;
4699         int mirror;
4700         int num_copies;
4701
4702         if (num_bytes % root->sectorsize)
4703                 return -EINVAL;
4704
4705         data = malloc(num_bytes);
4706         if (!data)
4707                 return -ENOMEM;
4708
4709         while (offset < num_bytes) {
4710                 mirror = 0;
4711 again:
4712                 read_len = num_bytes - offset;
4713                 /* read as much space once a time */
4714                 ret = read_extent_data(root, data + offset,
4715                                 bytenr + offset, &read_len, mirror);
4716                 if (ret)
4717                         goto out;
4718                 data_checked = 0;
4719                 /* verify every 4k data's checksum */
4720                 while (data_checked < read_len) {
4721                         csum = ~(u32)0;
4722                         tmp = offset + data_checked;
4723
4724                         csum = btrfs_csum_data(NULL, (char *)data + tmp,
4725                                                csum, root->sectorsize);
4726                         btrfs_csum_final(csum, (char *)&csum);
4727
4728                         csum_offset = leaf_offset +
4729                                  tmp / root->sectorsize * csum_size;
4730                         read_extent_buffer(eb, (char *)&csum_expected,
4731                                            csum_offset, csum_size);
4732                         /* try another mirror */
4733                         if (csum != csum_expected) {
4734                                 fprintf(stderr, "mirror %d bytenr %llu csum %u expected csum %u\n",
4735                                                 mirror, bytenr + tmp,
4736                                                 csum, csum_expected);
4737                                 num_copies = btrfs_num_copies(
4738                                                 &root->fs_info->mapping_tree,
4739                                                 bytenr, num_bytes);
4740                                 if (mirror < num_copies - 1) {
4741                                         mirror += 1;
4742                                         goto again;
4743                                 }
4744                         }
4745                         data_checked += root->sectorsize;
4746                 }
4747                 offset += read_len;
4748         }
4749 out:
4750         free(data);
4751         return ret;
4752 }
4753
4754 static int check_extent_exists(struct btrfs_root *root, u64 bytenr,
4755                                u64 num_bytes)
4756 {
4757         struct btrfs_path *path;
4758         struct extent_buffer *leaf;
4759         struct btrfs_key key;
4760         int ret;
4761
4762         path = btrfs_alloc_path();
4763         if (!path) {
4764                 fprintf(stderr, "Error allocing path\n");
4765                 return -ENOMEM;
4766         }
4767
4768         key.objectid = bytenr;
4769         key.type = BTRFS_EXTENT_ITEM_KEY;
4770         key.offset = (u64)-1;
4771
4772 again:
4773         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
4774                                 0, 0);
4775         if (ret < 0) {
4776                 fprintf(stderr, "Error looking up extent record %d\n", ret);
4777                 btrfs_free_path(path);
4778                 return ret;
4779         } else if (ret) {
4780                 if (path->slots[0] > 0) {
4781                         path->slots[0]--;
4782                 } else {
4783                         ret = btrfs_prev_leaf(root, path);
4784                         if (ret < 0) {
4785                                 goto out;
4786                         } else if (ret > 0) {
4787                                 ret = 0;
4788                                 goto out;
4789                         }
4790                 }
4791         }
4792
4793         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4794
4795         /*
4796          * Block group items come before extent items if they have the same
4797          * bytenr, so walk back one more just in case.  Dear future traveler,
4798          * first congrats on mastering time travel.  Now if it's not too much
4799          * trouble could you go back to 2006 and tell Chris to make the
4800          * BLOCK_GROUP_ITEM_KEY (and BTRFS_*_REF_KEY) lower than the
4801          * EXTENT_ITEM_KEY please?
4802          */
4803         while (key.type > BTRFS_EXTENT_ITEM_KEY) {
4804                 if (path->slots[0] > 0) {
4805                         path->slots[0]--;
4806                 } else {
4807                         ret = btrfs_prev_leaf(root, path);
4808                         if (ret < 0) {
4809                                 goto out;
4810                         } else if (ret > 0) {
4811                                 ret = 0;
4812                                 goto out;
4813                         }
4814                 }
4815                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4816         }
4817
4818         while (num_bytes) {
4819                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4820                         ret = btrfs_next_leaf(root, path);
4821                         if (ret < 0) {
4822                                 fprintf(stderr, "Error going to next leaf "
4823                                         "%d\n", ret);
4824                                 btrfs_free_path(path);
4825                                 return ret;
4826                         } else if (ret) {
4827                                 break;
4828                         }
4829                 }
4830                 leaf = path->nodes[0];
4831                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4832                 if (key.type != BTRFS_EXTENT_ITEM_KEY) {
4833                         path->slots[0]++;
4834                         continue;
4835                 }
4836                 if (key.objectid + key.offset < bytenr) {
4837                         path->slots[0]++;
4838                         continue;
4839                 }
4840                 if (key.objectid > bytenr + num_bytes)
4841                         break;
4842
4843                 if (key.objectid == bytenr) {
4844                         if (key.offset >= num_bytes) {
4845                                 num_bytes = 0;
4846                                 break;
4847                         }
4848                         num_bytes -= key.offset;
4849                         bytenr += key.offset;
4850                 } else if (key.objectid < bytenr) {
4851                         if (key.objectid + key.offset >= bytenr + num_bytes) {
4852                                 num_bytes = 0;
4853                                 break;
4854                         }
4855                         num_bytes = (bytenr + num_bytes) -
4856                                 (key.objectid + key.offset);
4857                         bytenr = key.objectid + key.offset;
4858                 } else {
4859                         if (key.objectid + key.offset < bytenr + num_bytes) {
4860                                 u64 new_start = key.objectid + key.offset;
4861                                 u64 new_bytes = bytenr + num_bytes - new_start;
4862
4863                                 /*
4864                                  * Weird case, the extent is in the middle of
4865                                  * our range, we'll have to search one side
4866                                  * and then the other.  Not sure if this happens
4867                                  * in real life, but no harm in coding it up
4868                                  * anyway just in case.
4869                                  */
4870                                 btrfs_release_path(path);
4871                                 ret = check_extent_exists(root, new_start,
4872                                                           new_bytes);
4873                                 if (ret) {
4874                                         fprintf(stderr, "Right section didn't "
4875                                                 "have a record\n");
4876                                         break;
4877                                 }
4878                                 num_bytes = key.objectid - bytenr;
4879                                 goto again;
4880                         }
4881                         num_bytes = key.objectid - bytenr;
4882                 }
4883                 path->slots[0]++;
4884         }
4885         ret = 0;
4886
4887 out:
4888         if (num_bytes && !ret) {
4889                 fprintf(stderr, "There are no extents for csum range "
4890                         "%Lu-%Lu\n", bytenr, bytenr+num_bytes);
4891                 ret = 1;
4892         }
4893
4894         btrfs_free_path(path);
4895         return ret;
4896 }
4897
4898 static int check_csums(struct btrfs_root *root)
4899 {
4900         struct btrfs_path *path;
4901         struct extent_buffer *leaf;
4902         struct btrfs_key key;
4903         u64 offset = 0, num_bytes = 0;
4904         u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
4905         int errors = 0;
4906         int ret;
4907         u64 data_len;
4908         unsigned long leaf_offset;
4909
4910         root = root->fs_info->csum_root;
4911         if (!extent_buffer_uptodate(root->node)) {
4912                 fprintf(stderr, "No valid csum tree found\n");
4913                 return -ENOENT;
4914         }
4915
4916         key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
4917         key.type = BTRFS_EXTENT_CSUM_KEY;
4918         key.offset = 0;
4919
4920         path = btrfs_alloc_path();
4921         if (!path)
4922                 return -ENOMEM;
4923
4924         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4925         if (ret < 0) {
4926                 fprintf(stderr, "Error searching csum tree %d\n", ret);
4927                 btrfs_free_path(path);
4928                 return ret;
4929         }
4930
4931         if (ret > 0 && path->slots[0])
4932                 path->slots[0]--;
4933         ret = 0;
4934
4935         while (1) {
4936                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4937                         ret = btrfs_next_leaf(root, path);
4938                         if (ret < 0) {
4939                                 fprintf(stderr, "Error going to next leaf "
4940                                         "%d\n", ret);
4941                                 break;
4942                         }
4943                         if (ret)
4944                                 break;
4945                 }
4946                 leaf = path->nodes[0];
4947
4948                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4949                 if (key.type != BTRFS_EXTENT_CSUM_KEY) {
4950                         path->slots[0]++;
4951                         continue;
4952                 }
4953
4954                 data_len = (btrfs_item_size_nr(leaf, path->slots[0]) /
4955                               csum_size) * root->sectorsize;
4956                 if (!check_data_csum)
4957                         goto skip_csum_check;
4958                 leaf_offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
4959                 ret = check_extent_csums(root, key.offset, data_len,
4960                                          leaf_offset, leaf);
4961                 if (ret)
4962                         break;
4963 skip_csum_check:
4964                 if (!num_bytes) {
4965                         offset = key.offset;
4966                 } else if (key.offset != offset + num_bytes) {
4967                         ret = check_extent_exists(root, offset, num_bytes);
4968                         if (ret) {
4969                                 fprintf(stderr, "Csum exists for %Lu-%Lu but "
4970                                         "there is no extent record\n",
4971                                         offset, offset+num_bytes);
4972                                 errors++;
4973                         }
4974                         offset = key.offset;
4975                         num_bytes = 0;
4976                 }
4977                 num_bytes += data_len;
4978                 path->slots[0]++;
4979         }
4980
4981         btrfs_free_path(path);
4982         return errors;
4983 }
4984
4985 static int is_dropped_key(struct btrfs_key *key,
4986                           struct btrfs_key *drop_key) {
4987         if (key->objectid < drop_key->objectid)
4988                 return 1;
4989         else if (key->objectid == drop_key->objectid) {
4990                 if (key->type < drop_key->type)
4991                         return 1;
4992                 else if (key->type == drop_key->type) {
4993                         if (key->offset < drop_key->offset)
4994                                 return 1;
4995                 }
4996         }
4997         return 0;
4998 }
4999
5000 static int run_next_block(struct btrfs_trans_handle *trans,
5001                           struct btrfs_root *root,
5002                           struct block_info *bits,
5003                           int bits_nr,
5004                           u64 *last,
5005                           struct cache_tree *pending,
5006                           struct cache_tree *seen,
5007                           struct cache_tree *reada,
5008                           struct cache_tree *nodes,
5009                           struct cache_tree *extent_cache,
5010                           struct cache_tree *chunk_cache,
5011                           struct rb_root *dev_cache,
5012                           struct block_group_tree *block_group_cache,
5013                           struct device_extent_tree *dev_extent_cache,
5014                           struct btrfs_root_item *ri)
5015 {
5016         struct extent_buffer *buf;
5017         u64 bytenr;
5018         u32 size;
5019         u64 parent;
5020         u64 owner;
5021         u64 flags;
5022         u64 ptr;
5023         u64 gen = 0;
5024         int ret = 0;
5025         int i;
5026         int nritems;
5027         struct btrfs_key key;
5028         struct cache_extent *cache;
5029         int reada_bits;
5030
5031         nritems = pick_next_pending(pending, reada, nodes, *last, bits,
5032                                     bits_nr, &reada_bits);
5033         if (nritems == 0)
5034                 return 1;
5035
5036         if (!reada_bits) {
5037                 for(i = 0; i < nritems; i++) {
5038                         ret = add_cache_extent(reada, bits[i].start,
5039                                                bits[i].size);
5040                         if (ret == -EEXIST)
5041                                 continue;
5042
5043                         /* fixme, get the parent transid */
5044                         readahead_tree_block(root, bits[i].start,
5045                                              bits[i].size, 0);
5046                 }
5047         }
5048         *last = bits[0].start;
5049         bytenr = bits[0].start;
5050         size = bits[0].size;
5051
5052         cache = lookup_cache_extent(pending, bytenr, size);
5053         if (cache) {
5054                 remove_cache_extent(pending, cache);
5055                 free(cache);
5056         }
5057         cache = lookup_cache_extent(reada, bytenr, size);
5058         if (cache) {
5059                 remove_cache_extent(reada, cache);
5060                 free(cache);
5061         }
5062         cache = lookup_cache_extent(nodes, bytenr, size);
5063         if (cache) {
5064                 remove_cache_extent(nodes, cache);
5065                 free(cache);
5066         }
5067         cache = lookup_cache_extent(extent_cache, bytenr, size);
5068         if (cache) {
5069                 struct extent_record *rec;
5070
5071                 rec = container_of(cache, struct extent_record, cache);
5072                 gen = rec->parent_generation;
5073         }
5074
5075         /* fixme, get the real parent transid */
5076         buf = read_tree_block(root, bytenr, size, gen);
5077         if (!extent_buffer_uptodate(buf)) {
5078                 record_bad_block_io(root->fs_info,
5079                                     extent_cache, bytenr, size);
5080                 goto out;
5081         }
5082
5083         nritems = btrfs_header_nritems(buf);
5084
5085         /*
5086          * FIXME, this only works only if we don't have any full
5087          * backref mode.
5088          */
5089         if (!init_extent_tree) {
5090                 ret = btrfs_lookup_extent_info(NULL, root, bytenr,
5091                                        btrfs_header_level(buf), 1, NULL,
5092                                        &flags);
5093                 if (ret < 0)
5094                         flags = 0;
5095         } else {
5096                 flags = 0;
5097         }
5098
5099         if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5100                 parent = bytenr;
5101                 owner = 0;
5102         } else {
5103                 parent = 0;
5104                 owner = btrfs_header_owner(buf);
5105         }
5106
5107         ret = check_block(trans, root, extent_cache, buf, flags);
5108         if (ret)
5109                 goto out;
5110
5111         if (btrfs_is_leaf(buf)) {
5112                 btree_space_waste += btrfs_leaf_free_space(root, buf);
5113                 for (i = 0; i < nritems; i++) {
5114                         struct btrfs_file_extent_item *fi;
5115                         btrfs_item_key_to_cpu(buf, &key, i);
5116                         if (key.type == BTRFS_EXTENT_ITEM_KEY) {
5117                                 process_extent_item(root, extent_cache, buf,
5118                                                     i);
5119                                 continue;
5120                         }
5121                         if (key.type == BTRFS_METADATA_ITEM_KEY) {
5122                                 process_extent_item(root, extent_cache, buf,
5123                                                     i);
5124                                 continue;
5125                         }
5126                         if (key.type == BTRFS_EXTENT_CSUM_KEY) {
5127                                 total_csum_bytes +=
5128                                         btrfs_item_size_nr(buf, i);
5129                                 continue;
5130                         }
5131                         if (key.type == BTRFS_CHUNK_ITEM_KEY) {
5132                                 process_chunk_item(chunk_cache, &key, buf, i);
5133                                 continue;
5134                         }
5135                         if (key.type == BTRFS_DEV_ITEM_KEY) {
5136                                 process_device_item(dev_cache, &key, buf, i);
5137                                 continue;
5138                         }
5139                         if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5140                                 process_block_group_item(block_group_cache,
5141                                         &key, buf, i);
5142                                 continue;
5143                         }
5144                         if (key.type == BTRFS_DEV_EXTENT_KEY) {
5145                                 process_device_extent_item(dev_extent_cache,
5146                                         &key, buf, i);
5147                                 continue;
5148
5149                         }
5150                         if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
5151 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5152                                 process_extent_ref_v0(extent_cache, buf, i);
5153 #else
5154                                 BUG();
5155 #endif
5156                                 continue;
5157                         }
5158
5159                         if (key.type == BTRFS_TREE_BLOCK_REF_KEY) {
5160                                 add_tree_backref(extent_cache, key.objectid, 0,
5161                                                  key.offset, 0);
5162                                 continue;
5163                         }
5164                         if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
5165                                 add_tree_backref(extent_cache, key.objectid,
5166                                                  key.offset, 0, 0);
5167                                 continue;
5168                         }
5169                         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
5170                                 struct btrfs_extent_data_ref *ref;
5171                                 ref = btrfs_item_ptr(buf, i,
5172                                                 struct btrfs_extent_data_ref);
5173                                 add_data_backref(extent_cache,
5174                                         key.objectid, 0,
5175                                         btrfs_extent_data_ref_root(buf, ref),
5176                                         btrfs_extent_data_ref_objectid(buf,
5177                                                                        ref),
5178                                         btrfs_extent_data_ref_offset(buf, ref),
5179                                         btrfs_extent_data_ref_count(buf, ref),
5180                                         0, root->sectorsize);
5181                                 continue;
5182                         }
5183                         if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
5184                                 struct btrfs_shared_data_ref *ref;
5185                                 ref = btrfs_item_ptr(buf, i,
5186                                                 struct btrfs_shared_data_ref);
5187                                 add_data_backref(extent_cache,
5188                                         key.objectid, key.offset, 0, 0, 0,
5189                                         btrfs_shared_data_ref_count(buf, ref),
5190                                         0, root->sectorsize);
5191                                 continue;
5192                         }
5193                         if (key.type == BTRFS_ORPHAN_ITEM_KEY) {
5194                                 struct bad_item *bad;
5195
5196                                 if (key.objectid == BTRFS_ORPHAN_OBJECTID)
5197                                         continue;
5198                                 if (!owner)
5199                                         continue;
5200                                 bad = malloc(sizeof(struct bad_item));
5201                                 if (!bad)
5202                                         continue;
5203                                 INIT_LIST_HEAD(&bad->list);
5204                                 memcpy(&bad->key, &key,
5205                                        sizeof(struct btrfs_key));
5206                                 bad->root_id = owner;
5207                                 list_add_tail(&bad->list, &delete_items);
5208                                 continue;
5209                         }
5210                         if (key.type != BTRFS_EXTENT_DATA_KEY)
5211                                 continue;
5212                         fi = btrfs_item_ptr(buf, i,
5213                                             struct btrfs_file_extent_item);
5214                         if (btrfs_file_extent_type(buf, fi) ==
5215                             BTRFS_FILE_EXTENT_INLINE)
5216                                 continue;
5217                         if (btrfs_file_extent_disk_bytenr(buf, fi) == 0)
5218                                 continue;
5219
5220                         data_bytes_allocated +=
5221                                 btrfs_file_extent_disk_num_bytes(buf, fi);
5222                         if (data_bytes_allocated < root->sectorsize) {
5223                                 abort();
5224                         }
5225                         data_bytes_referenced +=
5226                                 btrfs_file_extent_num_bytes(buf, fi);
5227                         add_data_backref(extent_cache,
5228                                 btrfs_file_extent_disk_bytenr(buf, fi),
5229                                 parent, owner, key.objectid, key.offset -
5230                                 btrfs_file_extent_offset(buf, fi), 1, 1,
5231                                 btrfs_file_extent_disk_num_bytes(buf, fi));
5232                 }
5233         } else {
5234                 int level;
5235                 struct btrfs_key first_key;
5236
5237                 first_key.objectid = 0;
5238
5239                 if (nritems > 0)
5240                         btrfs_item_key_to_cpu(buf, &first_key, 0);
5241                 level = btrfs_header_level(buf);
5242                 for (i = 0; i < nritems; i++) {
5243                         ptr = btrfs_node_blockptr(buf, i);
5244                         size = btrfs_level_size(root, level - 1);
5245                         btrfs_node_key_to_cpu(buf, &key, i);
5246                         if (ri != NULL) {
5247                                 struct btrfs_key drop_key;
5248                                 btrfs_disk_key_to_cpu(&drop_key,
5249                                                       &ri->drop_progress);
5250                                 if ((level == ri->drop_level)
5251                                     && is_dropped_key(&key, &drop_key)) {
5252                                         continue;
5253                                 }
5254                         }
5255                         ret = add_extent_rec(extent_cache, &key,
5256                                              btrfs_node_ptr_generation(buf, i),
5257                                              ptr, size, 0, 0, 1, 0, 1, 0,
5258                                              size);
5259                         BUG_ON(ret);
5260
5261                         add_tree_backref(extent_cache, ptr, parent, owner, 1);
5262
5263                         if (level > 1) {
5264                                 add_pending(nodes, seen, ptr, size);
5265                         } else {
5266                                 add_pending(pending, seen, ptr, size);
5267                         }
5268                 }
5269                 btree_space_waste += (BTRFS_NODEPTRS_PER_BLOCK(root) -
5270                                       nritems) * sizeof(struct btrfs_key_ptr);
5271         }
5272         total_btree_bytes += buf->len;
5273         if (fs_root_objectid(btrfs_header_owner(buf)))
5274                 total_fs_tree_bytes += buf->len;
5275         if (btrfs_header_owner(buf) == BTRFS_EXTENT_TREE_OBJECTID)
5276                 total_extent_tree_bytes += buf->len;
5277         if (!found_old_backref &&
5278             btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID &&
5279             btrfs_header_backref_rev(buf) == BTRFS_MIXED_BACKREF_REV &&
5280             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
5281                 found_old_backref = 1;
5282 out:
5283         free_extent_buffer(buf);
5284         return ret;
5285 }
5286
5287 static int add_root_to_pending(struct extent_buffer *buf,
5288                                struct cache_tree *extent_cache,
5289                                struct cache_tree *pending,
5290                                struct cache_tree *seen,
5291                                struct cache_tree *nodes,
5292                                struct btrfs_key *root_key)
5293 {
5294         if (btrfs_header_level(buf) > 0)
5295                 add_pending(nodes, seen, buf->start, buf->len);
5296         else
5297                 add_pending(pending, seen, buf->start, buf->len);
5298         add_extent_rec(extent_cache, NULL, 0, buf->start, buf->len,
5299                        0, 1, 1, 0, 1, 0, buf->len);
5300
5301         if (root_key->objectid == BTRFS_TREE_RELOC_OBJECTID ||
5302             btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
5303                 add_tree_backref(extent_cache, buf->start, buf->start,
5304                                  0, 1);
5305         else
5306                 add_tree_backref(extent_cache, buf->start, 0,
5307                                  root_key->objectid, 1);
5308         return 0;
5309 }
5310
5311 /* as we fix the tree, we might be deleting blocks that
5312  * we're tracking for repair.  This hook makes sure we
5313  * remove any backrefs for blocks as we are fixing them.
5314  */
5315 static int free_extent_hook(struct btrfs_trans_handle *trans,
5316                             struct btrfs_root *root,
5317                             u64 bytenr, u64 num_bytes, u64 parent,
5318                             u64 root_objectid, u64 owner, u64 offset,
5319                             int refs_to_drop)
5320 {
5321         struct extent_record *rec;
5322         struct cache_extent *cache;
5323         int is_data;
5324         struct cache_tree *extent_cache = root->fs_info->fsck_extent_cache;
5325
5326         is_data = owner >= BTRFS_FIRST_FREE_OBJECTID;
5327         cache = lookup_cache_extent(extent_cache, bytenr, num_bytes);
5328         if (!cache)
5329                 return 0;
5330
5331         rec = container_of(cache, struct extent_record, cache);
5332         if (is_data) {
5333                 struct data_backref *back;
5334                 back = find_data_backref(rec, parent, root_objectid, owner,
5335                                          offset, 1, bytenr, num_bytes);
5336                 if (!back)
5337                         goto out;
5338                 if (back->node.found_ref) {
5339                         back->found_ref -= refs_to_drop;
5340                         if (rec->refs)
5341                                 rec->refs -= refs_to_drop;
5342                 }
5343                 if (back->node.found_extent_tree) {
5344                         back->num_refs -= refs_to_drop;
5345                         if (rec->extent_item_refs)
5346                                 rec->extent_item_refs -= refs_to_drop;
5347                 }
5348                 if (back->found_ref == 0)
5349                         back->node.found_ref = 0;
5350                 if (back->num_refs == 0)
5351                         back->node.found_extent_tree = 0;
5352
5353                 if (!back->node.found_extent_tree && back->node.found_ref) {
5354                         list_del(&back->node.list);
5355                         free(back);
5356                 }
5357         } else {
5358                 struct tree_backref *back;
5359                 back = find_tree_backref(rec, parent, root_objectid);
5360                 if (!back)
5361                         goto out;
5362                 if (back->node.found_ref) {
5363                         if (rec->refs)
5364                                 rec->refs--;
5365                         back->node.found_ref = 0;
5366                 }
5367                 if (back->node.found_extent_tree) {
5368                         if (rec->extent_item_refs)
5369                                 rec->extent_item_refs--;
5370                         back->node.found_extent_tree = 0;
5371                 }
5372                 if (!back->node.found_extent_tree && back->node.found_ref) {
5373                         list_del(&back->node.list);
5374                         free(back);
5375                 }
5376         }
5377         maybe_free_extent_rec(extent_cache, rec);
5378 out:
5379         return 0;
5380 }
5381
5382 static int delete_extent_records(struct btrfs_trans_handle *trans,
5383                                  struct btrfs_root *root,
5384                                  struct btrfs_path *path,
5385                                  u64 bytenr, u64 new_len)
5386 {
5387         struct btrfs_key key;
5388         struct btrfs_key found_key;
5389         struct extent_buffer *leaf;
5390         int ret;
5391         int slot;
5392
5393
5394         key.objectid = bytenr;
5395         key.type = (u8)-1;
5396         key.offset = (u64)-1;
5397
5398         while(1) {
5399                 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
5400                                         &key, path, 0, 1);
5401                 if (ret < 0)
5402                         break;
5403
5404                 if (ret > 0) {
5405                         ret = 0;
5406                         if (path->slots[0] == 0)
5407                                 break;
5408                         path->slots[0]--;
5409                 }
5410                 ret = 0;
5411
5412                 leaf = path->nodes[0];
5413                 slot = path->slots[0];
5414
5415                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5416                 if (found_key.objectid != bytenr)
5417                         break;
5418
5419                 if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
5420                     found_key.type != BTRFS_METADATA_ITEM_KEY &&
5421                     found_key.type != BTRFS_TREE_BLOCK_REF_KEY &&
5422                     found_key.type != BTRFS_EXTENT_DATA_REF_KEY &&
5423                     found_key.type != BTRFS_EXTENT_REF_V0_KEY &&
5424                     found_key.type != BTRFS_SHARED_BLOCK_REF_KEY &&
5425                     found_key.type != BTRFS_SHARED_DATA_REF_KEY) {
5426                         btrfs_release_path(path);
5427                         if (found_key.type == 0) {
5428                                 if (found_key.offset == 0)
5429                                         break;
5430                                 key.offset = found_key.offset - 1;
5431                                 key.type = found_key.type;
5432                         }
5433                         key.type = found_key.type - 1;
5434                         key.offset = (u64)-1;
5435                         continue;
5436                 }
5437
5438                 fprintf(stderr, "repair deleting extent record: key %Lu %u %Lu\n",
5439                         found_key.objectid, found_key.type, found_key.offset);
5440
5441                 ret = btrfs_del_item(trans, root->fs_info->extent_root, path);
5442                 if (ret)
5443                         break;
5444                 btrfs_release_path(path);
5445
5446                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5447                     found_key.type == BTRFS_METADATA_ITEM_KEY) {
5448                         u64 bytes = (found_key.type == BTRFS_EXTENT_ITEM_KEY) ?
5449                                 found_key.offset : root->leafsize;
5450
5451                         ret = btrfs_update_block_group(trans, root, bytenr,
5452                                                        bytes, 0, 0);
5453                         if (ret)
5454                                 break;
5455                 }
5456         }
5457
5458         btrfs_release_path(path);
5459         return ret;
5460 }
5461
5462 /*
5463  * for a single backref, this will allocate a new extent
5464  * and add the backref to it.
5465  */
5466 static int record_extent(struct btrfs_trans_handle *trans,
5467                          struct btrfs_fs_info *info,
5468                          struct btrfs_path *path,
5469                          struct extent_record *rec,
5470                          struct extent_backref *back,
5471                          int allocated, u64 flags)
5472 {
5473         int ret;
5474         struct btrfs_root *extent_root = info->extent_root;
5475         struct extent_buffer *leaf;
5476         struct btrfs_key ins_key;
5477         struct btrfs_extent_item *ei;
5478         struct tree_backref *tback;
5479         struct data_backref *dback;
5480         struct btrfs_tree_block_info *bi;
5481
5482         if (!back->is_data)
5483                 rec->max_size = max_t(u64, rec->max_size,
5484                                     info->extent_root->leafsize);
5485
5486         if (!allocated) {
5487                 u32 item_size = sizeof(*ei);
5488
5489                 if (!back->is_data)
5490                         item_size += sizeof(*bi);
5491
5492                 ins_key.objectid = rec->start;
5493                 ins_key.offset = rec->max_size;
5494                 ins_key.type = BTRFS_EXTENT_ITEM_KEY;
5495
5496                 ret = btrfs_insert_empty_item(trans, extent_root, path,
5497                                         &ins_key, item_size);
5498                 if (ret)
5499                         goto fail;
5500
5501                 leaf = path->nodes[0];
5502                 ei = btrfs_item_ptr(leaf, path->slots[0],
5503                                     struct btrfs_extent_item);
5504
5505                 btrfs_set_extent_refs(leaf, ei, 0);
5506                 btrfs_set_extent_generation(leaf, ei, rec->generation);
5507
5508                 if (back->is_data) {
5509                         btrfs_set_extent_flags(leaf, ei,
5510                                                BTRFS_EXTENT_FLAG_DATA);
5511                 } else {
5512                         struct btrfs_disk_key copy_key;;
5513
5514                         tback = (struct tree_backref *)back;
5515                         bi = (struct btrfs_tree_block_info *)(ei + 1);
5516                         memset_extent_buffer(leaf, 0, (unsigned long)bi,
5517                                              sizeof(*bi));
5518
5519                         btrfs_set_disk_key_objectid(&copy_key,
5520                                                     rec->info_objectid);
5521                         btrfs_set_disk_key_type(&copy_key, 0);
5522                         btrfs_set_disk_key_offset(&copy_key, 0);
5523
5524                         btrfs_set_tree_block_level(leaf, bi, rec->info_level);
5525                         btrfs_set_tree_block_key(leaf, bi, &copy_key);
5526
5527                         btrfs_set_extent_flags(leaf, ei,
5528                                                BTRFS_EXTENT_FLAG_TREE_BLOCK | flags);
5529                 }
5530
5531                 btrfs_mark_buffer_dirty(leaf);
5532                 ret = btrfs_update_block_group(trans, extent_root, rec->start,
5533                                                rec->max_size, 1, 0);
5534                 if (ret)
5535                         goto fail;
5536                 btrfs_release_path(path);
5537         }
5538
5539         if (back->is_data) {
5540                 u64 parent;
5541                 int i;
5542
5543                 dback = (struct data_backref *)back;
5544                 if (back->full_backref)
5545                         parent = dback->parent;
5546                 else
5547                         parent = 0;
5548
5549                 for (i = 0; i < dback->found_ref; i++) {
5550                         /* if parent != 0, we're doing a full backref
5551                          * passing BTRFS_FIRST_FREE_OBJECTID as the owner
5552                          * just makes the backref allocator create a data
5553                          * backref
5554                          */
5555                         ret = btrfs_inc_extent_ref(trans, info->extent_root,
5556                                                    rec->start, rec->max_size,
5557                                                    parent,
5558                                                    dback->root,
5559                                                    parent ?
5560                                                    BTRFS_FIRST_FREE_OBJECTID :
5561                                                    dback->owner,
5562                                                    dback->offset);
5563                         if (ret)
5564                                 break;
5565                 }
5566                 fprintf(stderr, "adding new data backref"
5567                                 " on %llu %s %llu owner %llu"
5568                                 " offset %llu found %d\n",
5569                                 (unsigned long long)rec->start,
5570                                 back->full_backref ?
5571                                 "parent" : "root",
5572                                 back->full_backref ?
5573                                 (unsigned long long)parent :
5574                                 (unsigned long long)dback->root,
5575                                 (unsigned long long)dback->owner,
5576                                 (unsigned long long)dback->offset,
5577                                 dback->found_ref);
5578         } else {
5579                 u64 parent;
5580
5581                 tback = (struct tree_backref *)back;
5582                 if (back->full_backref)
5583                         parent = tback->parent;
5584                 else
5585                         parent = 0;
5586
5587                 ret = btrfs_inc_extent_ref(trans, info->extent_root,
5588                                            rec->start, rec->max_size,
5589                                            parent, tback->root, 0, 0);
5590                 fprintf(stderr, "adding new tree backref on "
5591                         "start %llu len %llu parent %llu root %llu\n",
5592                         rec->start, rec->max_size, tback->parent, tback->root);
5593         }
5594         if (ret)
5595                 goto fail;
5596 fail:
5597         btrfs_release_path(path);
5598         return ret;
5599 }
5600
5601 struct extent_entry {
5602         u64 bytenr;
5603         u64 bytes;
5604         int count;
5605         int broken;
5606         struct list_head list;
5607 };
5608
5609 static struct extent_entry *find_entry(struct list_head *entries,
5610                                        u64 bytenr, u64 bytes)
5611 {
5612         struct extent_entry *entry = NULL;
5613
5614         list_for_each_entry(entry, entries, list) {
5615                 if (entry->bytenr == bytenr && entry->bytes == bytes)
5616                         return entry;
5617         }
5618
5619         return NULL;
5620 }
5621
5622 static struct extent_entry *find_most_right_entry(struct list_head *entries)
5623 {
5624         struct extent_entry *entry, *best = NULL, *prev = NULL;
5625
5626         list_for_each_entry(entry, entries, list) {
5627                 if (!prev) {
5628                         prev = entry;
5629                         continue;
5630                 }
5631
5632                 /*
5633                  * If there are as many broken entries as entries then we know
5634                  * not to trust this particular entry.
5635                  */
5636                 if (entry->broken == entry->count)
5637                         continue;
5638
5639                 /*
5640                  * If our current entry == best then we can't be sure our best
5641                  * is really the best, so we need to keep searching.
5642                  */
5643                 if (best && best->count == entry->count) {
5644                         prev = entry;
5645                         best = NULL;
5646                         continue;
5647                 }
5648
5649                 /* Prev == entry, not good enough, have to keep searching */
5650                 if (!prev->broken && prev->count == entry->count)
5651                         continue;
5652
5653                 if (!best)
5654                         best = (prev->count > entry->count) ? prev : entry;
5655                 else if (best->count < entry->count)
5656                         best = entry;
5657                 prev = entry;
5658         }
5659
5660         return best;
5661 }
5662
5663 static int repair_ref(struct btrfs_trans_handle *trans,
5664                       struct btrfs_fs_info *info, struct btrfs_path *path,
5665                       struct data_backref *dback, struct extent_entry *entry)
5666 {
5667         struct btrfs_root *root;
5668         struct btrfs_file_extent_item *fi;
5669         struct extent_buffer *leaf;
5670         struct btrfs_key key;
5671         u64 bytenr, bytes;
5672         int ret;
5673
5674         key.objectid = dback->root;
5675         key.type = BTRFS_ROOT_ITEM_KEY;
5676         key.offset = (u64)-1;
5677         root = btrfs_read_fs_root(info, &key);
5678         if (IS_ERR(root)) {
5679                 fprintf(stderr, "Couldn't find root for our ref\n");
5680                 return -EINVAL;
5681         }
5682
5683         /*
5684          * The backref points to the original offset of the extent if it was
5685          * split, so we need to search down to the offset we have and then walk
5686          * forward until we find the backref we're looking for.
5687          */
5688         key.objectid = dback->owner;
5689         key.type = BTRFS_EXTENT_DATA_KEY;
5690         key.offset = dback->offset;
5691         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5692         if (ret < 0) {
5693                 fprintf(stderr, "Error looking up ref %d\n", ret);
5694                 return ret;
5695         }
5696
5697         while (1) {
5698                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5699                         ret = btrfs_next_leaf(root, path);
5700                         if (ret) {
5701                                 fprintf(stderr, "Couldn't find our ref, next\n");
5702                                 return -EINVAL;
5703                         }
5704                 }
5705                 leaf = path->nodes[0];
5706                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5707                 if (key.objectid != dback->owner ||
5708                     key.type != BTRFS_EXTENT_DATA_KEY) {
5709                         fprintf(stderr, "Couldn't find our ref, search\n");
5710                         return -EINVAL;
5711                 }
5712                 fi = btrfs_item_ptr(leaf, path->slots[0],
5713                                     struct btrfs_file_extent_item);
5714                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5715                 bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
5716
5717                 if (bytenr == dback->disk_bytenr && bytes == dback->bytes)
5718                         break;
5719                 path->slots[0]++;
5720         }
5721
5722         btrfs_release_path(path);
5723
5724         /*
5725          * Have to make sure that this root gets updated when we commit the
5726          * transaction
5727          */
5728         record_root_in_trans(trans, root);
5729
5730         /*
5731          * Ok we have the key of the file extent we want to fix, now we can cow
5732          * down to the thing and fix it.
5733          */
5734         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
5735         if (ret < 0) {
5736                 fprintf(stderr, "Error cowing down to ref [%Lu, %u, %Lu]: %d\n",
5737                         key.objectid, key.type, key.offset, ret);
5738                 return ret;
5739         }
5740         if (ret > 0) {
5741                 fprintf(stderr, "Well that's odd, we just found this key "
5742                         "[%Lu, %u, %Lu]\n", key.objectid, key.type,
5743                         key.offset);
5744                 return -EINVAL;
5745         }
5746         leaf = path->nodes[0];
5747         fi = btrfs_item_ptr(leaf, path->slots[0],
5748                             struct btrfs_file_extent_item);
5749
5750         if (btrfs_file_extent_compression(leaf, fi) &&
5751             dback->disk_bytenr != entry->bytenr) {
5752                 fprintf(stderr, "Ref doesn't match the record start and is "
5753                         "compressed, please take a btrfs-image of this file "
5754                         "system and send it to a btrfs developer so they can "
5755                         "complete this functionality for bytenr %Lu\n",
5756                         dback->disk_bytenr);
5757                 return -EINVAL;
5758         }
5759
5760         if (dback->node.broken && dback->disk_bytenr != entry->bytenr) {
5761                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
5762         } else if (dback->disk_bytenr > entry->bytenr) {
5763                 u64 off_diff, offset;
5764
5765                 off_diff = dback->disk_bytenr - entry->bytenr;
5766                 offset = btrfs_file_extent_offset(leaf, fi);
5767                 if (dback->disk_bytenr + offset +
5768                     btrfs_file_extent_num_bytes(leaf, fi) >
5769                     entry->bytenr + entry->bytes) {
5770                         fprintf(stderr, "Ref is past the entry end, please "
5771                                 "take a btrfs-image of this file system and "
5772                                 "send it to a btrfs developer, ref %Lu\n",
5773                                 dback->disk_bytenr);
5774                         return -EINVAL;
5775                 }
5776                 offset += off_diff;
5777                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
5778                 btrfs_set_file_extent_offset(leaf, fi, offset);
5779         } else if (dback->disk_bytenr < entry->bytenr) {
5780                 u64 offset;
5781
5782                 offset = btrfs_file_extent_offset(leaf, fi);
5783                 if (dback->disk_bytenr + offset < entry->bytenr) {
5784                         fprintf(stderr, "Ref is before the entry start, please"
5785                                 " take a btrfs-image of this file system and "
5786                                 "send it to a btrfs developer, ref %Lu\n",
5787                                 dback->disk_bytenr);
5788                         return -EINVAL;
5789                 }
5790
5791                 offset += dback->disk_bytenr;
5792                 offset -= entry->bytenr;
5793                 btrfs_set_file_extent_disk_bytenr(leaf, fi, entry->bytenr);
5794                 btrfs_set_file_extent_offset(leaf, fi, offset);
5795         }
5796
5797         btrfs_set_file_extent_disk_num_bytes(leaf, fi, entry->bytes);
5798
5799         /*
5800          * Chances are if disk_num_bytes were wrong then so is ram_bytes, but
5801          * only do this if we aren't using compression, otherwise it's a
5802          * trickier case.
5803          */
5804         if (!btrfs_file_extent_compression(leaf, fi))
5805                 btrfs_set_file_extent_ram_bytes(leaf, fi, entry->bytes);
5806         else
5807                 printf("ram bytes may be wrong?\n");
5808         btrfs_mark_buffer_dirty(leaf);
5809         btrfs_release_path(path);
5810         return 0;
5811 }
5812
5813 static int verify_backrefs(struct btrfs_trans_handle *trans,
5814                            struct btrfs_fs_info *info, struct btrfs_path *path,
5815                            struct extent_record *rec)
5816 {
5817         struct extent_backref *back;
5818         struct data_backref *dback;
5819         struct extent_entry *entry, *best = NULL;
5820         LIST_HEAD(entries);
5821         int nr_entries = 0;
5822         int broken_entries = 0;
5823         int ret = 0;
5824         short mismatch = 0;
5825
5826         /*
5827          * Metadata is easy and the backrefs should always agree on bytenr and
5828          * size, if not we've got bigger issues.
5829          */
5830         if (rec->metadata)
5831                 return 0;
5832
5833         list_for_each_entry(back, &rec->backrefs, list) {
5834                 if (back->full_backref || !back->is_data)
5835                         continue;
5836
5837                 dback = (struct data_backref *)back;
5838
5839                 /*
5840                  * We only pay attention to backrefs that we found a real
5841                  * backref for.
5842                  */
5843                 if (dback->found_ref == 0)
5844                         continue;
5845
5846                 /*
5847                  * For now we only catch when the bytes don't match, not the
5848                  * bytenr.  We can easily do this at the same time, but I want
5849                  * to have a fs image to test on before we just add repair
5850                  * functionality willy-nilly so we know we won't screw up the
5851                  * repair.
5852                  */
5853
5854                 entry = find_entry(&entries, dback->disk_bytenr,
5855                                    dback->bytes);
5856                 if (!entry) {
5857                         entry = malloc(sizeof(struct extent_entry));
5858                         if (!entry) {
5859                                 ret = -ENOMEM;
5860                                 goto out;
5861                         }
5862                         memset(entry, 0, sizeof(*entry));
5863                         entry->bytenr = dback->disk_bytenr;
5864                         entry->bytes = dback->bytes;
5865                         list_add_tail(&entry->list, &entries);
5866                         nr_entries++;
5867                 }
5868
5869                 /*
5870                  * If we only have on entry we may think the entries agree when
5871                  * in reality they don't so we have to do some extra checking.
5872                  */
5873                 if (dback->disk_bytenr != rec->start ||
5874                     dback->bytes != rec->nr || back->broken)
5875                         mismatch = 1;
5876
5877                 if (back->broken) {
5878                         entry->broken++;
5879                         broken_entries++;
5880                 }
5881
5882                 entry->count++;
5883         }
5884
5885         /* Yay all the backrefs agree, carry on good sir */
5886         if (nr_entries <= 1 && !mismatch)
5887                 goto out;
5888
5889         fprintf(stderr, "attempting to repair backref discrepency for bytenr "
5890                 "%Lu\n", rec->start);
5891
5892         /*
5893          * First we want to see if the backrefs can agree amongst themselves who
5894          * is right, so figure out which one of the entries has the highest
5895          * count.
5896          */
5897         best = find_most_right_entry(&entries);
5898
5899         /*
5900          * Ok so we may have an even split between what the backrefs think, so
5901          * this is where we use the extent ref to see what it thinks.
5902          */
5903         if (!best) {
5904                 entry = find_entry(&entries, rec->start, rec->nr);
5905                 if (!entry && (!broken_entries || !rec->found_rec)) {
5906                         fprintf(stderr, "Backrefs don't agree with each other "
5907                                 "and extent record doesn't agree with anybody,"
5908                                 " so we can't fix bytenr %Lu bytes %Lu\n",
5909                                 rec->start, rec->nr);
5910                         ret = -EINVAL;
5911                         goto out;
5912                 } else if (!entry) {
5913                         /*
5914                          * Ok our backrefs were broken, we'll assume this is the
5915                          * correct value and add an entry for this range.
5916                          */
5917                         entry = malloc(sizeof(struct extent_entry));
5918                         if (!entry) {
5919                                 ret = -ENOMEM;
5920                                 goto out;
5921                         }
5922                         memset(entry, 0, sizeof(*entry));
5923                         entry->bytenr = rec->start;
5924                         entry->bytes = rec->nr;
5925                         list_add_tail(&entry->list, &entries);
5926                         nr_entries++;
5927                 }
5928                 entry->count++;
5929                 best = find_most_right_entry(&entries);
5930                 if (!best) {
5931                         fprintf(stderr, "Backrefs and extent record evenly "
5932                                 "split on who is right, this is going to "
5933                                 "require user input to fix bytenr %Lu bytes "
5934                                 "%Lu\n", rec->start, rec->nr);
5935                         ret = -EINVAL;
5936                         goto out;
5937                 }
5938         }
5939
5940         /*
5941          * I don't think this can happen currently as we'll abort() if we catch
5942          * this case higher up, but in case somebody removes that we still can't
5943          * deal with it properly here yet, so just bail out of that's the case.
5944          */
5945         if (best->bytenr != rec->start) {
5946                 fprintf(stderr, "Extent start and backref starts don't match, "
5947                         "please use btrfs-image on this file system and send "
5948                         "it to a btrfs developer so they can make fsck fix "
5949                         "this particular case.  bytenr is %Lu, bytes is %Lu\n",
5950                         rec->start, rec->nr);
5951                 ret = -EINVAL;
5952                 goto out;
5953         }
5954
5955         /*
5956          * Ok great we all agreed on an extent record, let's go find the real
5957          * references and fix up the ones that don't match.
5958          */
5959         list_for_each_entry(back, &rec->backrefs, list) {
5960                 if (back->full_backref || !back->is_data)
5961                         continue;
5962
5963                 dback = (struct data_backref *)back;
5964
5965                 /*
5966                  * Still ignoring backrefs that don't have a real ref attached
5967                  * to them.
5968                  */
5969                 if (dback->found_ref == 0)
5970                         continue;
5971
5972                 if (dback->bytes == best->bytes &&
5973                     dback->disk_bytenr == best->bytenr)
5974                         continue;
5975
5976                 ret = repair_ref(trans, info, path, dback, best);
5977                 if (ret)
5978                         goto out;
5979         }
5980
5981         /*
5982          * Ok we messed with the actual refs, which means we need to drop our
5983          * entire cache and go back and rescan.  I know this is a huge pain and
5984          * adds a lot of extra work, but it's the only way to be safe.  Once all
5985          * the backrefs agree we may not need to do anything to the extent
5986          * record itself.
5987          */
5988         ret = -EAGAIN;
5989 out:
5990         while (!list_empty(&entries)) {
5991                 entry = list_entry(entries.next, struct extent_entry, list);
5992                 list_del_init(&entry->list);
5993                 free(entry);
5994         }
5995         return ret;
5996 }
5997
5998 static int process_duplicates(struct btrfs_root *root,
5999                               struct cache_tree *extent_cache,
6000                               struct extent_record *rec)
6001 {
6002         struct extent_record *good, *tmp;
6003         struct cache_extent *cache;
6004         int ret;
6005
6006         /*
6007          * If we found a extent record for this extent then return, or if we
6008          * have more than one duplicate we are likely going to need to delete
6009          * something.
6010          */
6011         if (rec->found_rec || rec->num_duplicates > 1)
6012                 return 0;
6013
6014         /* Shouldn't happen but just in case */
6015         BUG_ON(!rec->num_duplicates);
6016
6017         /*
6018          * So this happens if we end up with a backref that doesn't match the
6019          * actual extent entry.  So either the backref is bad or the extent
6020          * entry is bad.  Either way we want to have the extent_record actually
6021          * reflect what we found in the extent_tree, so we need to take the
6022          * duplicate out and use that as the extent_record since the only way we
6023          * get a duplicate is if we find a real life BTRFS_EXTENT_ITEM_KEY.
6024          */
6025         remove_cache_extent(extent_cache, &rec->cache);
6026
6027         good = list_entry(rec->dups.next, struct extent_record, list);
6028         list_del_init(&good->list);
6029         INIT_LIST_HEAD(&good->backrefs);
6030         INIT_LIST_HEAD(&good->dups);
6031         good->cache.start = good->start;
6032         good->cache.size = good->nr;
6033         good->content_checked = 0;
6034         good->owner_ref_checked = 0;
6035         good->num_duplicates = 0;
6036         good->refs = rec->refs;
6037         list_splice_init(&rec->backrefs, &good->backrefs);
6038         while (1) {
6039                 cache = lookup_cache_extent(extent_cache, good->start,
6040                                             good->nr);
6041                 if (!cache)
6042                         break;
6043                 tmp = container_of(cache, struct extent_record, cache);
6044
6045                 /*
6046                  * If we find another overlapping extent and it's found_rec is
6047                  * set then it's a duplicate and we need to try and delete
6048                  * something.
6049                  */
6050                 if (tmp->found_rec || tmp->num_duplicates > 0) {
6051                         if (list_empty(&good->list))
6052                                 list_add_tail(&good->list,
6053                                               &duplicate_extents);
6054                         good->num_duplicates += tmp->num_duplicates + 1;
6055                         list_splice_init(&tmp->dups, &good->dups);
6056                         list_del_init(&tmp->list);
6057                         list_add_tail(&tmp->list, &good->dups);
6058                         remove_cache_extent(extent_cache, &tmp->cache);
6059                         continue;
6060                 }
6061
6062                 /*
6063                  * Ok we have another non extent item backed extent rec, so lets
6064                  * just add it to this extent and carry on like we did above.
6065                  */
6066                 good->refs += tmp->refs;
6067                 list_splice_init(&tmp->backrefs, &good->backrefs);
6068                 remove_cache_extent(extent_cache, &tmp->cache);
6069                 free(tmp);
6070         }
6071         ret = insert_cache_extent(extent_cache, &good->cache);
6072         BUG_ON(ret);
6073         free(rec);
6074         return good->num_duplicates ? 0 : 1;
6075 }
6076
6077 static int delete_duplicate_records(struct btrfs_trans_handle *trans,
6078                                     struct btrfs_root *root,
6079                                     struct extent_record *rec)
6080 {
6081         LIST_HEAD(delete_list);
6082         struct btrfs_path *path;
6083         struct extent_record *tmp, *good, *n;
6084         int nr_del = 0;
6085         int ret = 0;
6086         struct btrfs_key key;
6087
6088         path = btrfs_alloc_path();
6089         if (!path) {
6090                 ret = -ENOMEM;
6091                 goto out;
6092         }
6093
6094         good = rec;
6095         /* Find the record that covers all of the duplicates. */
6096         list_for_each_entry(tmp, &rec->dups, list) {
6097                 if (good->start < tmp->start)
6098                         continue;
6099                 if (good->nr > tmp->nr)
6100                         continue;
6101
6102                 if (tmp->start + tmp->nr < good->start + good->nr) {
6103                         fprintf(stderr, "Ok we have overlapping extents that "
6104                                 "aren't completely covered by eachother, this "
6105                                 "is going to require more careful thought.  "
6106                                 "The extents are [%Lu-%Lu] and [%Lu-%Lu]\n",
6107                                 tmp->start, tmp->nr, good->start, good->nr);
6108                         abort();
6109                 }
6110                 good = tmp;
6111         }
6112
6113         if (good != rec)
6114                 list_add_tail(&rec->list, &delete_list);
6115
6116         list_for_each_entry_safe(tmp, n, &rec->dups, list) {
6117                 if (tmp == good)
6118                         continue;
6119                 list_move_tail(&tmp->list, &delete_list);
6120         }
6121
6122         root = root->fs_info->extent_root;
6123         list_for_each_entry(tmp, &delete_list, list) {
6124                 if (tmp->found_rec == 0)
6125                         continue;
6126                 key.objectid = tmp->start;
6127                 key.type = BTRFS_EXTENT_ITEM_KEY;
6128                 key.offset = tmp->nr;
6129
6130                 /* Shouldn't happen but just in case */
6131                 if (tmp->metadata) {
6132                         fprintf(stderr, "Well this shouldn't happen, extent "
6133                                 "record overlaps but is metadata? "
6134                                 "[%Lu, %Lu]\n", tmp->start, tmp->nr);
6135                         abort();
6136                 }
6137
6138                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6139                 if (ret) {
6140                         if (ret > 0)
6141                                 ret = -EINVAL;
6142                         goto out;
6143                 }
6144                 ret = btrfs_del_item(trans, root, path);
6145                 if (ret)
6146                         goto out;
6147                 btrfs_release_path(path);
6148                 nr_del++;
6149         }
6150
6151 out:
6152         while (!list_empty(&delete_list)) {
6153                 tmp = list_entry(delete_list.next, struct extent_record, list);
6154                 list_del_init(&tmp->list);
6155                 if (tmp == rec)
6156                         continue;
6157                 free(tmp);
6158         }
6159
6160         while (!list_empty(&rec->dups)) {
6161                 tmp = list_entry(rec->dups.next, struct extent_record, list);
6162                 list_del_init(&tmp->list);
6163                 free(tmp);
6164         }
6165
6166         btrfs_free_path(path);
6167
6168         if (!ret && !nr_del)
6169                 rec->num_duplicates = 0;
6170
6171         return ret ? ret : nr_del;
6172 }
6173
6174 static int find_possible_backrefs(struct btrfs_trans_handle *trans,
6175                                   struct btrfs_fs_info *info,
6176                                   struct btrfs_path *path,
6177                                   struct cache_tree *extent_cache,
6178                                   struct extent_record *rec)
6179 {
6180         struct btrfs_root *root;
6181         struct extent_backref *back;
6182         struct data_backref *dback;
6183         struct cache_extent *cache;
6184         struct btrfs_file_extent_item *fi;
6185         struct btrfs_key key;
6186         u64 bytenr, bytes;
6187         int ret;
6188
6189         list_for_each_entry(back, &rec->backrefs, list) {
6190                 /* Don't care about full backrefs (poor unloved backrefs) */
6191                 if (back->full_backref || !back->is_data)
6192                         continue;
6193
6194                 dback = (struct data_backref *)back;
6195
6196                 /* We found this one, we don't need to do a lookup */
6197                 if (dback->found_ref)
6198                         continue;
6199
6200                 key.objectid = dback->root;
6201                 key.type = BTRFS_ROOT_ITEM_KEY;
6202                 key.offset = (u64)-1;
6203
6204                 root = btrfs_read_fs_root(info, &key);
6205
6206                 /* No root, definitely a bad ref, skip */
6207                 if (IS_ERR(root) && PTR_ERR(root) == -ENOENT)
6208                         continue;
6209                 /* Other err, exit */
6210                 if (IS_ERR(root))
6211                         return PTR_ERR(root);
6212
6213                 key.objectid = dback->owner;
6214                 key.type = BTRFS_EXTENT_DATA_KEY;
6215                 key.offset = dback->offset;
6216                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6217                 if (ret) {
6218                         btrfs_release_path(path);
6219                         if (ret < 0)
6220                                 return ret;
6221                         /* Didn't find it, we can carry on */
6222                         ret = 0;
6223                         continue;
6224                 }
6225
6226                 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6227                                     struct btrfs_file_extent_item);
6228                 bytenr = btrfs_file_extent_disk_bytenr(path->nodes[0], fi);
6229                 bytes = btrfs_file_extent_disk_num_bytes(path->nodes[0], fi);
6230                 btrfs_release_path(path);
6231                 cache = lookup_cache_extent(extent_cache, bytenr, 1);
6232                 if (cache) {
6233                         struct extent_record *tmp;
6234                         tmp = container_of(cache, struct extent_record, cache);
6235
6236                         /*
6237                          * If we found an extent record for the bytenr for this
6238                          * particular backref then we can't add it to our
6239                          * current extent record.  We only want to add backrefs
6240                          * that don't have a corresponding extent item in the
6241                          * extent tree since they likely belong to this record
6242                          * and we need to fix it if it doesn't match bytenrs.
6243                          */
6244                         if  (tmp->found_rec)
6245                                 continue;
6246                 }
6247
6248                 dback->found_ref += 1;
6249                 dback->disk_bytenr = bytenr;
6250                 dback->bytes = bytes;
6251
6252                 /*
6253                  * Set this so the verify backref code knows not to trust the
6254                  * values in this backref.
6255                  */
6256                 back->broken = 1;
6257         }
6258
6259         return 0;
6260 }
6261
6262 /*
6263  * when an incorrect extent item is found, this will delete
6264  * all of the existing entries for it and recreate them
6265  * based on what the tree scan found.
6266  */
6267 static int fixup_extent_refs(struct btrfs_trans_handle *trans,
6268                              struct btrfs_fs_info *info,
6269                              struct cache_tree *extent_cache,
6270                              struct extent_record *rec)
6271 {
6272         int ret;
6273         struct btrfs_path *path;
6274         struct list_head *cur = rec->backrefs.next;
6275         struct cache_extent *cache;
6276         struct extent_backref *back;
6277         int allocated = 0;
6278         u64 flags = 0;
6279
6280         /*
6281          * remember our flags for recreating the extent.
6282          * FIXME, if we have cleared extent tree, we can not
6283          * lookup extent info in extent tree.
6284          */
6285         if (!init_extent_tree) {
6286                 ret = btrfs_lookup_extent_info(NULL, info->extent_root,
6287                                         rec->start, rec->max_size,
6288                                         rec->metadata, NULL, &flags);
6289                 if (ret < 0)
6290                         flags = 0;
6291         } else {
6292                 flags = 0;
6293         }
6294
6295         path = btrfs_alloc_path();
6296         if (!path)
6297                 return -ENOMEM;
6298
6299         if (rec->refs != rec->extent_item_refs && !rec->metadata) {
6300                 /*
6301                  * Sometimes the backrefs themselves are so broken they don't
6302                  * get attached to any meaningful rec, so first go back and
6303                  * check any of our backrefs that we couldn't find and throw
6304                  * them into the list if we find the backref so that
6305                  * verify_backrefs can figure out what to do.
6306                  */
6307                 ret = find_possible_backrefs(trans, info, path, extent_cache,
6308                                              rec);
6309                 if (ret < 0)
6310                         goto out;
6311         }
6312
6313         /* step one, make sure all of the backrefs agree */
6314         ret = verify_backrefs(trans, info, path, rec);
6315         if (ret < 0)
6316                 goto out;
6317
6318         /* step two, delete all the existing records */
6319         ret = delete_extent_records(trans, info->extent_root, path,
6320                                     rec->start, rec->max_size);
6321
6322         if (ret < 0)
6323                 goto out;
6324
6325         /* was this block corrupt?  If so, don't add references to it */
6326         cache = lookup_cache_extent(info->corrupt_blocks,
6327                                     rec->start, rec->max_size);
6328         if (cache) {
6329                 ret = 0;
6330                 goto out;
6331         }
6332
6333         /* step three, recreate all the refs we did find */
6334         while(cur != &rec->backrefs) {
6335                 back = list_entry(cur, struct extent_backref, list);
6336                 cur = cur->next;
6337
6338                 /*
6339                  * if we didn't find any references, don't create a
6340                  * new extent record
6341                  */
6342                 if (!back->found_ref)
6343                         continue;
6344
6345                 ret = record_extent(trans, info, path, rec, back, allocated, flags);
6346                 allocated = 1;
6347
6348                 if (ret)
6349                         goto out;
6350         }
6351 out:
6352         btrfs_free_path(path);
6353         return ret;
6354 }
6355
6356 /* right now we only prune from the extent allocation tree */
6357 static int prune_one_block(struct btrfs_trans_handle *trans,
6358                            struct btrfs_fs_info *info,
6359                            struct btrfs_corrupt_block *corrupt)
6360 {
6361         int ret;
6362         struct btrfs_path path;
6363         struct extent_buffer *eb;
6364         u64 found;
6365         int slot;
6366         int nritems;
6367         int level = corrupt->level + 1;
6368
6369         btrfs_init_path(&path);
6370 again:
6371         /* we want to stop at the parent to our busted block */
6372         path.lowest_level = level;
6373
6374         ret = btrfs_search_slot(trans, info->extent_root,
6375                                 &corrupt->key, &path, -1, 1);
6376
6377         if (ret < 0)
6378                 goto out;
6379
6380         eb = path.nodes[level];
6381         if (!eb) {
6382                 ret = -ENOENT;
6383                 goto out;
6384         }
6385
6386         /*
6387          * hopefully the search gave us the block we want to prune,
6388          * lets try that first
6389          */
6390         slot = path.slots[level];
6391         found =  btrfs_node_blockptr(eb, slot);
6392         if (found == corrupt->cache.start)
6393                 goto del_ptr;
6394
6395         nritems = btrfs_header_nritems(eb);
6396
6397         /* the search failed, lets scan this node and hope we find it */
6398         for (slot = 0; slot < nritems; slot++) {
6399                 found =  btrfs_node_blockptr(eb, slot);
6400                 if (found == corrupt->cache.start)
6401                         goto del_ptr;
6402         }
6403         /*
6404          * we couldn't find the bad block.  TODO, search all the nodes for pointers
6405          * to this block
6406          */
6407         if (eb == info->extent_root->node) {
6408                 ret = -ENOENT;
6409                 goto out;
6410         } else {
6411                 level++;
6412                 btrfs_release_path(&path);
6413                 goto again;
6414         }
6415
6416 del_ptr:
6417         printk("deleting pointer to block %Lu\n", corrupt->cache.start);
6418         ret = btrfs_del_ptr(trans, info->extent_root, &path, level, slot);
6419
6420 out:
6421         btrfs_release_path(&path);
6422         return ret;
6423 }
6424
6425 static int prune_corrupt_blocks(struct btrfs_trans_handle *trans,
6426                                 struct btrfs_fs_info *info)
6427 {
6428         struct cache_extent *cache;
6429         struct btrfs_corrupt_block *corrupt;
6430
6431         cache = search_cache_extent(info->corrupt_blocks, 0);
6432         while (1) {
6433                 if (!cache)
6434                         break;
6435                 corrupt = container_of(cache, struct btrfs_corrupt_block, cache);
6436                 prune_one_block(trans, info, corrupt);
6437                 cache = next_cache_extent(cache);
6438         }
6439         return 0;
6440 }
6441
6442 static void reset_cached_block_groups(struct btrfs_fs_info *fs_info)
6443 {
6444         struct btrfs_block_group_cache *cache;
6445         u64 start, end;
6446         int ret;
6447
6448         while (1) {
6449                 ret = find_first_extent_bit(&fs_info->free_space_cache, 0,
6450                                             &start, &end, EXTENT_DIRTY);
6451                 if (ret)
6452                         break;
6453                 clear_extent_dirty(&fs_info->free_space_cache, start, end,
6454                                    GFP_NOFS);
6455         }
6456
6457         start = 0;
6458         while (1) {
6459                 cache = btrfs_lookup_first_block_group(fs_info, start);
6460                 if (!cache)
6461                         break;
6462                 if (cache->cached)
6463                         cache->cached = 0;
6464                 start = cache->key.objectid + cache->key.offset;
6465         }
6466 }
6467
6468 static int check_extent_refs(struct btrfs_trans_handle *trans,
6469                              struct btrfs_root *root,
6470                              struct cache_tree *extent_cache)
6471 {
6472         struct extent_record *rec;
6473         struct cache_extent *cache;
6474         int err = 0;
6475         int ret = 0;
6476         int fixed = 0;
6477         int had_dups = 0;
6478
6479         if (repair) {
6480                 /*
6481                  * if we're doing a repair, we have to make sure
6482                  * we don't allocate from the problem extents.
6483                  * In the worst case, this will be all the
6484                  * extents in the FS
6485                  */
6486                 cache = search_cache_extent(extent_cache, 0);
6487                 while(cache) {
6488                         rec = container_of(cache, struct extent_record, cache);
6489                         btrfs_pin_extent(root->fs_info,
6490                                          rec->start, rec->max_size);
6491                         cache = next_cache_extent(cache);
6492                 }
6493
6494                 /* pin down all the corrupted blocks too */
6495                 cache = search_cache_extent(root->fs_info->corrupt_blocks, 0);
6496                 while(cache) {
6497                         btrfs_pin_extent(root->fs_info,
6498                                          cache->start, cache->size);
6499                         cache = next_cache_extent(cache);
6500                 }
6501                 prune_corrupt_blocks(trans, root->fs_info);
6502                 reset_cached_block_groups(root->fs_info);
6503         }
6504
6505         /*
6506          * We need to delete any duplicate entries we find first otherwise we
6507          * could mess up the extent tree when we have backrefs that actually
6508          * belong to a different extent item and not the weird duplicate one.
6509          */
6510         while (repair && !list_empty(&duplicate_extents)) {
6511                 rec = list_entry(duplicate_extents.next, struct extent_record,
6512                                  list);
6513                 list_del_init(&rec->list);
6514
6515                 /* Sometimes we can find a backref before we find an actual
6516                  * extent, so we need to process it a little bit to see if there
6517                  * truly are multiple EXTENT_ITEM_KEY's for the same range, or
6518                  * if this is a backref screwup.  If we need to delete stuff
6519                  * process_duplicates() will return 0, otherwise it will return
6520                  * 1 and we
6521                  */
6522                 if (process_duplicates(root, extent_cache, rec))
6523                         continue;
6524                 ret = delete_duplicate_records(trans, root, rec);
6525                 if (ret < 0)
6526                         return ret;
6527                 /*
6528                  * delete_duplicate_records will return the number of entries
6529                  * deleted, so if it's greater than 0 then we know we actually
6530                  * did something and we need to remove.
6531                  */
6532                 if (ret)
6533                         had_dups = 1;
6534         }
6535
6536         if (had_dups)
6537                 return -EAGAIN;
6538
6539         while(1) {
6540                 fixed = 0;
6541                 cache = search_cache_extent(extent_cache, 0);
6542                 if (!cache)
6543                         break;
6544                 rec = container_of(cache, struct extent_record, cache);
6545                 if (rec->num_duplicates) {
6546                         fprintf(stderr, "extent item %llu has multiple extent "
6547                                 "items\n", (unsigned long long)rec->start);
6548                         err = 1;
6549                 }
6550
6551                 if (rec->refs != rec->extent_item_refs) {
6552                         fprintf(stderr, "ref mismatch on [%llu %llu] ",
6553                                 (unsigned long long)rec->start,
6554                                 (unsigned long long)rec->nr);
6555                         fprintf(stderr, "extent item %llu, found %llu\n",
6556                                 (unsigned long long)rec->extent_item_refs,
6557                                 (unsigned long long)rec->refs);
6558                         if (!fixed && repair) {
6559                                 ret = fixup_extent_refs(trans, root->fs_info,
6560                                                         extent_cache, rec);
6561                                 if (ret)
6562                                         goto repair_abort;
6563                                 fixed = 1;
6564                         }
6565                         err = 1;
6566
6567                 }
6568                 if (all_backpointers_checked(rec, 1)) {
6569                         fprintf(stderr, "backpointer mismatch on [%llu %llu]\n",
6570                                 (unsigned long long)rec->start,
6571                                 (unsigned long long)rec->nr);
6572
6573                         if (!fixed && repair) {
6574                                 ret = fixup_extent_refs(trans, root->fs_info,
6575                                                         extent_cache, rec);
6576                                 if (ret)
6577                                         goto repair_abort;
6578                                 fixed = 1;
6579                         }
6580
6581                         err = 1;
6582                 }
6583                 if (!rec->owner_ref_checked) {
6584                         fprintf(stderr, "owner ref check failed [%llu %llu]\n",
6585                                 (unsigned long long)rec->start,
6586                                 (unsigned long long)rec->nr);
6587                         if (!fixed && repair) {
6588                                 ret = fixup_extent_refs(trans, root->fs_info,
6589                                                         extent_cache, rec);
6590                                 if (ret)
6591                                         goto repair_abort;
6592                                 fixed = 1;
6593                         }
6594                         err = 1;
6595                 }
6596
6597                 remove_cache_extent(extent_cache, cache);
6598                 free_all_extent_backrefs(rec);
6599                 free(rec);
6600         }
6601 repair_abort:
6602         if (repair) {
6603                 if (ret && ret != -EAGAIN) {
6604                         fprintf(stderr, "failed to repair damaged filesystem, aborting\n");
6605                         exit(1);
6606                 } else if (!ret) {
6607                         btrfs_fix_block_accounting(trans, root);
6608                 }
6609                 if (err)
6610                         fprintf(stderr, "repaired damaged extent references\n");
6611                 return ret;
6612         }
6613         return err;
6614 }
6615
6616 u64 calc_stripe_length(u64 type, u64 length, int num_stripes)
6617 {
6618         u64 stripe_size;
6619
6620         if (type & BTRFS_BLOCK_GROUP_RAID0) {
6621                 stripe_size = length;
6622                 stripe_size /= num_stripes;
6623         } else if (type & BTRFS_BLOCK_GROUP_RAID10) {
6624                 stripe_size = length * 2;
6625                 stripe_size /= num_stripes;
6626         } else if (type & BTRFS_BLOCK_GROUP_RAID5) {
6627                 stripe_size = length;
6628                 stripe_size /= (num_stripes - 1);
6629         } else if (type & BTRFS_BLOCK_GROUP_RAID6) {
6630                 stripe_size = length;
6631                 stripe_size /= (num_stripes - 2);
6632         } else {
6633                 stripe_size = length;
6634         }
6635         return stripe_size;
6636 }
6637
6638 /*
6639  * Check the chunk with its block group/dev list ref:
6640  * Return 0 if all refs seems valid.
6641  * Return 1 if part of refs seems valid, need later check for rebuild ref
6642  * like missing block group and needs to search extent tree to rebuild them.
6643  * Return -1 if essential refs are missing and unable to rebuild.
6644  */
6645 static int check_chunk_refs(struct chunk_record *chunk_rec,
6646                             struct block_group_tree *block_group_cache,
6647                             struct device_extent_tree *dev_extent_cache,
6648                             int silent)
6649 {
6650         struct cache_extent *block_group_item;
6651         struct block_group_record *block_group_rec;
6652         struct cache_extent *dev_extent_item;
6653         struct device_extent_record *dev_extent_rec;
6654         u64 devid;
6655         u64 offset;
6656         u64 length;
6657         int i;
6658         int ret = 0;
6659
6660         block_group_item = lookup_cache_extent(&block_group_cache->tree,
6661                                                chunk_rec->offset,
6662                                                chunk_rec->length);
6663         if (block_group_item) {
6664                 block_group_rec = container_of(block_group_item,
6665                                                struct block_group_record,
6666                                                cache);
6667                 if (chunk_rec->length != block_group_rec->offset ||
6668                     chunk_rec->offset != block_group_rec->objectid ||
6669                     chunk_rec->type_flags != block_group_rec->flags) {
6670                         if (!silent)
6671                                 fprintf(stderr,
6672                                         "Chunk[%llu, %u, %llu]: length(%llu), offset(%llu), type(%llu) mismatch with block group[%llu, %u, %llu]: offset(%llu), objectid(%llu), flags(%llu)\n",
6673                                         chunk_rec->objectid,
6674                                         chunk_rec->type,
6675                                         chunk_rec->offset,
6676                                         chunk_rec->length,
6677                                         chunk_rec->offset,
6678                                         chunk_rec->type_flags,
6679                                         block_group_rec->objectid,
6680                                         block_group_rec->type,
6681                                         block_group_rec->offset,
6682                                         block_group_rec->offset,
6683                                         block_group_rec->objectid,
6684                                         block_group_rec->flags);
6685                         ret = -1;
6686                 } else {
6687                         list_del_init(&block_group_rec->list);
6688                         chunk_rec->bg_rec = block_group_rec;
6689                 }
6690         } else {
6691                 if (!silent)
6692                         fprintf(stderr,
6693                                 "Chunk[%llu, %u, %llu]: length(%llu), offset(%llu), type(%llu) is not found in block group\n",
6694                                 chunk_rec->objectid,
6695                                 chunk_rec->type,
6696                                 chunk_rec->offset,
6697                                 chunk_rec->length,
6698                                 chunk_rec->offset,
6699                                 chunk_rec->type_flags);
6700                 ret = 1;
6701         }
6702
6703         length = calc_stripe_length(chunk_rec->type_flags, chunk_rec->length,
6704                                     chunk_rec->num_stripes);
6705         for (i = 0; i < chunk_rec->num_stripes; ++i) {
6706                 devid = chunk_rec->stripes[i].devid;
6707                 offset = chunk_rec->stripes[i].offset;
6708                 dev_extent_item = lookup_cache_extent2(&dev_extent_cache->tree,
6709                                                        devid, offset, length);
6710                 if (dev_extent_item) {
6711                         dev_extent_rec = container_of(dev_extent_item,
6712                                                 struct device_extent_record,
6713                                                 cache);
6714                         if (dev_extent_rec->objectid != devid ||
6715                             dev_extent_rec->offset != offset ||
6716                             dev_extent_rec->chunk_offset != chunk_rec->offset ||
6717                             dev_extent_rec->length != length) {
6718                                 if (!silent)
6719                                         fprintf(stderr,
6720                                                 "Chunk[%llu, %u, %llu] stripe[%llu, %llu] dismatch dev extent[%llu, %llu, %llu]\n",
6721                                                 chunk_rec->objectid,
6722                                                 chunk_rec->type,
6723                                                 chunk_rec->offset,
6724                                                 chunk_rec->stripes[i].devid,
6725                                                 chunk_rec->stripes[i].offset,
6726                                                 dev_extent_rec->objectid,
6727                                                 dev_extent_rec->offset,
6728                                                 dev_extent_rec->length);
6729                                 ret = -1;
6730                         } else {
6731                                 list_move(&dev_extent_rec->chunk_list,
6732                                           &chunk_rec->dextents);
6733                         }
6734                 } else {
6735                         if (!silent)
6736                                 fprintf(stderr,
6737                                         "Chunk[%llu, %u, %llu] stripe[%llu, %llu] is not found in dev extent\n",
6738                                         chunk_rec->objectid,
6739                                         chunk_rec->type,
6740                                         chunk_rec->offset,
6741                                         chunk_rec->stripes[i].devid,
6742                                         chunk_rec->stripes[i].offset);
6743                         ret = -1;
6744                 }
6745         }
6746         return ret;
6747 }
6748
6749 /* check btrfs_chunk -> btrfs_dev_extent / btrfs_block_group_item */
6750 int check_chunks(struct cache_tree *chunk_cache,
6751                  struct block_group_tree *block_group_cache,
6752                  struct device_extent_tree *dev_extent_cache,
6753                  struct list_head *good, struct list_head *bad,
6754                  struct list_head *rebuild, int silent)
6755 {
6756         struct cache_extent *chunk_item;
6757         struct chunk_record *chunk_rec;
6758         struct block_group_record *bg_rec;
6759         struct device_extent_record *dext_rec;
6760         int err;
6761         int ret = 0;
6762
6763         chunk_item = first_cache_extent(chunk_cache);
6764         while (chunk_item) {
6765                 chunk_rec = container_of(chunk_item, struct chunk_record,
6766                                          cache);
6767                 err = check_chunk_refs(chunk_rec, block_group_cache,
6768                                        dev_extent_cache, silent);
6769                 if (err)
6770                         ret = err;
6771                 if (err == 0 && good)
6772                         list_add_tail(&chunk_rec->list, good);
6773                 if (err > 0 && rebuild)
6774                         list_add_tail(&chunk_rec->list, rebuild);
6775                 if (err < 0 && bad)
6776                         list_add_tail(&chunk_rec->list, bad);
6777                 chunk_item = next_cache_extent(chunk_item);
6778         }
6779
6780         list_for_each_entry(bg_rec, &block_group_cache->block_groups, list) {
6781                 if (!silent)
6782                         fprintf(stderr,
6783                                 "Block group[%llu, %llu] (flags = %llu) didn't find the relative chunk.\n",
6784                                 bg_rec->objectid,
6785                                 bg_rec->offset,
6786                                 bg_rec->flags);
6787                 if (!ret)
6788                         ret = 1;
6789         }
6790
6791         list_for_each_entry(dext_rec, &dev_extent_cache->no_chunk_orphans,
6792                             chunk_list) {
6793                 if (!silent)
6794                         fprintf(stderr,
6795                                 "Device extent[%llu, %llu, %llu] didn't find the relative chunk.\n",
6796                                 dext_rec->objectid,
6797                                 dext_rec->offset,
6798                                 dext_rec->length);
6799                 if (!ret)
6800                         ret = 1;
6801         }
6802         return ret;
6803 }
6804
6805
6806 static int check_device_used(struct device_record *dev_rec,
6807                              struct device_extent_tree *dext_cache)
6808 {
6809         struct cache_extent *cache;
6810         struct device_extent_record *dev_extent_rec;
6811         u64 total_byte = 0;
6812
6813         cache = search_cache_extent2(&dext_cache->tree, dev_rec->devid, 0);
6814         while (cache) {
6815                 dev_extent_rec = container_of(cache,
6816                                               struct device_extent_record,
6817                                               cache);
6818                 if (dev_extent_rec->objectid != dev_rec->devid)
6819                         break;
6820
6821                 list_del_init(&dev_extent_rec->device_list);
6822                 total_byte += dev_extent_rec->length;
6823                 cache = next_cache_extent(cache);
6824         }
6825
6826         if (total_byte != dev_rec->byte_used) {
6827                 fprintf(stderr,
6828                         "Dev extent's total-byte(%llu) is not equal to byte-used(%llu) in dev[%llu, %u, %llu]\n",
6829                         total_byte, dev_rec->byte_used, dev_rec->objectid,
6830                         dev_rec->type, dev_rec->offset);
6831                 return -1;
6832         } else {
6833                 return 0;
6834         }
6835 }
6836
6837 /* check btrfs_dev_item -> btrfs_dev_extent */
6838 static int check_devices(struct rb_root *dev_cache,
6839                          struct device_extent_tree *dev_extent_cache)
6840 {
6841         struct rb_node *dev_node;
6842         struct device_record *dev_rec;
6843         struct device_extent_record *dext_rec;
6844         int err;
6845         int ret = 0;
6846
6847         dev_node = rb_first(dev_cache);
6848         while (dev_node) {
6849                 dev_rec = container_of(dev_node, struct device_record, node);
6850                 err = check_device_used(dev_rec, dev_extent_cache);
6851                 if (err)
6852                         ret = err;
6853
6854                 dev_node = rb_next(dev_node);
6855         }
6856         list_for_each_entry(dext_rec, &dev_extent_cache->no_device_orphans,
6857                             device_list) {
6858                 fprintf(stderr,
6859                         "Device extent[%llu, %llu, %llu] didn't find its device.\n",
6860                         dext_rec->objectid, dext_rec->offset, dext_rec->length);
6861                 if (!ret)
6862                         ret = 1;
6863         }
6864         return ret;
6865 }
6866
6867 static int check_chunks_and_extents(struct btrfs_root *root)
6868 {
6869         struct rb_root dev_cache;
6870         struct cache_tree chunk_cache;
6871         struct block_group_tree block_group_cache;
6872         struct device_extent_tree dev_extent_cache;
6873         struct cache_tree extent_cache;
6874         struct cache_tree seen;
6875         struct cache_tree pending;
6876         struct cache_tree reada;
6877         struct cache_tree nodes;
6878         struct cache_tree corrupt_blocks;
6879         struct btrfs_path path;
6880         struct btrfs_key key;
6881         struct btrfs_key found_key;
6882         int ret, err = 0;
6883         u64 last = 0;
6884         struct block_info *bits;
6885         int bits_nr;
6886         struct extent_buffer *leaf;
6887         struct btrfs_trans_handle *trans = NULL;
6888         int slot;
6889         struct btrfs_root_item ri;
6890         struct list_head dropping_trees;
6891
6892         dev_cache = RB_ROOT;
6893         cache_tree_init(&chunk_cache);
6894         block_group_tree_init(&block_group_cache);
6895         device_extent_tree_init(&dev_extent_cache);
6896
6897         cache_tree_init(&extent_cache);
6898         cache_tree_init(&seen);
6899         cache_tree_init(&pending);
6900         cache_tree_init(&nodes);
6901         cache_tree_init(&reada);
6902         cache_tree_init(&corrupt_blocks);
6903         INIT_LIST_HEAD(&dropping_trees);
6904
6905         if (repair) {
6906                 trans = btrfs_start_transaction(root, 1);
6907                 if (IS_ERR(trans)) {
6908                         fprintf(stderr, "Error starting transaction\n");
6909                         return PTR_ERR(trans);
6910                 }
6911                 root->fs_info->fsck_extent_cache = &extent_cache;
6912                 root->fs_info->free_extent_hook = free_extent_hook;
6913                 root->fs_info->corrupt_blocks = &corrupt_blocks;
6914         }
6915
6916         bits_nr = 1024;
6917         bits = malloc(bits_nr * sizeof(struct block_info));
6918         if (!bits) {
6919                 perror("malloc");
6920                 exit(1);
6921         }
6922
6923 again:
6924         add_root_to_pending(root->fs_info->tree_root->node,
6925                             &extent_cache, &pending, &seen, &nodes,
6926                             &root->fs_info->tree_root->root_key);
6927
6928         add_root_to_pending(root->fs_info->chunk_root->node,
6929                             &extent_cache, &pending, &seen, &nodes,
6930                             &root->fs_info->chunk_root->root_key);
6931
6932         btrfs_init_path(&path);
6933         key.offset = 0;
6934         key.objectid = 0;
6935         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
6936         ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
6937                                         &key, &path, 0, 0);
6938         if (ret < 0)
6939                 goto out;
6940         while(1) {
6941                 leaf = path.nodes[0];
6942                 slot = path.slots[0];
6943                 if (slot >= btrfs_header_nritems(path.nodes[0])) {
6944                         ret = btrfs_next_leaf(root, &path);
6945                         if (ret != 0)
6946                                 break;
6947                         leaf = path.nodes[0];
6948                         slot = path.slots[0];
6949                 }
6950                 btrfs_item_key_to_cpu(leaf, &found_key, path.slots[0]);
6951                 if (btrfs_key_type(&found_key) == BTRFS_ROOT_ITEM_KEY) {
6952                         unsigned long offset;
6953                         struct extent_buffer *buf;
6954
6955                         offset = btrfs_item_ptr_offset(leaf, path.slots[0]);
6956                         read_extent_buffer(leaf, &ri, offset, sizeof(ri));
6957                         if (btrfs_disk_key_objectid(&ri.drop_progress) == 0) {
6958                                 buf = read_tree_block(root->fs_info->tree_root,
6959                                                       btrfs_root_bytenr(&ri),
6960                                                       btrfs_level_size(root,
6961                                                       btrfs_root_level(&ri)),
6962                                                       0);
6963                                 if (!buf) {
6964                                         ret = -EIO;
6965                                         goto out;
6966                                 }
6967                                 add_root_to_pending(buf, &extent_cache,
6968                                                     &pending, &seen, &nodes,
6969                                                     &found_key);
6970                                 free_extent_buffer(buf);
6971                         } else {
6972                                 struct dropping_root_item_record *dri_rec;
6973                                 dri_rec = malloc(sizeof(*dri_rec));
6974                                 if (!dri_rec) {
6975                                         perror("malloc");
6976                                         exit(1);
6977                                 }
6978                                 memcpy(&dri_rec->ri, &ri, sizeof(ri));
6979                                 memcpy(&dri_rec->found_key, &found_key,
6980                                        sizeof(found_key));
6981                                 list_add_tail(&dri_rec->list, &dropping_trees);
6982                         }
6983                 }
6984                 path.slots[0]++;
6985         }
6986         btrfs_release_path(&path);
6987         while (1) {
6988                 ret = run_next_block(trans, root, bits, bits_nr, &last,
6989                                      &pending, &seen, &reada, &nodes,
6990                                      &extent_cache, &chunk_cache, &dev_cache,
6991                                      &block_group_cache, &dev_extent_cache,
6992                                      NULL);
6993                 if (ret != 0)
6994                         break;
6995         }
6996
6997         while (!list_empty(&dropping_trees)) {
6998                 struct dropping_root_item_record *rec;
6999                 struct extent_buffer *buf;
7000                 rec = list_entry(dropping_trees.next,
7001                                  struct dropping_root_item_record, list);
7002                 last = 0;
7003                 if (!bits) {
7004                         perror("realloc");
7005                         exit(1);
7006                 }
7007                 buf = read_tree_block(root->fs_info->tree_root,
7008                                       btrfs_root_bytenr(&rec->ri),
7009                                       btrfs_level_size(root,
7010                                       btrfs_root_level(&rec->ri)), 0);
7011                 if (!buf) {
7012                         ret = -EIO;
7013                         goto out;
7014                 }
7015                 add_root_to_pending(buf, &extent_cache, &pending,
7016                                     &seen, &nodes, &rec->found_key);
7017                 while (1) {
7018                         ret = run_next_block(trans, root, bits, bits_nr, &last,
7019                                              &pending, &seen, &reada,
7020                                              &nodes, &extent_cache,
7021                                              &chunk_cache, &dev_cache,
7022                                              &block_group_cache,
7023                                              &dev_extent_cache,
7024                                              &rec->ri);
7025                         if (ret != 0)
7026                                 break;
7027                 }
7028                 free_extent_buffer(buf);
7029                 list_del(&rec->list);
7030                 free(rec);
7031         }
7032
7033         if (ret >= 0)
7034                 ret = check_extent_refs(trans, root, &extent_cache);
7035         if (ret == -EAGAIN) {
7036                 ret = btrfs_commit_transaction(trans, root);
7037                 if (ret)
7038                         goto out;
7039
7040                 trans = btrfs_start_transaction(root, 1);
7041                 if (IS_ERR(trans)) {
7042                         ret = PTR_ERR(trans);
7043                         goto out;
7044                 }
7045
7046                 free_corrupt_blocks_tree(root->fs_info->corrupt_blocks);
7047                 free_extent_cache_tree(&seen);
7048                 free_extent_cache_tree(&pending);
7049                 free_extent_cache_tree(&reada);
7050                 free_extent_cache_tree(&nodes);
7051                 free_chunk_cache_tree(&chunk_cache);
7052                 free_block_group_tree(&block_group_cache);
7053                 free_device_cache_tree(&dev_cache);
7054                 free_device_extent_tree(&dev_extent_cache);
7055                 free_extent_record_cache(root->fs_info, &extent_cache);
7056                 goto again;
7057         }
7058
7059         err = check_chunks(&chunk_cache, &block_group_cache,
7060                            &dev_extent_cache, NULL, NULL, NULL, 0);
7061         if (err && !ret)
7062                 ret = err;
7063
7064         err = check_devices(&dev_cache, &dev_extent_cache);
7065         if (err && !ret)
7066                 ret = err;
7067
7068 out:
7069         if (trans) {
7070                 err = btrfs_commit_transaction(trans, root);
7071                 if (!ret)
7072                         ret = err;
7073         }
7074         if (repair) {
7075                 free_corrupt_blocks_tree(root->fs_info->corrupt_blocks);
7076                 root->fs_info->fsck_extent_cache = NULL;
7077                 root->fs_info->free_extent_hook = NULL;
7078                 root->fs_info->corrupt_blocks = NULL;
7079         }
7080         free(bits);
7081         free_chunk_cache_tree(&chunk_cache);
7082         free_device_cache_tree(&dev_cache);
7083         free_block_group_tree(&block_group_cache);
7084         free_device_extent_tree(&dev_extent_cache);
7085         free_extent_cache_tree(&seen);
7086         free_extent_cache_tree(&pending);
7087         free_extent_cache_tree(&reada);
7088         free_extent_cache_tree(&nodes);
7089         return ret;
7090 }
7091
7092 static int btrfs_fsck_reinit_root(struct btrfs_trans_handle *trans,
7093                            struct btrfs_root *root, int overwrite)
7094 {
7095         struct extent_buffer *c;
7096         struct extent_buffer *old = root->node;
7097         int level;
7098         int ret;
7099         struct btrfs_disk_key disk_key = {0,0,0};
7100
7101         level = 0;
7102
7103         if (overwrite) {
7104                 c = old;
7105                 extent_buffer_get(c);
7106                 goto init;
7107         }
7108         c = btrfs_alloc_free_block(trans, root,
7109                                    btrfs_level_size(root, 0),
7110                                    root->root_key.objectid,
7111                                    &disk_key, level, 0, 0);
7112         if (IS_ERR(c)) {
7113                 c = old;
7114                 extent_buffer_get(c);
7115                 overwrite = 1;
7116         }
7117 init:
7118         memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
7119         btrfs_set_header_level(c, level);
7120         btrfs_set_header_bytenr(c, c->start);
7121         btrfs_set_header_generation(c, trans->transid);
7122         btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
7123         btrfs_set_header_owner(c, root->root_key.objectid);
7124
7125         write_extent_buffer(c, root->fs_info->fsid,
7126                             btrfs_header_fsid(), BTRFS_FSID_SIZE);
7127
7128         write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
7129                             btrfs_header_chunk_tree_uuid(c),
7130                             BTRFS_UUID_SIZE);
7131
7132         btrfs_mark_buffer_dirty(c);
7133         /*
7134          * this case can happen in the following case:
7135          *
7136          * 1.overwrite previous root.
7137          *
7138          * 2.reinit reloc data root, this is because we skip pin
7139          * down reloc data tree before which means we can allocate
7140          * same block bytenr here.
7141          */
7142         if (old->start == c->start) {
7143                 btrfs_set_root_generation(&root->root_item,
7144                                           trans->transid);
7145                 root->root_item.level = btrfs_header_level(root->node);
7146                 ret = btrfs_update_root(trans, root->fs_info->tree_root,
7147                                         &root->root_key, &root->root_item);
7148                 if (ret) {
7149                         free_extent_buffer(c);
7150                         return ret;
7151                 }
7152         }
7153         free_extent_buffer(old);
7154         root->node = c;
7155         add_root_to_dirty_list(root);
7156         return 0;
7157 }
7158
7159 static int pin_down_tree_blocks(struct btrfs_fs_info *fs_info,
7160                                 struct extent_buffer *eb, int tree_root)
7161 {
7162         struct extent_buffer *tmp;
7163         struct btrfs_root_item *ri;
7164         struct btrfs_key key;
7165         u64 bytenr;
7166         u32 leafsize;
7167         int level = btrfs_header_level(eb);
7168         int nritems;
7169         int ret;
7170         int i;
7171
7172         /*
7173          * If we have pinned this block before, don't pin it again.
7174          * This can not only avoid forever loop with broken filesystem
7175          * but also give us some speedups.
7176          */
7177         if (test_range_bit(&fs_info->pinned_extents, eb->start,
7178                            eb->start + eb->len - 1, EXTENT_DIRTY, 0))
7179                 return 0;
7180
7181         btrfs_pin_extent(fs_info, eb->start, eb->len);
7182
7183         leafsize = btrfs_super_leafsize(fs_info->super_copy);
7184         nritems = btrfs_header_nritems(eb);
7185         for (i = 0; i < nritems; i++) {
7186                 if (level == 0) {
7187                         btrfs_item_key_to_cpu(eb, &key, i);
7188                         if (key.type != BTRFS_ROOT_ITEM_KEY)
7189                                 continue;
7190                         /* Skip the extent root and reloc roots */
7191                         if (key.objectid == BTRFS_EXTENT_TREE_OBJECTID ||
7192                             key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
7193                             key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
7194                                 continue;
7195                         ri = btrfs_item_ptr(eb, i, struct btrfs_root_item);
7196                         bytenr = btrfs_disk_root_bytenr(eb, ri);
7197
7198                         /*
7199                          * If at any point we start needing the real root we
7200                          * will have to build a stump root for the root we are
7201                          * in, but for now this doesn't actually use the root so
7202                          * just pass in extent_root.
7203                          */
7204                         tmp = read_tree_block(fs_info->extent_root, bytenr,
7205                                               leafsize, 0);
7206                         if (!tmp) {
7207                                 fprintf(stderr, "Error reading root block\n");
7208                                 return -EIO;
7209                         }
7210                         ret = pin_down_tree_blocks(fs_info, tmp, 0);
7211                         free_extent_buffer(tmp);
7212                         if (ret)
7213                                 return ret;
7214                 } else {
7215                         bytenr = btrfs_node_blockptr(eb, i);
7216
7217                         /* If we aren't the tree root don't read the block */
7218                         if (level == 1 && !tree_root) {
7219                                 btrfs_pin_extent(fs_info, bytenr, leafsize);
7220                                 continue;
7221                         }
7222
7223                         tmp = read_tree_block(fs_info->extent_root, bytenr,
7224                                               leafsize, 0);
7225                         if (!tmp) {
7226                                 fprintf(stderr, "Error reading tree block\n");
7227                                 return -EIO;
7228                         }
7229                         ret = pin_down_tree_blocks(fs_info, tmp, tree_root);
7230                         free_extent_buffer(tmp);
7231                         if (ret)
7232                                 return ret;
7233                 }
7234         }
7235
7236         return 0;
7237 }
7238
7239 static int pin_metadata_blocks(struct btrfs_fs_info *fs_info)
7240 {
7241         int ret;
7242
7243         ret = pin_down_tree_blocks(fs_info, fs_info->chunk_root->node, 0);
7244         if (ret)
7245                 return ret;
7246
7247         return pin_down_tree_blocks(fs_info, fs_info->tree_root->node, 1);
7248 }
7249
7250 static int reset_block_groups(struct btrfs_fs_info *fs_info)
7251 {
7252         struct btrfs_block_group_cache *cache;
7253         struct btrfs_path *path;
7254         struct extent_buffer *leaf;
7255         struct btrfs_chunk *chunk;
7256         struct btrfs_key key;
7257         int ret;
7258         u64 start;
7259
7260         path = btrfs_alloc_path();
7261         if (!path)
7262                 return -ENOMEM;
7263
7264         key.objectid = 0;
7265         key.type = BTRFS_CHUNK_ITEM_KEY;
7266         key.offset = 0;
7267
7268         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
7269         if (ret < 0) {
7270                 btrfs_free_path(path);
7271                 return ret;
7272         }
7273
7274         /*
7275          * We do this in case the block groups were screwed up and had alloc
7276          * bits that aren't actually set on the chunks.  This happens with
7277          * restored images every time and could happen in real life I guess.
7278          */
7279         fs_info->avail_data_alloc_bits = 0;
7280         fs_info->avail_metadata_alloc_bits = 0;
7281         fs_info->avail_system_alloc_bits = 0;
7282
7283         /* First we need to create the in-memory block groups */
7284         while (1) {
7285                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7286                         ret = btrfs_next_leaf(fs_info->chunk_root, path);
7287                         if (ret < 0) {
7288                                 btrfs_free_path(path);
7289                                 return ret;
7290                         }
7291                         if (ret) {
7292                                 ret = 0;
7293                                 break;
7294                         }
7295                 }
7296                 leaf = path->nodes[0];
7297                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7298                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7299                         path->slots[0]++;
7300                         continue;
7301                 }
7302
7303                 chunk = btrfs_item_ptr(leaf, path->slots[0],
7304                                        struct btrfs_chunk);
7305                 btrfs_add_block_group(fs_info, 0,
7306                                       btrfs_chunk_type(leaf, chunk),
7307                                       key.objectid, key.offset,
7308                                       btrfs_chunk_length(leaf, chunk));
7309                 set_extent_dirty(&fs_info->free_space_cache, key.offset,
7310                                  key.offset + btrfs_chunk_length(leaf, chunk),
7311                                  GFP_NOFS);
7312                 path->slots[0]++;
7313         }
7314         start = 0;
7315         while (1) {
7316                 cache = btrfs_lookup_first_block_group(fs_info, start);
7317                 if (!cache)
7318                         break;
7319                 cache->cached = 1;
7320                 start = cache->key.objectid + cache->key.offset;
7321         }
7322
7323         btrfs_free_path(path);
7324         return 0;
7325 }
7326
7327 static int reset_balance(struct btrfs_trans_handle *trans,
7328                          struct btrfs_fs_info *fs_info)
7329 {
7330         struct btrfs_root *root = fs_info->tree_root;
7331         struct btrfs_path *path;
7332         struct extent_buffer *leaf;
7333         struct btrfs_key key;
7334         int del_slot, del_nr = 0;
7335         int ret;
7336         int found = 0;
7337
7338         path = btrfs_alloc_path();
7339         if (!path)
7340                 return -ENOMEM;
7341
7342         key.objectid = BTRFS_BALANCE_OBJECTID;
7343         key.type = BTRFS_BALANCE_ITEM_KEY;
7344         key.offset = 0;
7345
7346         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7347         if (ret) {
7348                 if (ret > 0)
7349                         ret = 0;
7350                 if (!ret)
7351                         goto reinit_data_reloc;
7352                 else
7353                         goto out;
7354         }
7355
7356         ret = btrfs_del_item(trans, root, path);
7357         if (ret)
7358                 goto out;
7359         btrfs_release_path(path);
7360
7361         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7362         key.type = BTRFS_ROOT_ITEM_KEY;
7363         key.offset = 0;
7364
7365         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7366         if (ret < 0)
7367                 goto out;
7368         while (1) {
7369                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7370                         if (!found)
7371                                 break;
7372
7373                         if (del_nr) {
7374                                 ret = btrfs_del_items(trans, root, path,
7375                                                       del_slot, del_nr);
7376                                 del_nr = 0;
7377                                 if (ret)
7378                                         goto out;
7379                         }
7380                         key.offset++;
7381                         btrfs_release_path(path);
7382
7383                         found = 0;
7384                         ret = btrfs_search_slot(trans, root, &key, path,
7385                                                 -1, 1);
7386                         if (ret < 0)
7387                                 goto out;
7388                         continue;
7389                 }
7390                 found = 1;
7391                 leaf = path->nodes[0];
7392                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7393                 if (key.objectid > BTRFS_TREE_RELOC_OBJECTID)
7394                         break;
7395                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7396                         path->slots[0]++;
7397                         continue;
7398                 }
7399                 if (!del_nr) {
7400                         del_slot = path->slots[0];
7401                         del_nr = 1;
7402                 } else {
7403                         del_nr++;
7404                 }
7405                 path->slots[0]++;
7406         }
7407
7408         if (del_nr) {
7409                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
7410                 if (ret)
7411                         goto out;
7412         }
7413         btrfs_release_path(path);
7414
7415 reinit_data_reloc:
7416         key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7417         key.type = BTRFS_ROOT_ITEM_KEY;
7418         key.offset = (u64)-1;
7419         root = btrfs_read_fs_root(fs_info, &key);
7420         if (IS_ERR(root)) {
7421                 fprintf(stderr, "Error reading data reloc tree\n");
7422                 return PTR_ERR(root);
7423         }
7424         record_root_in_trans(trans, root);
7425         ret = btrfs_fsck_reinit_root(trans, root, 0);
7426         if (ret)
7427                 goto out;
7428         ret = btrfs_make_root_dir(trans, root, BTRFS_FIRST_FREE_OBJECTID);
7429 out:
7430         btrfs_free_path(path);
7431         return ret;
7432 }
7433
7434 static int reinit_extent_tree(struct btrfs_trans_handle *trans,
7435                               struct btrfs_fs_info *fs_info)
7436 {
7437         u64 start = 0;
7438         int ret;
7439
7440         /*
7441          * The only reason we don't do this is because right now we're just
7442          * walking the trees we find and pinning down their bytes, we don't look
7443          * at any of the leaves.  In order to do mixed groups we'd have to check
7444          * the leaves of any fs roots and pin down the bytes for any file
7445          * extents we find.  Not hard but why do it if we don't have to?
7446          */
7447         if (btrfs_fs_incompat(fs_info, BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)) {
7448                 fprintf(stderr, "We don't support re-initing the extent tree "
7449                         "for mixed block groups yet, please notify a btrfs "
7450                         "developer you want to do this so they can add this "
7451                         "functionality.\n");
7452                 return -EINVAL;
7453         }
7454
7455         /*
7456          * first we need to walk all of the trees except the extent tree and pin
7457          * down the bytes that are in use so we don't overwrite any existing
7458          * metadata.
7459          */
7460         ret = pin_metadata_blocks(fs_info);
7461         if (ret) {
7462                 fprintf(stderr, "error pinning down used bytes\n");
7463                 return ret;
7464         }
7465
7466         /*
7467          * Need to drop all the block groups since we're going to recreate all
7468          * of them again.
7469          */
7470         btrfs_free_block_groups(fs_info);
7471         ret = reset_block_groups(fs_info);
7472         if (ret) {
7473                 fprintf(stderr, "error resetting the block groups\n");
7474                 return ret;
7475         }
7476
7477         /* Ok we can allocate now, reinit the extent root */
7478         ret = btrfs_fsck_reinit_root(trans, fs_info->extent_root, 0);
7479         if (ret) {
7480                 fprintf(stderr, "extent root initialization failed\n");
7481                 /*
7482                  * When the transaction code is updated we should end the
7483                  * transaction, but for now progs only knows about commit so
7484                  * just return an error.
7485                  */
7486                 return ret;
7487         }
7488
7489         /*
7490          * Now we have all the in-memory block groups setup so we can make
7491          * allocations properly, and the metadata we care about is safe since we
7492          * pinned all of it above.
7493          */
7494         while (1) {
7495                 struct btrfs_block_group_cache *cache;
7496
7497                 cache = btrfs_lookup_first_block_group(fs_info, start);
7498                 if (!cache)
7499                         break;
7500                 start = cache->key.objectid + cache->key.offset;
7501                 ret = btrfs_insert_item(trans, fs_info->extent_root,
7502                                         &cache->key, &cache->item,
7503                                         sizeof(cache->item));
7504                 if (ret) {
7505                         fprintf(stderr, "Error adding block group\n");
7506                         return ret;
7507                 }
7508                 btrfs_extent_post_op(trans, fs_info->extent_root);
7509         }
7510
7511         ret = reset_balance(trans, fs_info);
7512         if (ret)
7513                 fprintf(stderr, "error reseting the pending balance\n");
7514
7515         return ret;
7516 }
7517
7518 static int recow_extent_buffer(struct btrfs_root *root, struct extent_buffer *eb)
7519 {
7520         struct btrfs_path *path;
7521         struct btrfs_trans_handle *trans;
7522         struct btrfs_key key;
7523         int ret;
7524
7525         printf("Recowing metadata block %llu\n", eb->start);
7526         key.objectid = btrfs_header_owner(eb);
7527         key.type = BTRFS_ROOT_ITEM_KEY;
7528         key.offset = (u64)-1;
7529
7530         root = btrfs_read_fs_root(root->fs_info, &key);
7531         if (IS_ERR(root)) {
7532                 fprintf(stderr, "Couldn't find owner root %llu\n",
7533                         key.objectid);
7534                 return PTR_ERR(root);
7535         }
7536
7537         path = btrfs_alloc_path();
7538         if (!path)
7539                 return -ENOMEM;
7540
7541         trans = btrfs_start_transaction(root, 1);
7542         if (IS_ERR(trans)) {
7543                 btrfs_free_path(path);
7544                 return PTR_ERR(trans);
7545         }
7546
7547         path->lowest_level = btrfs_header_level(eb);
7548         if (path->lowest_level)
7549                 btrfs_node_key_to_cpu(eb, &key, 0);
7550         else
7551                 btrfs_item_key_to_cpu(eb, &key, 0);
7552
7553         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
7554         btrfs_commit_transaction(trans, root);
7555         btrfs_free_path(path);
7556         return ret;
7557 }
7558
7559 static int delete_bad_item(struct btrfs_root *root, struct bad_item *bad)
7560 {
7561         struct btrfs_path *path;
7562         struct btrfs_trans_handle *trans;
7563         struct btrfs_key key;
7564         int ret;
7565
7566         printf("Deleting bad item [%llu,%u,%llu]\n", bad->key.objectid,
7567                bad->key.type, bad->key.offset);
7568         key.objectid = bad->root_id;
7569         key.type = BTRFS_ROOT_ITEM_KEY;
7570         key.offset = (u64)-1;
7571
7572         root = btrfs_read_fs_root(root->fs_info, &key);
7573         if (IS_ERR(root)) {
7574                 fprintf(stderr, "Couldn't find owner root %llu\n",
7575                         key.objectid);
7576                 return PTR_ERR(root);
7577         }
7578
7579         path = btrfs_alloc_path();
7580         if (!path)
7581                 return -ENOMEM;
7582
7583         trans = btrfs_start_transaction(root, 1);
7584         if (IS_ERR(trans)) {
7585                 btrfs_free_path(path);
7586                 return PTR_ERR(trans);
7587         }
7588
7589         ret = btrfs_search_slot(trans, root, &bad->key, path, -1, 1);
7590         if (ret) {
7591                 if (ret > 0)
7592                         ret = 0;
7593                 goto out;
7594         }
7595         ret = btrfs_del_item(trans, root, path);
7596 out:
7597         btrfs_commit_transaction(trans, root);
7598         btrfs_free_path(path);
7599         return ret;
7600 }
7601
7602 static int zero_log_tree(struct btrfs_root *root)
7603 {
7604         struct btrfs_trans_handle *trans;
7605         int ret;
7606
7607         trans = btrfs_start_transaction(root, 1);
7608         if (IS_ERR(trans)) {
7609                 ret = PTR_ERR(trans);
7610                 return ret;
7611         }
7612         btrfs_set_super_log_root(root->fs_info->super_copy, 0);
7613         btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
7614         ret = btrfs_commit_transaction(trans, root);
7615         return ret;
7616 }
7617
7618 static int populate_csum(struct btrfs_trans_handle *trans,
7619                          struct btrfs_root *csum_root, char *buf, u64 start,
7620                          u64 len)
7621 {
7622         u64 offset = 0;
7623         u64 sectorsize;
7624         int ret = 0;
7625
7626         while (offset < len) {
7627                 sectorsize = csum_root->sectorsize;
7628                 ret = read_extent_data(csum_root, buf, start + offset,
7629                                        &sectorsize, 0);
7630                 if (ret)
7631                         break;
7632                 ret = btrfs_csum_file_block(trans, csum_root, start + len,
7633                                             start + offset, buf, sectorsize);
7634                 if (ret)
7635                         break;
7636                 offset += sectorsize;
7637         }
7638         return ret;
7639 }
7640
7641 static int fill_csum_tree(struct btrfs_trans_handle *trans,
7642                           struct btrfs_root *csum_root)
7643 {
7644         struct btrfs_root *extent_root = csum_root->fs_info->extent_root;
7645         struct btrfs_path *path;
7646         struct btrfs_extent_item *ei;
7647         struct extent_buffer *leaf;
7648         char *buf;
7649         struct btrfs_key key;
7650         int ret;
7651
7652         path = btrfs_alloc_path();
7653         if (!path)
7654                 return -ENOMEM;
7655
7656         key.objectid = 0;
7657         key.type = BTRFS_EXTENT_ITEM_KEY;
7658         key.offset = 0;
7659
7660         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
7661         if (ret < 0) {
7662                 btrfs_free_path(path);
7663                 return ret;
7664         }
7665
7666         buf = malloc(csum_root->sectorsize);
7667         if (!buf) {
7668                 btrfs_free_path(path);
7669                 return -ENOMEM;
7670         }
7671
7672         while (1) {
7673                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7674                         ret = btrfs_next_leaf(extent_root, path);
7675                         if (ret < 0)
7676                                 break;
7677                         if (ret) {
7678                                 ret = 0;
7679                                 break;
7680                         }
7681                 }
7682                 leaf = path->nodes[0];
7683
7684                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7685                 if (key.type != BTRFS_EXTENT_ITEM_KEY) {
7686                         path->slots[0]++;
7687                         continue;
7688                 }
7689
7690                 ei = btrfs_item_ptr(leaf, path->slots[0],
7691                                     struct btrfs_extent_item);
7692                 if (!(btrfs_extent_flags(leaf, ei) &
7693                       BTRFS_EXTENT_FLAG_DATA)) {
7694                         path->slots[0]++;
7695                         continue;
7696                 }
7697
7698                 ret = populate_csum(trans, csum_root, buf, key.objectid,
7699                                     key.offset);
7700                 if (ret)
7701                         break;
7702                 path->slots[0]++;
7703         }
7704
7705         btrfs_free_path(path);
7706         free(buf);
7707         return ret;
7708 }
7709
7710 struct root_item_info {
7711         /* level of the root */
7712         u8 level;
7713         /* number of nodes at this level, must be 1 for a root */
7714         int node_count;
7715         u64 bytenr;
7716         u64 gen;
7717         struct cache_extent cache_extent;
7718 };
7719
7720 static struct cache_tree *roots_info_cache = NULL;
7721
7722 static void free_roots_info_cache(void)
7723 {
7724         if (!roots_info_cache)
7725                 return;
7726
7727         while (!cache_tree_empty(roots_info_cache)) {
7728                 struct cache_extent *entry;
7729                 struct root_item_info *rii;
7730
7731                 entry = first_cache_extent(roots_info_cache);
7732                 remove_cache_extent(roots_info_cache, entry);
7733                 rii = container_of(entry, struct root_item_info, cache_extent);
7734                 free(rii);
7735         }
7736
7737         free(roots_info_cache);
7738         roots_info_cache = NULL;
7739 }
7740
7741 static int build_roots_info_cache(struct btrfs_fs_info *info)
7742 {
7743         int ret = 0;
7744         struct btrfs_key key;
7745         struct extent_buffer *leaf;
7746         struct btrfs_path *path;
7747
7748         if (!roots_info_cache) {
7749                 roots_info_cache = malloc(sizeof(*roots_info_cache));
7750                 if (!roots_info_cache)
7751                         return -ENOMEM;
7752                 cache_tree_init(roots_info_cache);
7753         }
7754
7755         path = btrfs_alloc_path();
7756         if (!path)
7757                 return -ENOMEM;
7758
7759         key.objectid = 0;
7760         key.type = BTRFS_EXTENT_ITEM_KEY;
7761         key.offset = 0;
7762
7763         ret = btrfs_search_slot(NULL, info->extent_root, &key, path, 0, 0);
7764         if (ret < 0)
7765                 goto out;
7766         leaf = path->nodes[0];
7767
7768         while (1) {
7769                 struct btrfs_key found_key;
7770                 struct btrfs_extent_item *ei;
7771                 struct btrfs_extent_inline_ref *iref;
7772                 int slot = path->slots[0];
7773                 int type;
7774                 u64 flags;
7775                 u64 root_id;
7776                 u8 level;
7777                 struct cache_extent *entry;
7778                 struct root_item_info *rii;
7779
7780                 if (slot >= btrfs_header_nritems(leaf)) {
7781                         ret = btrfs_next_leaf(info->extent_root, path);
7782                         if (ret < 0) {
7783                                 break;
7784                         } else if (ret) {
7785                                 ret = 0;
7786                                 break;
7787                         }
7788                         leaf = path->nodes[0];
7789                         slot = path->slots[0];
7790                 }
7791
7792                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7793
7794                 if (found_key.type != BTRFS_EXTENT_ITEM_KEY &&
7795                     found_key.type != BTRFS_METADATA_ITEM_KEY)
7796                         goto next;
7797
7798                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
7799                 flags = btrfs_extent_flags(leaf, ei);
7800
7801                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
7802                     !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
7803                         goto next;
7804
7805                 if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
7806                         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
7807                         level = found_key.offset;
7808                 } else {
7809                         struct btrfs_tree_block_info *info;
7810
7811                         info = (struct btrfs_tree_block_info *)(ei + 1);
7812                         iref = (struct btrfs_extent_inline_ref *)(info + 1);
7813                         level = btrfs_tree_block_level(leaf, info);
7814                 }
7815
7816                 /*
7817                  * For a root extent, it must be of the following type and the
7818                  * first (and only one) iref in the item.
7819                  */
7820                 type = btrfs_extent_inline_ref_type(leaf, iref);
7821                 if (type != BTRFS_TREE_BLOCK_REF_KEY)
7822                         goto next;
7823
7824                 root_id = btrfs_extent_inline_ref_offset(leaf, iref);
7825                 entry = lookup_cache_extent(roots_info_cache, root_id, 1);
7826                 if (!entry) {
7827                         rii = malloc(sizeof(struct root_item_info));
7828                         if (!rii) {
7829                                 ret = -ENOMEM;
7830                                 goto out;
7831                         }
7832                         rii->cache_extent.start = root_id;
7833                         rii->cache_extent.size = 1;
7834                         rii->level = (u8)-1;
7835                         entry = &rii->cache_extent;
7836                         ret = insert_cache_extent(roots_info_cache, entry);
7837                         ASSERT(ret == 0);
7838                 } else {
7839                         rii = container_of(entry, struct root_item_info,
7840                                            cache_extent);
7841                 }
7842
7843                 ASSERT(rii->cache_extent.start == root_id);
7844                 ASSERT(rii->cache_extent.size == 1);
7845
7846                 if (level > rii->level || rii->level == (u8)-1) {
7847                         rii->level = level;
7848                         rii->bytenr = found_key.objectid;
7849                         rii->gen = btrfs_extent_generation(leaf, ei);
7850                         rii->node_count = 1;
7851                 } else if (level == rii->level) {
7852                         rii->node_count++;
7853                 }
7854 next:
7855                 path->slots[0]++;
7856         }
7857
7858 out:
7859         btrfs_free_path(path);
7860
7861         return ret;
7862 }
7863
7864 static int maybe_repair_root_item(struct btrfs_fs_info *info,
7865                                   struct btrfs_path *path,
7866                                   const struct btrfs_key *root_key,
7867                                   const int read_only_mode)
7868 {
7869         const u64 root_id = root_key->objectid;
7870         struct cache_extent *entry;
7871         struct root_item_info *rii;
7872         struct btrfs_root_item ri;
7873         unsigned long offset;
7874
7875         entry = lookup_cache_extent(roots_info_cache, root_id, 1);
7876         if (!entry) {
7877                 fprintf(stderr,
7878                         "Error: could not find extent items for root %llu\n",
7879                         root_key->objectid);
7880                 return -ENOENT;
7881         }
7882
7883         rii = container_of(entry, struct root_item_info, cache_extent);
7884         ASSERT(rii->cache_extent.start == root_id);
7885         ASSERT(rii->cache_extent.size == 1);
7886
7887         if (rii->node_count != 1) {
7888                 fprintf(stderr,
7889                         "Error: could not find btree root extent for root %llu\n",
7890                         root_id);
7891                 return -ENOENT;
7892         }
7893
7894         offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
7895         read_extent_buffer(path->nodes[0], &ri, offset, sizeof(ri));
7896
7897         if (btrfs_root_bytenr(&ri) != rii->bytenr ||
7898             btrfs_root_level(&ri) != rii->level ||
7899             btrfs_root_generation(&ri) != rii->gen) {
7900
7901                 /*
7902                  * If we're in repair mode but our caller told us to not update
7903                  * the root item, i.e. just check if it needs to be updated, don't
7904                  * print this message, since the caller will call us again shortly
7905                  * for the same root item without read only mode (the caller will
7906                  * open a transaction first).
7907                  */
7908                 if (!(read_only_mode && repair))
7909                         fprintf(stderr,
7910                                 "%sroot item for root %llu,"
7911                                 " current bytenr %llu, current gen %llu, current level %u,"
7912                                 " new bytenr %llu, new gen %llu, new level %u\n",
7913                                 (read_only_mode ? "" : "fixing "),
7914                                 root_id,
7915                                 btrfs_root_bytenr(&ri), btrfs_root_generation(&ri),
7916                                 btrfs_root_level(&ri),
7917                                 rii->bytenr, rii->gen, rii->level);
7918
7919                 if (btrfs_root_generation(&ri) > rii->gen) {
7920                         fprintf(stderr,
7921                                 "root %llu has a root item with a more recent gen (%llu) compared to the found root node (%llu)\n",
7922                                 root_id, btrfs_root_generation(&ri), rii->gen);
7923                         return -EINVAL;
7924                 }
7925
7926                 if (!read_only_mode) {
7927                         btrfs_set_root_bytenr(&ri, rii->bytenr);
7928                         btrfs_set_root_level(&ri, rii->level);
7929                         btrfs_set_root_generation(&ri, rii->gen);
7930                         write_extent_buffer(path->nodes[0], &ri,
7931                                             offset, sizeof(ri));
7932                 }
7933
7934                 return 1;
7935         }
7936
7937         return 0;
7938 }
7939
7940 /*
7941  * A regression introduced in the 3.17 kernel (more specifically in 3.17-rc2),
7942  * caused read-only snapshots to be corrupted if they were created at a moment
7943  * when the source subvolume/snapshot had orphan items. The issue was that the
7944  * on-disk root items became incorrect, referring to the pre orphan cleanup root
7945  * node instead of the post orphan cleanup root node.
7946  * So this function, and its callees, just detects and fixes those cases. Even
7947  * though the regression was for read-only snapshots, this function applies to
7948  * any snapshot/subvolume root.
7949  * This must be run before any other repair code - not doing it so, makes other
7950  * repair code delete or modify backrefs in the extent tree for example, which
7951  * will result in an inconsistent fs after repairing the root items.
7952  */
7953 static int repair_root_items(struct btrfs_fs_info *info)
7954 {
7955         struct btrfs_path *path = NULL;
7956         struct btrfs_key key;
7957         struct extent_buffer *leaf;
7958         struct btrfs_trans_handle *trans = NULL;
7959         int ret = 0;
7960         int bad_roots = 0;
7961         int need_trans = 0;
7962
7963         ret = build_roots_info_cache(info);
7964         if (ret)
7965                 goto out;
7966
7967         path = btrfs_alloc_path();
7968         if (!path) {
7969                 ret = -ENOMEM;
7970                 goto out;
7971         }
7972
7973         key.objectid = BTRFS_FIRST_FREE_OBJECTID;
7974         key.type = BTRFS_ROOT_ITEM_KEY;
7975         key.offset = 0;
7976
7977 again:
7978         /*
7979          * Avoid opening and committing transactions if a leaf doesn't have
7980          * any root items that need to be fixed, so that we avoid rotating
7981          * backup roots unnecessarily.
7982          */
7983         if (need_trans) {
7984                 trans = btrfs_start_transaction(info->tree_root, 1);
7985                 if (IS_ERR(trans)) {
7986                         ret = PTR_ERR(trans);
7987                         goto out;
7988                 }
7989         }
7990
7991         ret = btrfs_search_slot(trans, info->tree_root, &key, path,
7992                                 0, trans ? 1 : 0);
7993         if (ret < 0)
7994                 goto out;
7995         leaf = path->nodes[0];
7996
7997         while (1) {
7998                 struct btrfs_key found_key;
7999
8000                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
8001                         int no_more_keys = find_next_key(path, &key);
8002
8003                         btrfs_release_path(path);
8004                         if (trans) {
8005                                 ret = btrfs_commit_transaction(trans,
8006                                                                info->tree_root);
8007                                 trans = NULL;
8008                                 if (ret < 0)
8009                                         goto out;
8010                         }
8011                         need_trans = 0;
8012                         if (no_more_keys)
8013                                 break;
8014                         goto again;
8015                 }
8016
8017                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8018
8019                 if (found_key.type != BTRFS_ROOT_ITEM_KEY)
8020                         goto next;
8021
8022                 ret = maybe_repair_root_item(info, path, &found_key,
8023                                              trans ? 0 : 1);
8024                 if (ret < 0)
8025                         goto out;
8026                 if (ret) {
8027                         if (!trans && repair) {
8028                                 need_trans = 1;
8029                                 key = found_key;
8030                                 btrfs_release_path(path);
8031                                 goto again;
8032                         }
8033                         bad_roots++;
8034                 }
8035 next:
8036                 path->slots[0]++;
8037         }
8038         ret = 0;
8039 out:
8040         free_roots_info_cache();
8041         if (path)
8042                 btrfs_free_path(path);
8043         if (ret < 0)
8044                 return ret;
8045
8046         return bad_roots;
8047 }
8048
/*
 * Long options accepted by "btrfs check".
 *
 * Entries with a non-zero val are reported by getopt_long() as that
 * character.  Entries with val 0 are identified by the caller through their
 * index in this table, so the order of the first entries must not change.
 * "backup" maps to 'b' so that --backup behaves like -b instead of being
 * silently ignored.
 */
static struct option long_options[] = {
	{ "super", required_argument, NULL, 's' },
	{ "repair", no_argument, NULL, 0 },
	{ "init-csum-tree", no_argument, NULL, 0 },
	{ "init-extent-tree", no_argument, NULL, 0 },
	{ "check-data-csum", no_argument, NULL, 0 },
	{ "backup", no_argument, NULL, 'b' },
	{ "subvol-extents", required_argument, NULL, 'E' },
	{ "qgroup-report", no_argument, NULL, 'Q' },
	{ "tree-root", required_argument, NULL, 'r' },
	{ NULL, 0, NULL, 0}
};
8061
/* Usage text for "btrfs check": synopsis, summary, blank line, options. */
const char * const cmd_check_usage[] = {
	"btrfs check [options] <device>",
	"Check an unmounted btrfs filesystem.",
	"",
	"-s|--super <superblock>     use this superblock copy",
	"-b|--backup                 use the backup root copy",
	"--repair                    try to repair the filesystem",
	"--init-csum-tree            create a new CRC tree",
	"--init-extent-tree          create a new extent tree",
	"--check-data-csum           verify checksums of data blocks",
	"--qgroup-report             print a report on qgroup consistency",
	"--subvol-extents <subvolid> print subvolume extents and sharing state",
	"-r|--tree-root <bytenr>     use the given bytenr for the tree root",
	NULL
};
8077
8078 int cmd_check(int argc, char **argv)
8079 {
8080         struct cache_tree root_cache;
8081         struct btrfs_root *root;
8082         struct btrfs_fs_info *info;
8083         u64 bytenr = 0;
8084         u64 subvolid = 0;
8085         u64 tree_root_bytenr = 0;
8086         char uuidbuf[BTRFS_UUID_UNPARSED_SIZE];
8087         int ret;
8088         u64 num;
8089         int option_index = 0;
8090         int init_csum_tree = 0;
8091         int qgroup_report = 0;
8092         enum btrfs_open_ctree_flags ctree_flags = OPEN_CTREE_EXCLUSIVE;
8093
8094         while(1) {
8095                 int c;
8096                 c = getopt_long(argc, argv, "as:br:", long_options,
8097                                 &option_index);
8098                 if (c < 0)
8099                         break;
8100                 switch(c) {
8101                         case 'a': /* ignored */ break;
8102                         case 'b':
8103                                 ctree_flags |= OPEN_CTREE_BACKUP_ROOT;
8104                                 break;
8105                         case 's':
8106                                 num = arg_strtou64(optarg);
8107                                 if (num >= BTRFS_SUPER_MIRROR_MAX) {
8108                                         fprintf(stderr,
8109                                                 "ERROR: super mirror should be less than: %d\n",
8110                                                 BTRFS_SUPER_MIRROR_MAX);
8111                                         exit(1);
8112                                 }
8113                                 bytenr = btrfs_sb_offset(((int)num));
8114                                 printf("using SB copy %llu, bytenr %llu\n", num,
8115                                        (unsigned long long)bytenr);
8116                                 break;
8117                         case 'Q':
8118                                 qgroup_report = 1;
8119                                 break;
8120                         case 'E':
8121                                 subvolid = arg_strtou64(optarg);
8122                                 break;
8123                         case 'r':
8124                                 tree_root_bytenr = arg_strtou64(optarg);
8125                                 break;
8126                         case '?':
8127                         case 'h':
8128                                 usage(cmd_check_usage);
8129                 }
8130                 if (option_index == 1) {
8131                         printf("enabling repair mode\n");
8132                         repair = 1;
8133                         ctree_flags |= OPEN_CTREE_WRITES;
8134                 } else if (option_index == 2) {
8135                         printf("Creating a new CRC tree\n");
8136                         init_csum_tree = 1;
8137                         repair = 1;
8138                         ctree_flags |= OPEN_CTREE_WRITES;
8139                 } else if (option_index == 3) {
8140                         init_extent_tree = 1;
8141                         ctree_flags |= (OPEN_CTREE_WRITES |
8142                                         OPEN_CTREE_NO_BLOCK_GROUPS);
8143                         repair = 1;
8144                 } else if (option_index == 4) {
8145                         check_data_csum = 1;
8146                 }
8147         }
8148         argc = argc - optind;
8149
8150         if (check_argc_exact(argc, 1))
8151                 usage(cmd_check_usage);
8152
8153         radix_tree_init();
8154         cache_tree_init(&root_cache);
8155
8156         if((ret = check_mounted(argv[optind])) < 0) {
8157                 fprintf(stderr, "Could not check mount status: %s\n", strerror(-ret));
8158                 goto err_out;
8159         } else if(ret) {
8160                 fprintf(stderr, "%s is currently mounted. Aborting.\n", argv[optind]);
8161                 ret = -EBUSY;
8162                 goto err_out;
8163         }
8164
8165         /* only allow partial opening under repair mode */
8166         if (repair)
8167                 ctree_flags |= OPEN_CTREE_PARTIAL;
8168
8169         info = open_ctree_fs_info(argv[optind], bytenr, tree_root_bytenr,
8170                                   ctree_flags);
8171         if (!info) {
8172                 fprintf(stderr, "Couldn't open file system\n");
8173                 ret = -EIO;
8174                 goto err_out;
8175         }
8176
8177         root = info->fs_root;
8178
8179         ret = repair_root_items(info);
8180         if (ret < 0)
8181                 goto close_out;
8182         if (repair) {
8183                 fprintf(stderr, "Fixed %d roots.\n", ret);
8184                 ret = 0;
8185         } else if (ret > 0) {
8186                 fprintf(stderr,
8187                        "Found %d roots with an outdated root item.\n",
8188                        ret);
8189                 fprintf(stderr,
8190                         "Please run a filesystem check with the option --repair to fix them.\n");
8191                 ret = 1;
8192                 goto close_out;
8193         }
8194
8195         /*
8196          * repair mode will force us to commit transaction which
8197          * will make us fail to load log tree when mounting.
8198          */
8199         if (repair && btrfs_super_log_root(info->super_copy)) {
8200                 ret = ask_user("repair mode will force to clear out log tree, Are you sure?");
8201                 if (!ret) {
8202                         ret = 1;
8203                         goto close_out;
8204                 }
8205                 ret = zero_log_tree(root);
8206                 if (ret) {
8207                         fprintf(stderr, "fail to zero log tree\n");
8208                         goto close_out;
8209                 }
8210         }
8211
8212         uuid_unparse(info->super_copy->fsid, uuidbuf);
8213         if (qgroup_report) {
8214                 printf("Print quota groups for %s\nUUID: %s\n", argv[optind],
8215                        uuidbuf);
8216                 ret = qgroup_verify_all(info);
8217                 if (ret == 0)
8218                         print_qgroup_report(1);
8219                 goto close_out;
8220         }
8221         if (subvolid) {
8222                 printf("Print extent state for subvolume %llu on %s\nUUID: %s\n",
8223                        subvolid, argv[optind], uuidbuf);
8224                 ret = print_extent_state(info, subvolid);
8225                 goto close_out;
8226         }
8227         printf("Checking filesystem on %s\nUUID: %s\n", argv[optind], uuidbuf);
8228
8229         if (!extent_buffer_uptodate(info->tree_root->node) ||
8230             !extent_buffer_uptodate(info->dev_root->node) ||
8231             !extent_buffer_uptodate(info->chunk_root->node)) {
8232                 fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
8233                 ret = -EIO;
8234                 goto close_out;
8235         }
8236
8237         if (init_extent_tree || init_csum_tree) {
8238                 struct btrfs_trans_handle *trans;
8239
8240                 trans = btrfs_start_transaction(info->extent_root, 0);
8241                 if (IS_ERR(trans)) {
8242                         fprintf(stderr, "Error starting transaction\n");
8243                         ret = PTR_ERR(trans);
8244                         goto close_out;
8245                 }
8246
8247                 if (init_extent_tree) {
8248                         printf("Creating a new extent tree\n");
8249                         ret = reinit_extent_tree(trans, info);
8250                         if (ret)
8251                                 goto close_out;
8252                 }
8253
8254                 if (init_csum_tree) {
8255                         fprintf(stderr, "Reinit crc root\n");
8256                         ret = btrfs_fsck_reinit_root(trans, info->csum_root, 0);
8257                         if (ret) {
8258                                 fprintf(stderr, "crc root initialization failed\n");
8259                                 ret = -EIO;
8260                                 goto close_out;
8261                         }
8262
8263                         ret = fill_csum_tree(trans, info->csum_root);
8264                         if (ret) {
8265                                 fprintf(stderr, "crc refilling failed\n");
8266                                 return -EIO;
8267                         }
8268                 }
8269                 /*
8270                  * Ok now we commit and run the normal fsck, which will add
8271                  * extent entries for all of the items it finds.
8272                  */
8273                 ret = btrfs_commit_transaction(trans, info->extent_root);
8274                 if (ret)
8275                         goto close_out;
8276         }
8277         if (!extent_buffer_uptodate(info->extent_root->node)) {
8278                 fprintf(stderr, "Critical roots corrupted, unable to fsck the FS\n");
8279                 ret = -EIO;
8280                 goto close_out;
8281         }
8282         if (!extent_buffer_uptodate(info->csum_root->node)) {
8283                 fprintf(stderr, "Checksum root corrupted, rerun with --init-csum-tree option\n");
8284                 ret = -EIO;
8285                 goto close_out;
8286         }
8287
8288         fprintf(stderr, "checking extents\n");
8289         ret = check_chunks_and_extents(root);
8290         if (ret)
8291                 fprintf(stderr, "Errors found in extent allocation tree or chunk allocation\n");
8292
8293         fprintf(stderr, "checking free space cache\n");
8294         ret = check_space_cache(root);
8295         if (ret)
8296                 goto out;
8297
8298         /*
8299          * We used to have to have these hole extents in between our real
8300          * extents so if we don't have this flag set we need to make sure there
8301          * are no gaps in the file extents for inodes, otherwise we can just
8302          * ignore it when this happens.
8303          */
8304         no_holes = btrfs_fs_incompat(root->fs_info,
8305                                      BTRFS_FEATURE_INCOMPAT_NO_HOLES);
8306         fprintf(stderr, "checking fs roots\n");
8307         ret = check_fs_roots(root, &root_cache);
8308         if (ret)
8309                 goto out;
8310
8311         fprintf(stderr, "checking csums\n");
8312         ret = check_csums(root);
8313         if (ret)
8314                 goto out;
8315
8316         fprintf(stderr, "checking root refs\n");
8317         ret = check_root_refs(root, &root_cache);
8318         if (ret)
8319                 goto out;
8320
8321         while (repair && !list_empty(&root->fs_info->recow_ebs)) {
8322                 struct extent_buffer *eb;
8323
8324                 eb = list_first_entry(&root->fs_info->recow_ebs,
8325                                       struct extent_buffer, recow);
8326                 list_del_init(&eb->recow);
8327                 ret = recow_extent_buffer(root, eb);
8328                 if (ret)
8329                         break;
8330         }
8331
8332         while (!list_empty(&delete_items)) {
8333                 struct bad_item *bad;
8334
8335                 bad = list_first_entry(&delete_items, struct bad_item, list);
8336                 list_del_init(&bad->list);
8337                 if (repair)
8338                         ret = delete_bad_item(root, bad);
8339                 free(bad);
8340         }
8341
8342         if (info->quota_enabled) {
8343                 int err;
8344                 fprintf(stderr, "checking quota groups\n");
8345                 err = qgroup_verify_all(info);
8346                 if (err)
8347                         goto out;
8348         }
8349
8350         if (!list_empty(&root->fs_info->recow_ebs)) {
8351                 fprintf(stderr, "Transid errors in file system\n");
8352                 ret = 1;
8353         }
8354 out:
8355         print_qgroup_report(0);
8356         if (found_old_backref) { /*
8357                  * there was a disk format change when mixed
8358                  * backref was in testing tree. The old format
8359                  * existed about one week.
8360                  */
8361                 printf("\n * Found old mixed backref format. "
8362                        "The old format is not supported! *"
8363                        "\n * Please mount the FS in readonly mode, "
8364                        "backup data and re-format the FS. *\n\n");
8365                 ret = 1;
8366         }
8367         printf("found %llu bytes used err is %d\n",
8368                (unsigned long long)bytes_used, ret);
8369         printf("total csum bytes: %llu\n",(unsigned long long)total_csum_bytes);
8370         printf("total tree bytes: %llu\n",
8371                (unsigned long long)total_btree_bytes);
8372         printf("total fs tree bytes: %llu\n",
8373                (unsigned long long)total_fs_tree_bytes);
8374         printf("total extent tree bytes: %llu\n",
8375                (unsigned long long)total_extent_tree_bytes);
8376         printf("btree space waste bytes: %llu\n",
8377                (unsigned long long)btree_space_waste);
8378         printf("file data blocks allocated: %llu\n referenced %llu\n",
8379                 (unsigned long long)data_bytes_allocated,
8380                 (unsigned long long)data_bytes_referenced);
8381         printf("%s\n", BTRFS_BUILD_VERSION);
8382
8383         free_root_recs_tree(&root_cache);
8384 close_out:
8385         close_ctree(root);
8386 err_out:
8387         return ret;
8388 }