btrfs-progs: convert: Fix offset-by-one error in read_data_extent()
[platform/upstream/btrfs-progs.git] / image / main.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18
19 #include <pthread.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <fcntl.h>
25 #include <unistd.h>
26 #include <dirent.h>
27 #include <zlib.h>
28 #include <getopt.h>
29
30 #include "kerncompat.h"
31 #include "crc32c.h"
32 #include "ctree.h"
33 #include "disk-io.h"
34 #include "transaction.h"
35 #include "utils.h"
36 #include "volumes.h"
37 #include "extent_io.h"
38 #include "help.h"
39 #include "image/metadump.h"
40 #include "image/sanitize.h"
41
/* Upper bound on worker threads; sizes the threads[] arrays in this file. */
#define MAX_WORKER_THREADS      (32)
43
44 struct async_work {
45         struct list_head list;
46         struct list_head ordered;
47         u64 start;
48         u64 size;
49         u8 *buffer;
50         size_t bufsize;
51         int error;
52 };
53
54 struct metadump_struct {
55         struct btrfs_root *root;
56         FILE *out;
57
58         union {
59                 struct meta_cluster cluster;
60                 char meta_cluster_bytes[BLOCK_SIZE];
61         };
62
63         pthread_t threads[MAX_WORKER_THREADS];
64         size_t num_threads;
65         pthread_mutex_t mutex;
66         pthread_cond_t cond;
67         struct rb_root name_tree;
68
69         struct list_head list;
70         struct list_head ordered;
71         size_t num_items;
72         size_t num_ready;
73
74         u64 pending_start;
75         u64 pending_size;
76
77         int compress_level;
78         int done;
79         int data;
80         enum sanitize_mode sanitize_names;
81
82         int error;
83 };
84
85 struct mdrestore_struct {
86         FILE *in;
87         FILE *out;
88
89         pthread_t threads[MAX_WORKER_THREADS];
90         size_t num_threads;
91         pthread_mutex_t mutex;
92         pthread_cond_t cond;
93
94         struct rb_root chunk_tree;
95         struct rb_root physical_tree;
96         struct list_head list;
97         struct list_head overlapping_chunks;
98         size_t num_items;
99         u32 nodesize;
100         u64 devid;
101         u64 alloced_chunks;
102         u64 last_physical_offset;
103         u8 uuid[BTRFS_UUID_SIZE];
104         u8 fsid[BTRFS_FSID_SIZE];
105
106         int compress_method;
107         int done;
108         int error;
109         int old_restore;
110         int fixup_offset;
111         int multi_devices;
112         int clear_space_cache;
113         struct btrfs_fs_info *info;
114 };
115
116 static int search_for_chunk_blocks(struct mdrestore_struct *mdres,
117                                    u64 search, u64 cluster_bytenr);
118 static struct extent_buffer *alloc_dummy_eb(u64 bytenr, u32 size);
119
120 static void csum_block(u8 *buf, size_t len)
121 {
122         u8 result[btrfs_csum_sizes[BTRFS_CSUM_TYPE_CRC32]];
123         u32 crc = ~(u32)0;
124         crc = crc32c(crc, buf + BTRFS_CSUM_SIZE, len - BTRFS_CSUM_SIZE);
125         btrfs_csum_final(crc, result);
126         memcpy(buf, result, btrfs_csum_sizes[BTRFS_CSUM_TYPE_CRC32]);
127 }
128
129 static int has_name(struct btrfs_key *key)
130 {
131         switch (key->type) {
132         case BTRFS_DIR_ITEM_KEY:
133         case BTRFS_DIR_INDEX_KEY:
134         case BTRFS_INODE_REF_KEY:
135         case BTRFS_INODE_EXTREF_KEY:
136         case BTRFS_XATTR_ITEM_KEY:
137                 return 1;
138         default:
139                 break;
140         }
141
142         return 0;
143 }
144
145 static int chunk_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
146 {
147         struct fs_chunk *entry = rb_entry(a, struct fs_chunk, l);
148         struct fs_chunk *ins = rb_entry(b, struct fs_chunk, l);
149
150         if (fuzz && ins->logical >= entry->logical &&
151             ins->logical < entry->logical + entry->bytes)
152                 return 0;
153
154         if (ins->logical < entry->logical)
155                 return -1;
156         else if (ins->logical > entry->logical)
157                 return 1;
158         return 0;
159 }
160
161 static int physical_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
162 {
163         struct fs_chunk *entry = rb_entry(a, struct fs_chunk, p);
164         struct fs_chunk *ins = rb_entry(b, struct fs_chunk, p);
165
166         if (fuzz && ins->physical >= entry->physical &&
167             ins->physical < entry->physical + entry->bytes)
168                 return 0;
169
170         if (fuzz && entry->physical >= ins->physical &&
171             entry->physical < ins->physical + ins->bytes)
172                 return 0;
173
174         if (ins->physical < entry->physical)
175                 return -1;
176         else if (ins->physical > entry->physical)
177                 return 1;
178         return 0;
179 }
180
181 static void tree_insert(struct rb_root *root, struct rb_node *ins,
182                         int (*cmp)(struct rb_node *a, struct rb_node *b,
183                                    int fuzz))
184 {
185         struct rb_node ** p = &root->rb_node;
186         struct rb_node * parent = NULL;
187         int dir;
188
189         while(*p) {
190                 parent = *p;
191
192                 dir = cmp(*p, ins, 1);
193                 if (dir < 0)
194                         p = &(*p)->rb_left;
195                 else if (dir > 0)
196                         p = &(*p)->rb_right;
197                 else
198                         BUG();
199         }
200
201         rb_link_node(ins, parent, p);
202         rb_insert_color(ins, root);
203 }
204
205 static struct rb_node *tree_search(struct rb_root *root,
206                                    struct rb_node *search,
207                                    int (*cmp)(struct rb_node *a,
208                                               struct rb_node *b, int fuzz),
209                                    int fuzz)
210 {
211         struct rb_node *n = root->rb_node;
212         int dir;
213
214         while (n) {
215                 dir = cmp(n, search, fuzz);
216                 if (dir < 0)
217                         n = n->rb_left;
218                 else if (dir > 0)
219                         n = n->rb_right;
220                 else
221                         return n;
222         }
223
224         return NULL;
225 }
226
227 static u64 logical_to_physical(struct mdrestore_struct *mdres, u64 logical,
228                                u64 *size, u64 *physical_dup)
229 {
230         struct fs_chunk *fs_chunk;
231         struct rb_node *entry;
232         struct fs_chunk search;
233         u64 offset;
234
235         if (logical == BTRFS_SUPER_INFO_OFFSET)
236                 return logical;
237
238         search.logical = logical;
239         entry = tree_search(&mdres->chunk_tree, &search.l, chunk_cmp, 1);
240         if (!entry) {
241                 if (mdres->in != stdin)
242                         warning("cannot find a chunk, using logical");
243                 return logical;
244         }
245         fs_chunk = rb_entry(entry, struct fs_chunk, l);
246         if (fs_chunk->logical > logical || fs_chunk->logical + fs_chunk->bytes < logical)
247                 BUG();
248         offset = search.logical - fs_chunk->logical;
249
250         if (physical_dup) {
251                 /* Only in dup case, physical_dup is not equal to 0 */
252                 if (fs_chunk->physical_dup)
253                         *physical_dup = fs_chunk->physical_dup + offset;
254                 else
255                         *physical_dup = 0;
256         }
257
258         *size = min(*size, fs_chunk->bytes + fs_chunk->logical - logical);
259         return fs_chunk->physical + offset;
260 }
261
262 /*
263  * zero inline extents and csum items
264  */
265 static void zero_items(struct metadump_struct *md, u8 *dst,
266                        struct extent_buffer *src)
267 {
268         struct btrfs_file_extent_item *fi;
269         struct btrfs_item *item;
270         struct btrfs_key key;
271         u32 nritems = btrfs_header_nritems(src);
272         size_t size;
273         unsigned long ptr;
274         int i, extent_type;
275
276         for (i = 0; i < nritems; i++) {
277                 item = btrfs_item_nr(i);
278                 btrfs_item_key_to_cpu(src, &key, i);
279                 if (key.type == BTRFS_CSUM_ITEM_KEY) {
280                         size = btrfs_item_size_nr(src, i);
281                         memset(dst + btrfs_leaf_data(src) +
282                                btrfs_item_offset_nr(src, i), 0, size);
283                         continue;
284                 }
285
286                 if (md->sanitize_names && has_name(&key)) {
287                         sanitize_name(md->sanitize_names, &md->name_tree, dst,
288                                         src, &key, i);
289                         continue;
290                 }
291
292                 if (key.type != BTRFS_EXTENT_DATA_KEY)
293                         continue;
294
295                 fi = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
296                 extent_type = btrfs_file_extent_type(src, fi);
297                 if (extent_type != BTRFS_FILE_EXTENT_INLINE)
298                         continue;
299
300                 ptr = btrfs_file_extent_inline_start(fi);
301                 size = btrfs_file_extent_inline_item_len(src, item);
302                 memset(dst + ptr, 0, size);
303         }
304 }
305
306 /*
307  * copy buffer and zero useless data in the buffer
308  */
309 static void copy_buffer(struct metadump_struct *md, u8 *dst,
310                         struct extent_buffer *src)
311 {
312         int level;
313         size_t size;
314         u32 nritems;
315
316         memcpy(dst, src->data, src->len);
317         if (src->start == BTRFS_SUPER_INFO_OFFSET)
318                 return;
319
320         level = btrfs_header_level(src);
321         nritems = btrfs_header_nritems(src);
322
323         if (nritems == 0) {
324                 size = sizeof(struct btrfs_header);
325                 memset(dst + size, 0, src->len - size);
326         } else if (level == 0) {
327                 size = btrfs_leaf_data(src) +
328                         btrfs_item_offset_nr(src, nritems - 1) -
329                         btrfs_item_nr_offset(nritems);
330                 memset(dst + btrfs_item_nr_offset(nritems), 0, size);
331                 zero_items(md, dst, src);
332         } else {
333                 size = offsetof(struct btrfs_node, ptrs) +
334                         sizeof(struct btrfs_key_ptr) * nritems;
335                 memset(dst + size, 0, src->len - size);
336         }
337         csum_block(dst, src->len);
338 }
339
340 static void *dump_worker(void *data)
341 {
342         struct metadump_struct *md = (struct metadump_struct *)data;
343         struct async_work *async;
344         int ret;
345
346         while (1) {
347                 pthread_mutex_lock(&md->mutex);
348                 while (list_empty(&md->list)) {
349                         if (md->done) {
350                                 pthread_mutex_unlock(&md->mutex);
351                                 goto out;
352                         }
353                         pthread_cond_wait(&md->cond, &md->mutex);
354                 }
355                 async = list_entry(md->list.next, struct async_work, list);
356                 list_del_init(&async->list);
357                 pthread_mutex_unlock(&md->mutex);
358
359                 if (md->compress_level > 0) {
360                         u8 *orig = async->buffer;
361
362                         async->bufsize = compressBound(async->size);
363                         async->buffer = malloc(async->bufsize);
364                         if (!async->buffer) {
365                                 error("not enough memory for async buffer");
366                                 pthread_mutex_lock(&md->mutex);
367                                 if (!md->error)
368                                         md->error = -ENOMEM;
369                                 pthread_mutex_unlock(&md->mutex);
370                                 pthread_exit(NULL);
371                         }
372
373                         ret = compress2(async->buffer,
374                                          (unsigned long *)&async->bufsize,
375                                          orig, async->size, md->compress_level);
376
377                         if (ret != Z_OK)
378                                 async->error = 1;
379
380                         free(orig);
381                 }
382
383                 pthread_mutex_lock(&md->mutex);
384                 md->num_ready++;
385                 pthread_mutex_unlock(&md->mutex);
386         }
387 out:
388         pthread_exit(NULL);
389 }
390
391 static void meta_cluster_init(struct metadump_struct *md, u64 start)
392 {
393         struct meta_cluster_header *header;
394
395         md->num_items = 0;
396         md->num_ready = 0;
397         header = &md->cluster.header;
398         header->magic = cpu_to_le64(HEADER_MAGIC);
399         header->bytenr = cpu_to_le64(start);
400         header->nritems = cpu_to_le32(0);
401         header->compress = md->compress_level > 0 ?
402                            COMPRESS_ZLIB : COMPRESS_NONE;
403 }
404
405 static void metadump_destroy(struct metadump_struct *md, int num_threads)
406 {
407         int i;
408         struct rb_node *n;
409
410         pthread_mutex_lock(&md->mutex);
411         md->done = 1;
412         pthread_cond_broadcast(&md->cond);
413         pthread_mutex_unlock(&md->mutex);
414
415         for (i = 0; i < num_threads; i++)
416                 pthread_join(md->threads[i], NULL);
417
418         pthread_cond_destroy(&md->cond);
419         pthread_mutex_destroy(&md->mutex);
420
421         while ((n = rb_first(&md->name_tree))) {
422                 struct name *name;
423
424                 name = rb_entry(n, struct name, n);
425                 rb_erase(n, &md->name_tree);
426                 free(name->val);
427                 free(name->sub);
428                 free(name);
429         }
430 }
431
432 static int metadump_init(struct metadump_struct *md, struct btrfs_root *root,
433                          FILE *out, int num_threads, int compress_level,
434                          enum sanitize_mode sanitize_names)
435 {
436         int i, ret = 0;
437
438         memset(md, 0, sizeof(*md));
439         INIT_LIST_HEAD(&md->list);
440         INIT_LIST_HEAD(&md->ordered);
441         md->root = root;
442         md->out = out;
443         md->pending_start = (u64)-1;
444         md->compress_level = compress_level;
445         md->sanitize_names = sanitize_names;
446         if (sanitize_names == SANITIZE_COLLISIONS)
447                 crc32c_optimization_init();
448
449         md->name_tree.rb_node = NULL;
450         md->num_threads = num_threads;
451         pthread_cond_init(&md->cond, NULL);
452         pthread_mutex_init(&md->mutex, NULL);
453         meta_cluster_init(md, 0);
454
455         if (!num_threads)
456                 return 0;
457
458         for (i = 0; i < num_threads; i++) {
459                 ret = pthread_create(md->threads + i, NULL, dump_worker, md);
460                 if (ret)
461                         break;
462         }
463
464         if (ret)
465                 metadump_destroy(md, i + 1);
466
467         return ret;
468 }
469
470 static int write_zero(FILE *out, size_t size)
471 {
472         static char zero[BLOCK_SIZE];
473         return fwrite(zero, size, 1, out);
474 }
475
476 static int write_buffers(struct metadump_struct *md, u64 *next)
477 {
478         struct meta_cluster_header *header = &md->cluster.header;
479         struct meta_cluster_item *item;
480         struct async_work *async;
481         u64 bytenr = 0;
482         u32 nritems = 0;
483         int ret;
484         int err = 0;
485
486         if (list_empty(&md->ordered))
487                 goto out;
488
489         /* wait until all buffers are compressed */
490         while (!err && md->num_items > md->num_ready) {
491                 struct timespec ts = {
492                         .tv_sec = 0,
493                         .tv_nsec = 10000000,
494                 };
495                 pthread_mutex_unlock(&md->mutex);
496                 nanosleep(&ts, NULL);
497                 pthread_mutex_lock(&md->mutex);
498                 err = md->error;
499         }
500
501         if (err) {
502                 error("one of the threads failed: %s", strerror(-err));
503                 goto out;
504         }
505
506         /* setup and write index block */
507         list_for_each_entry(async, &md->ordered, ordered) {
508                 item = &md->cluster.items[nritems];
509                 item->bytenr = cpu_to_le64(async->start);
510                 item->size = cpu_to_le32(async->bufsize);
511                 nritems++;
512         }
513         header->nritems = cpu_to_le32(nritems);
514
515         ret = fwrite(&md->cluster, BLOCK_SIZE, 1, md->out);
516         if (ret != 1) {
517                 error("unable to write out cluster: %m");
518                 return -errno;
519         }
520
521         /* write buffers */
522         bytenr += le64_to_cpu(header->bytenr) + BLOCK_SIZE;
523         while (!list_empty(&md->ordered)) {
524                 async = list_entry(md->ordered.next, struct async_work,
525                                    ordered);
526                 list_del_init(&async->ordered);
527
528                 bytenr += async->bufsize;
529                 if (!err)
530                         ret = fwrite(async->buffer, async->bufsize, 1,
531                                      md->out);
532                 if (ret != 1) {
533                         error("unable to write out cluster: %m");
534                         err = -errno;
535                         ret = 0;
536                 }
537
538                 free(async->buffer);
539                 free(async);
540         }
541
542         /* zero unused space in the last block */
543         if (!err && bytenr & BLOCK_MASK) {
544                 size_t size = BLOCK_SIZE - (bytenr & BLOCK_MASK);
545
546                 bytenr += size;
547                 ret = write_zero(md->out, size);
548                 if (ret != 1) {
549                         error("unable to zero out buffer: %m");
550                         err = -errno;
551                 }
552         }
553 out:
554         *next = bytenr;
555         return err;
556 }
557
558 static int read_data_extent(struct metadump_struct *md,
559                             struct async_work *async)
560 {
561         struct btrfs_root *root = md->root;
562         struct btrfs_fs_info *fs_info = root->fs_info;
563         u64 bytes_left = async->size;
564         u64 logical = async->start;
565         u64 offset = 0;
566         u64 read_len;
567         int num_copies;
568         int cur_mirror;
569         int ret;
570
571         num_copies = btrfs_num_copies(root->fs_info, logical, bytes_left);
572
573         /* Try our best to read data, just like read_tree_block() */
574         for (cur_mirror = 1; cur_mirror <= num_copies; cur_mirror++) {
575                 while (bytes_left) {
576                         read_len = bytes_left;
577                         ret = read_extent_data(fs_info,
578                                         (char *)(async->buffer + offset),
579                                         logical, &read_len, cur_mirror);
580                         if (ret < 0)
581                                 break;
582                         offset += read_len;
583                         logical += read_len;
584                         bytes_left -= read_len;
585                 }
586         }
587         if (bytes_left)
588                 return -EIO;
589         return 0;
590 }
591
592 static int get_dev_fd(struct btrfs_root *root)
593 {
594         struct btrfs_device *dev;
595
596         dev = list_first_entry(&root->fs_info->fs_devices->devices,
597                                struct btrfs_device, dev_list);
598         return dev->fd;
599 }
600
601 static int flush_pending(struct metadump_struct *md, int done)
602 {
603         struct async_work *async = NULL;
604         struct extent_buffer *eb;
605         u64 start = 0;
606         u64 size;
607         size_t offset;
608         int ret = 0;
609
610         if (md->pending_size) {
611                 async = calloc(1, sizeof(*async));
612                 if (!async)
613                         return -ENOMEM;
614
615                 async->start = md->pending_start;
616                 async->size = md->pending_size;
617                 async->bufsize = async->size;
618                 async->buffer = malloc(async->bufsize);
619                 if (!async->buffer) {
620                         free(async);
621                         return -ENOMEM;
622                 }
623                 offset = 0;
624                 start = async->start;
625                 size = async->size;
626
627                 if (md->data) {
628                         ret = read_data_extent(md, async);
629                         if (ret) {
630                                 free(async->buffer);
631                                 free(async);
632                                 return ret;
633                         }
634                 }
635
636                 /*
637                  * Balance can make the mapping not cover the super block, so
638                  * just copy directly from one of the devices.
639                  */
640                 if (start == BTRFS_SUPER_INFO_OFFSET) {
641                         int fd = get_dev_fd(md->root);
642
643                         ret = pread64(fd, async->buffer, size, start);
644                         if (ret < size) {
645                                 free(async->buffer);
646                                 free(async);
647                                 error("unable to read superblock at %llu: %m",
648                                                 (unsigned long long)start);
649                                 return -errno;
650                         }
651                         size = 0;
652                         ret = 0;
653                 }
654
655                 while (!md->data && size > 0) {
656                         u64 this_read = min((u64)md->root->fs_info->nodesize,
657                                         size);
658
659                         eb = read_tree_block(md->root->fs_info, start, 0);
660                         if (!extent_buffer_uptodate(eb)) {
661                                 free(async->buffer);
662                                 free(async);
663                                 error("unable to read metadata block %llu",
664                                         (unsigned long long)start);
665                                 return -EIO;
666                         }
667                         copy_buffer(md, async->buffer + offset, eb);
668                         free_extent_buffer(eb);
669                         start += this_read;
670                         offset += this_read;
671                         size -= this_read;
672                 }
673
674                 md->pending_start = (u64)-1;
675                 md->pending_size = 0;
676         } else if (!done) {
677                 return 0;
678         }
679
680         pthread_mutex_lock(&md->mutex);
681         if (async) {
682                 list_add_tail(&async->ordered, &md->ordered);
683                 md->num_items++;
684                 if (md->compress_level > 0) {
685                         list_add_tail(&async->list, &md->list);
686                         pthread_cond_signal(&md->cond);
687                 } else {
688                         md->num_ready++;
689                 }
690         }
691         if (md->num_items >= ITEMS_PER_CLUSTER || done) {
692                 ret = write_buffers(md, &start);
693                 if (ret)
694                         error("unable to write buffers: %s", strerror(-ret));
695                 else
696                         meta_cluster_init(md, start);
697         }
698         pthread_mutex_unlock(&md->mutex);
699         return ret;
700 }
701
702 static int add_extent(u64 start, u64 size, struct metadump_struct *md,
703                       int data)
704 {
705         int ret;
706         if (md->data != data ||
707             md->pending_size + size > MAX_PENDING_SIZE ||
708             md->pending_start + md->pending_size != start) {
709                 ret = flush_pending(md, 0);
710                 if (ret)
711                         return ret;
712                 md->pending_start = start;
713         }
714         readahead_tree_block(md->root->fs_info, start, 0);
715         md->pending_size += size;
716         md->data = data;
717         return 0;
718 }
719
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Walk the items that follow the current slot of @path, looking for the
 * first v0 extent ref that belongs to @bytenr.
 *
 * Returns 1 if that ref has an objectid below BTRFS_FIRST_FREE_OBJECTID,
 * 0 if it does not or if no such ref is found, and a negative value if
 * moving to the next leaf fails.
 */
static int is_tree_block(struct btrfs_root *extent_root,
                         struct btrfs_path *path, u64 bytenr)
{
        struct extent_buffer *leaf = path->nodes[0];

        for (;;) {
                struct btrfs_extent_ref_v0 *ref;
                struct btrfs_key key;
                u64 owner;
                int ret;

                path->slots[0]++;
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                return ret;
                        if (ret > 0)
                                return 0;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                /* past the items of this extent */
                if (key.objectid != bytenr)
                        return 0;
                if (key.type != BTRFS_EXTENT_REF_V0_KEY)
                        continue;

                /* only the first ref is looked at */
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_ref_v0);
                owner = btrfs_ref_objectid_v0(leaf, ref);
                return owner < BTRFS_FIRST_FREE_OBJECTID ? 1 : 0;
        }
}
#endif
756
757 static int copy_tree_blocks(struct btrfs_root *root, struct extent_buffer *eb,
758                             struct metadump_struct *metadump, int root_tree)
759 {
760         struct extent_buffer *tmp;
761         struct btrfs_root_item *ri;
762         struct btrfs_key key;
763         struct btrfs_fs_info *fs_info = root->fs_info;
764         u64 bytenr;
765         int level;
766         int nritems = 0;
767         int i = 0;
768         int ret;
769
770         ret = add_extent(btrfs_header_bytenr(eb), fs_info->nodesize,
771                          metadump, 0);
772         if (ret) {
773                 error("unable to add metadata block %llu: %d",
774                                 btrfs_header_bytenr(eb), ret);
775                 return ret;
776         }
777
778         if (btrfs_header_level(eb) == 0 && !root_tree)
779                 return 0;
780
781         level = btrfs_header_level(eb);
782         nritems = btrfs_header_nritems(eb);
783         for (i = 0; i < nritems; i++) {
784                 if (level == 0) {
785                         btrfs_item_key_to_cpu(eb, &key, i);
786                         if (key.type != BTRFS_ROOT_ITEM_KEY)
787                                 continue;
788                         ri = btrfs_item_ptr(eb, i, struct btrfs_root_item);
789                         bytenr = btrfs_disk_root_bytenr(eb, ri);
790                         tmp = read_tree_block(fs_info, bytenr, 0);
791                         if (!extent_buffer_uptodate(tmp)) {
792                                 error("unable to read log root block");
793                                 return -EIO;
794                         }
795                         ret = copy_tree_blocks(root, tmp, metadump, 0);
796                         free_extent_buffer(tmp);
797                         if (ret)
798                                 return ret;
799                 } else {
800                         bytenr = btrfs_node_blockptr(eb, i);
801                         tmp = read_tree_block(fs_info, bytenr, 0);
802                         if (!extent_buffer_uptodate(tmp)) {
803                                 error("unable to read log root block");
804                                 return -EIO;
805                         }
806                         ret = copy_tree_blocks(root, tmp, metadump, root_tree);
807                         free_extent_buffer(tmp);
808                         if (ret)
809                                 return ret;
810                 }
811         }
812
813         return 0;
814 }
815
816 static int copy_log_trees(struct btrfs_root *root,
817                           struct metadump_struct *metadump)
818 {
819         u64 blocknr = btrfs_super_log_root(root->fs_info->super_copy);
820
821         if (blocknr == 0)
822                 return 0;
823
824         if (!root->fs_info->log_root_tree ||
825             !root->fs_info->log_root_tree->node) {
826                 error("unable to copy tree log, it has not been setup");
827                 return -EIO;
828         }
829
830         return copy_tree_blocks(root, root->fs_info->log_root_tree->node,
831                                 metadump, 1);
832 }
833
834 static int copy_space_cache(struct btrfs_root *root,
835                             struct metadump_struct *metadump,
836                             struct btrfs_path *path)
837 {
838         struct extent_buffer *leaf;
839         struct btrfs_file_extent_item *fi;
840         struct btrfs_key key;
841         u64 bytenr, num_bytes;
842         int ret;
843
844         root = root->fs_info->tree_root;
845
846         key.objectid = 0;
847         key.type = BTRFS_EXTENT_DATA_KEY;
848         key.offset = 0;
849
850         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
851         if (ret < 0) {
852                 error("free space inode not found: %d", ret);
853                 return ret;
854         }
855
856         leaf = path->nodes[0];
857
858         while (1) {
859                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
860                         ret = btrfs_next_leaf(root, path);
861                         if (ret < 0) {
862                                 error("cannot go to next leaf %d", ret);
863                                 return ret;
864                         }
865                         if (ret > 0)
866                                 break;
867                         leaf = path->nodes[0];
868                 }
869
870                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
871                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
872                         path->slots[0]++;
873                         continue;
874                 }
875
876                 fi = btrfs_item_ptr(leaf, path->slots[0],
877                                     struct btrfs_file_extent_item);
878                 if (btrfs_file_extent_type(leaf, fi) !=
879                     BTRFS_FILE_EXTENT_REG) {
880                         path->slots[0]++;
881                         continue;
882                 }
883
884                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
885                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
886                 ret = add_extent(bytenr, num_bytes, metadump, 1);
887                 if (ret) {
888                         error("unable to add space cache blocks %d", ret);
889                         btrfs_release_path(path);
890                         return ret;
891                 }
892                 path->slots[0]++;
893         }
894
895         return 0;
896 }
897
898 static int copy_from_extent_tree(struct metadump_struct *metadump,
899                                  struct btrfs_path *path)
900 {
901         struct btrfs_root *extent_root;
902         struct extent_buffer *leaf;
903         struct btrfs_extent_item *ei;
904         struct btrfs_key key;
905         u64 bytenr;
906         u64 num_bytes;
907         int ret;
908
909         extent_root = metadump->root->fs_info->extent_root;
910         bytenr = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
911         key.objectid = bytenr;
912         key.type = BTRFS_EXTENT_ITEM_KEY;
913         key.offset = 0;
914
915         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
916         if (ret < 0) {
917                 error("extent root not found: %d", ret);
918                 return ret;
919         }
920         ret = 0;
921
922         leaf = path->nodes[0];
923
924         while (1) {
925                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
926                         ret = btrfs_next_leaf(extent_root, path);
927                         if (ret < 0) {
928                                 error("cannot go to next leaf %d", ret);
929                                 break;
930                         }
931                         if (ret > 0) {
932                                 ret = 0;
933                                 break;
934                         }
935                         leaf = path->nodes[0];
936                 }
937
938                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
939                 if (key.objectid < bytenr ||
940                     (key.type != BTRFS_EXTENT_ITEM_KEY &&
941                      key.type != BTRFS_METADATA_ITEM_KEY)) {
942                         path->slots[0]++;
943                         continue;
944                 }
945
946                 bytenr = key.objectid;
947                 if (key.type == BTRFS_METADATA_ITEM_KEY) {
948                         num_bytes = extent_root->fs_info->nodesize;
949                 } else {
950                         num_bytes = key.offset;
951                 }
952
953                 if (num_bytes == 0) {
954                         error("extent length 0 at bytenr %llu key type %d",
955                                         (unsigned long long)bytenr, key.type);
956                         ret = -EIO;
957                         break;
958                 }
959
960                 if (btrfs_item_size_nr(leaf, path->slots[0]) > sizeof(*ei)) {
961                         ei = btrfs_item_ptr(leaf, path->slots[0],
962                                             struct btrfs_extent_item);
963                         if (btrfs_extent_flags(leaf, ei) &
964                             BTRFS_EXTENT_FLAG_TREE_BLOCK) {
965                                 ret = add_extent(bytenr, num_bytes, metadump,
966                                                  0);
967                                 if (ret) {
968                                         error("unable to add block %llu: %d",
969                                                 (unsigned long long)bytenr, ret);
970                                         break;
971                                 }
972                         }
973                 } else {
974 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
975                         ret = is_tree_block(extent_root, path, bytenr);
976                         if (ret < 0) {
977                                 error("failed to check tree block %llu: %d",
978                                         (unsigned long long)bytenr, ret);
979                                 break;
980                         }
981
982                         if (ret) {
983                                 ret = add_extent(bytenr, num_bytes, metadump,
984                                                  0);
985                                 if (ret) {
986                                         error("unable to add block %llu: %d",
987                                                 (unsigned long long)bytenr, ret);
988                                         break;
989                                 }
990                         }
991                         ret = 0;
992 #else
993                         error(
994         "either extent tree is corrupted or you haven't built with V0 support");
995                         ret = -EIO;
996                         break;
997 #endif
998                 }
999                 bytenr += num_bytes;
1000         }
1001
1002         btrfs_release_path(path);
1003
1004         return ret;
1005 }
1006
1007 static int create_metadump(const char *input, FILE *out, int num_threads,
1008                            int compress_level, enum sanitize_mode sanitize,
1009                            int walk_trees)
1010 {
1011         struct btrfs_root *root;
1012         struct btrfs_path path;
1013         struct metadump_struct metadump;
1014         int ret;
1015         int err = 0;
1016
1017         root = open_ctree(input, 0, 0);
1018         if (!root) {
1019                 error("open ctree failed");
1020                 return -EIO;
1021         }
1022
1023         ret = metadump_init(&metadump, root, out, num_threads,
1024                             compress_level, sanitize);
1025         if (ret) {
1026                 error("failed to initialize metadump: %d", ret);
1027                 close_ctree(root);
1028                 return ret;
1029         }
1030
1031         ret = add_extent(BTRFS_SUPER_INFO_OFFSET, BTRFS_SUPER_INFO_SIZE,
1032                         &metadump, 0);
1033         if (ret) {
1034                 error("unable to add metadata: %d", ret);
1035                 err = ret;
1036                 goto out;
1037         }
1038
1039         btrfs_init_path(&path);
1040
1041         if (walk_trees) {
1042                 ret = copy_tree_blocks(root, root->fs_info->chunk_root->node,
1043                                        &metadump, 1);
1044                 if (ret) {
1045                         err = ret;
1046                         goto out;
1047                 }
1048
1049                 ret = copy_tree_blocks(root, root->fs_info->tree_root->node,
1050                                        &metadump, 1);
1051                 if (ret) {
1052                         err = ret;
1053                         goto out;
1054                 }
1055         } else {
1056                 ret = copy_from_extent_tree(&metadump, &path);
1057                 if (ret) {
1058                         err = ret;
1059                         goto out;
1060                 }
1061         }
1062
1063         ret = copy_log_trees(root, &metadump);
1064         if (ret) {
1065                 err = ret;
1066                 goto out;
1067         }
1068
1069         ret = copy_space_cache(root, &metadump, &path);
1070 out:
1071         ret = flush_pending(&metadump, 1);
1072         if (ret) {
1073                 if (!err)
1074                         err = ret;
1075                 error("failed to flush pending data: %d", ret);
1076         }
1077
1078         metadump_destroy(&metadump, num_threads);
1079
1080         btrfs_release_path(&path);
1081         ret = close_ctree(root);
1082         return err ? err : ret;
1083 }
1084
1085 static void update_super_old(u8 *buffer)
1086 {
1087         struct btrfs_super_block *super = (struct btrfs_super_block *)buffer;
1088         struct btrfs_chunk *chunk;
1089         struct btrfs_disk_key *key;
1090         u32 sectorsize = btrfs_super_sectorsize(super);
1091         u64 flags = btrfs_super_flags(super);
1092
1093         flags |= BTRFS_SUPER_FLAG_METADUMP;
1094         btrfs_set_super_flags(super, flags);
1095
1096         key = (struct btrfs_disk_key *)(super->sys_chunk_array);
1097         chunk = (struct btrfs_chunk *)(super->sys_chunk_array +
1098                                        sizeof(struct btrfs_disk_key));
1099
1100         btrfs_set_disk_key_objectid(key, BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1101         btrfs_set_disk_key_type(key, BTRFS_CHUNK_ITEM_KEY);
1102         btrfs_set_disk_key_offset(key, 0);
1103
1104         btrfs_set_stack_chunk_length(chunk, (u64)-1);
1105         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
1106         btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
1107         btrfs_set_stack_chunk_type(chunk, BTRFS_BLOCK_GROUP_SYSTEM);
1108         btrfs_set_stack_chunk_io_align(chunk, sectorsize);
1109         btrfs_set_stack_chunk_io_width(chunk, sectorsize);
1110         btrfs_set_stack_chunk_sector_size(chunk, sectorsize);
1111         btrfs_set_stack_chunk_num_stripes(chunk, 1);
1112         btrfs_set_stack_chunk_sub_stripes(chunk, 0);
1113         chunk->stripe.devid = super->dev_item.devid;
1114         btrfs_set_stack_stripe_offset(&chunk->stripe, 0);
1115         memcpy(chunk->stripe.dev_uuid, super->dev_item.uuid, BTRFS_UUID_SIZE);
1116         btrfs_set_super_sys_array_size(super, sizeof(*key) + sizeof(*chunk));
1117         csum_block(buffer, BTRFS_SUPER_INFO_SIZE);
1118 }
1119
1120 static int update_super(struct mdrestore_struct *mdres, u8 *buffer)
1121 {
1122         struct btrfs_super_block *super = (struct btrfs_super_block *)buffer;
1123         struct btrfs_chunk *chunk;
1124         struct btrfs_disk_key *disk_key;
1125         struct btrfs_key key;
1126         u64 flags = btrfs_super_flags(super);
1127         u32 new_array_size = 0;
1128         u32 array_size;
1129         u32 cur = 0;
1130         u8 *ptr, *write_ptr;
1131         int old_num_stripes;
1132
1133         write_ptr = ptr = super->sys_chunk_array;
1134         array_size = btrfs_super_sys_array_size(super);
1135
1136         while (cur < array_size) {
1137                 disk_key = (struct btrfs_disk_key *)ptr;
1138                 btrfs_disk_key_to_cpu(&key, disk_key);
1139
1140                 new_array_size += sizeof(*disk_key);
1141                 memmove(write_ptr, ptr, sizeof(*disk_key));
1142
1143                 write_ptr += sizeof(*disk_key);
1144                 ptr += sizeof(*disk_key);
1145                 cur += sizeof(*disk_key);
1146
1147                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1148                         u64 type, physical, physical_dup, size = 0;
1149
1150                         chunk = (struct btrfs_chunk *)ptr;
1151                         old_num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1152                         chunk = (struct btrfs_chunk *)write_ptr;
1153
1154                         memmove(write_ptr, ptr, sizeof(*chunk));
1155                         btrfs_set_stack_chunk_sub_stripes(chunk, 0);
1156                         type = btrfs_stack_chunk_type(chunk);
1157                         if (type & BTRFS_BLOCK_GROUP_DUP) {
1158                                 new_array_size += sizeof(struct btrfs_stripe);
1159                                 write_ptr += sizeof(struct btrfs_stripe);
1160                         } else {
1161                                 btrfs_set_stack_chunk_num_stripes(chunk, 1);
1162                                 btrfs_set_stack_chunk_type(chunk,
1163                                                 BTRFS_BLOCK_GROUP_SYSTEM);
1164                         }
1165                         chunk->stripe.devid = super->dev_item.devid;
1166                         physical = logical_to_physical(mdres, key.offset,
1167                                                        &size, &physical_dup);
1168                         if (size != (u64)-1)
1169                                 btrfs_set_stack_stripe_offset(&chunk->stripe,
1170                                                               physical);
1171                         memcpy(chunk->stripe.dev_uuid, super->dev_item.uuid,
1172                                BTRFS_UUID_SIZE);
1173                         new_array_size += sizeof(*chunk);
1174                 } else {
1175                         error("bogus key in the sys array %d", key.type);
1176                         return -EIO;
1177                 }
1178                 write_ptr += sizeof(*chunk);
1179                 ptr += btrfs_chunk_item_size(old_num_stripes);
1180                 cur += btrfs_chunk_item_size(old_num_stripes);
1181         }
1182
1183         if (mdres->clear_space_cache)
1184                 btrfs_set_super_cache_generation(super, 0);
1185
1186         flags |= BTRFS_SUPER_FLAG_METADUMP_V2;
1187         btrfs_set_super_flags(super, flags);
1188         btrfs_set_super_sys_array_size(super, new_array_size);
1189         btrfs_set_super_num_devices(super, 1);
1190         csum_block(buffer, BTRFS_SUPER_INFO_SIZE);
1191
1192         return 0;
1193 }
1194
1195 static struct extent_buffer *alloc_dummy_eb(u64 bytenr, u32 size)
1196 {
1197         struct extent_buffer *eb;
1198
1199         eb = calloc(1, sizeof(struct extent_buffer) + size);
1200         if (!eb)
1201                 return NULL;
1202
1203         eb->start = bytenr;
1204         eb->len = size;
1205         return eb;
1206 }
1207
1208 static void truncate_item(struct extent_buffer *eb, int slot, u32 new_size)
1209 {
1210         struct btrfs_item *item;
1211         u32 nritems;
1212         u32 old_size;
1213         u32 old_data_start;
1214         u32 size_diff;
1215         u32 data_end;
1216         int i;
1217
1218         old_size = btrfs_item_size_nr(eb, slot);
1219         if (old_size == new_size)
1220                 return;
1221
1222         nritems = btrfs_header_nritems(eb);
1223         data_end = btrfs_item_offset_nr(eb, nritems - 1);
1224
1225         old_data_start = btrfs_item_offset_nr(eb, slot);
1226         size_diff = old_size - new_size;
1227
1228         for (i = slot; i < nritems; i++) {
1229                 u32 ioff;
1230                 item = btrfs_item_nr(i);
1231                 ioff = btrfs_item_offset(eb, item);
1232                 btrfs_set_item_offset(eb, item, ioff + size_diff);
1233         }
1234
1235         memmove_extent_buffer(eb, btrfs_leaf_data(eb) + data_end + size_diff,
1236                               btrfs_leaf_data(eb) + data_end,
1237                               old_data_start + new_size - data_end);
1238         item = btrfs_item_nr(slot);
1239         btrfs_set_item_size(eb, item, new_size);
1240 }
1241
1242 static int fixup_chunk_tree_block(struct mdrestore_struct *mdres,
1243                                   struct async_work *async, u8 *buffer,
1244                                   size_t size)
1245 {
1246         struct extent_buffer *eb;
1247         size_t size_left = size;
1248         u64 bytenr = async->start;
1249         int i;
1250
1251         if (size_left % mdres->nodesize)
1252                 return 0;
1253
1254         eb = alloc_dummy_eb(bytenr, mdres->nodesize);
1255         if (!eb)
1256                 return -ENOMEM;
1257
1258         while (size_left) {
1259                 eb->start = bytenr;
1260                 memcpy(eb->data, buffer, mdres->nodesize);
1261
1262                 if (btrfs_header_bytenr(eb) != bytenr)
1263                         break;
1264                 if (memcmp(mdres->fsid,
1265                            eb->data + offsetof(struct btrfs_header, fsid),
1266                            BTRFS_FSID_SIZE))
1267                         break;
1268
1269                 if (btrfs_header_owner(eb) != BTRFS_CHUNK_TREE_OBJECTID)
1270                         goto next;
1271
1272                 if (btrfs_header_level(eb) != 0)
1273                         goto next;
1274
1275                 for (i = 0; i < btrfs_header_nritems(eb); i++) {
1276                         struct btrfs_chunk *chunk;
1277                         struct btrfs_key key;
1278                         u64 type, physical, physical_dup, size = (u64)-1;
1279
1280                         btrfs_item_key_to_cpu(eb, &key, i);
1281                         if (key.type != BTRFS_CHUNK_ITEM_KEY)
1282                                 continue;
1283
1284                         size = 0;
1285                         physical = logical_to_physical(mdres, key.offset,
1286                                                        &size, &physical_dup);
1287
1288                         if (!physical_dup)
1289                                 truncate_item(eb, i, sizeof(*chunk));
1290                         chunk = btrfs_item_ptr(eb, i, struct btrfs_chunk);
1291
1292
1293                         /* Zero out the RAID profile */
1294                         type = btrfs_chunk_type(eb, chunk);
1295                         type &= (BTRFS_BLOCK_GROUP_DATA |
1296                                  BTRFS_BLOCK_GROUP_SYSTEM |
1297                                  BTRFS_BLOCK_GROUP_METADATA |
1298                                  BTRFS_BLOCK_GROUP_DUP);
1299                         btrfs_set_chunk_type(eb, chunk, type);
1300
1301                         if (!physical_dup)
1302                                 btrfs_set_chunk_num_stripes(eb, chunk, 1);
1303                         btrfs_set_chunk_sub_stripes(eb, chunk, 0);
1304                         btrfs_set_stripe_devid_nr(eb, chunk, 0, mdres->devid);
1305                         if (size != (u64)-1)
1306                                 btrfs_set_stripe_offset_nr(eb, chunk, 0,
1307                                                            physical);
1308                         /* update stripe 2 offset */
1309                         if (physical_dup)
1310                                 btrfs_set_stripe_offset_nr(eb, chunk, 1,
1311                                                            physical_dup);
1312
1313                         write_extent_buffer(eb, mdres->uuid,
1314                                         (unsigned long)btrfs_stripe_dev_uuid_nr(
1315                                                 chunk, 0),
1316                                         BTRFS_UUID_SIZE);
1317                 }
1318                 memcpy(buffer, eb->data, eb->len);
1319                 csum_block(buffer, eb->len);
1320 next:
1321                 size_left -= mdres->nodesize;
1322                 buffer += mdres->nodesize;
1323                 bytenr += mdres->nodesize;
1324         }
1325
1326         free(eb);
1327         return 0;
1328 }
1329
1330 static void write_backup_supers(int fd, u8 *buf)
1331 {
1332         struct btrfs_super_block *super = (struct btrfs_super_block *)buf;
1333         struct stat st;
1334         u64 size;
1335         u64 bytenr;
1336         int i;
1337         int ret;
1338
1339         if (fstat(fd, &st)) {
1340                 error(
1341         "cannot stat restore point, won't be able to write backup supers: %m");
1342                 return;
1343         }
1344
1345         size = btrfs_device_size(fd, &st);
1346
1347         for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1348                 bytenr = btrfs_sb_offset(i);
1349                 if (bytenr + BTRFS_SUPER_INFO_SIZE > size)
1350                         break;
1351                 btrfs_set_super_bytenr(super, bytenr);
1352                 csum_block(buf, BTRFS_SUPER_INFO_SIZE);
1353                 ret = pwrite64(fd, buf, BTRFS_SUPER_INFO_SIZE, bytenr);
1354                 if (ret < BTRFS_SUPER_INFO_SIZE) {
1355                         if (ret < 0)
1356                                 error(
1357                                 "problem writing out backup super block %d: %m", i);
1358                         else
1359                                 error("short write writing out backup super block");
1360                         break;
1361                 }
1362         }
1363 }
1364
1365 static void *restore_worker(void *data)
1366 {
1367         struct mdrestore_struct *mdres = (struct mdrestore_struct *)data;
1368         struct async_work *async;
1369         size_t size;
1370         u8 *buffer;
1371         u8 *outbuf;
1372         int outfd;
1373         int ret;
1374         int compress_size = MAX_PENDING_SIZE * 4;
1375
1376         outfd = fileno(mdres->out);
1377         buffer = malloc(compress_size);
1378         if (!buffer) {
1379                 error("not enough memory for restore worker buffer");
1380                 pthread_mutex_lock(&mdres->mutex);
1381                 if (!mdres->error)
1382                         mdres->error = -ENOMEM;
1383                 pthread_mutex_unlock(&mdres->mutex);
1384                 pthread_exit(NULL);
1385         }
1386
1387         while (1) {
1388                 u64 bytenr, physical_dup;
1389                 off_t offset = 0;
1390                 int err = 0;
1391
1392                 pthread_mutex_lock(&mdres->mutex);
1393                 while (!mdres->nodesize || list_empty(&mdres->list)) {
1394                         if (mdres->done) {
1395                                 pthread_mutex_unlock(&mdres->mutex);
1396                                 goto out;
1397                         }
1398                         pthread_cond_wait(&mdres->cond, &mdres->mutex);
1399                 }
1400                 async = list_entry(mdres->list.next, struct async_work, list);
1401                 list_del_init(&async->list);
1402
1403                 if (mdres->compress_method == COMPRESS_ZLIB) {
1404                         size = compress_size; 
1405                         pthread_mutex_unlock(&mdres->mutex);
1406                         ret = uncompress(buffer, (unsigned long *)&size,
1407                                          async->buffer, async->bufsize);
1408                         pthread_mutex_lock(&mdres->mutex);
1409                         if (ret != Z_OK) {
1410                                 error("decompression failed with %d", ret);
1411                                 err = -EIO;
1412                         }
1413                         outbuf = buffer;
1414                 } else {
1415                         outbuf = async->buffer;
1416                         size = async->bufsize;
1417                 }
1418
1419                 if (!mdres->multi_devices) {
1420                         if (async->start == BTRFS_SUPER_INFO_OFFSET) {
1421                                 if (mdres->old_restore) {
1422                                         update_super_old(outbuf);
1423                                 } else {
1424                                         ret = update_super(mdres, outbuf);
1425                                         if (ret)
1426                                                 err = ret;
1427                                 }
1428                         } else if (!mdres->old_restore) {
1429                                 ret = fixup_chunk_tree_block(mdres, async, outbuf, size);
1430                                 if (ret)
1431                                         err = ret;
1432                         }
1433                 }
1434
1435                 if (!mdres->fixup_offset) {
1436                         while (size) {
1437                                 u64 chunk_size = size;
1438                                 physical_dup = 0;
1439                                 if (!mdres->multi_devices && !mdres->old_restore)
1440                                         bytenr = logical_to_physical(mdres,
1441                                                      async->start + offset,
1442                                                      &chunk_size,
1443                                                      &physical_dup);
1444                                 else
1445                                         bytenr = async->start + offset;
1446
1447                                 ret = pwrite64(outfd, outbuf+offset, chunk_size,
1448                                                bytenr);
1449                                 if (ret != chunk_size)
1450                                         goto error;
1451
1452                                 if (physical_dup)
1453                                         ret = pwrite64(outfd, outbuf+offset,
1454                                                        chunk_size,
1455                                                        physical_dup);
1456                                 if (ret != chunk_size)
1457                                         goto error;
1458
1459                                 size -= chunk_size;
1460                                 offset += chunk_size;
1461                                 continue;
1462
1463 error:
1464                                 if (ret < 0) {
1465                                         error("unable to write to device: %m");
1466                                         err = errno;
1467                                 } else {
1468                                         error("short write");
1469                                         err = -EIO;
1470                                 }
1471                         }
1472                 } else if (async->start != BTRFS_SUPER_INFO_OFFSET) {
1473                         ret = write_data_to_disk(mdres->info, outbuf, async->start, size, 0);
1474                         if (ret) {
1475                                 error("failed to write data");
1476                                 exit(1);
1477                         }
1478                 }
1479
1480
1481                 /* backup super blocks are already there at fixup_offset stage */
1482                 if (!mdres->multi_devices && async->start == BTRFS_SUPER_INFO_OFFSET)
1483                         write_backup_supers(outfd, outbuf);
1484
1485                 if (err && !mdres->error)
1486                         mdres->error = err;
1487                 mdres->num_items--;
1488                 pthread_mutex_unlock(&mdres->mutex);
1489
1490                 free(async->buffer);
1491                 free(async);
1492         }
1493 out:
1494         free(buffer);
1495         pthread_exit(NULL);
1496 }
1497
1498 static void mdrestore_destroy(struct mdrestore_struct *mdres, int num_threads)
1499 {
1500         struct rb_node *n;
1501         int i;
1502
1503         while ((n = rb_first(&mdres->chunk_tree))) {
1504                 struct fs_chunk *entry;
1505
1506                 entry = rb_entry(n, struct fs_chunk, l);
1507                 rb_erase(n, &mdres->chunk_tree);
1508                 rb_erase(&entry->p, &mdres->physical_tree);
1509                 free(entry);
1510         }
1511         pthread_mutex_lock(&mdres->mutex);
1512         mdres->done = 1;
1513         pthread_cond_broadcast(&mdres->cond);
1514         pthread_mutex_unlock(&mdres->mutex);
1515
1516         for (i = 0; i < num_threads; i++)
1517                 pthread_join(mdres->threads[i], NULL);
1518
1519         pthread_cond_destroy(&mdres->cond);
1520         pthread_mutex_destroy(&mdres->mutex);
1521 }
1522
1523 static int mdrestore_init(struct mdrestore_struct *mdres,
1524                           FILE *in, FILE *out, int old_restore,
1525                           int num_threads, int fixup_offset,
1526                           struct btrfs_fs_info *info, int multi_devices)
1527 {
1528         int i, ret = 0;
1529
1530         memset(mdres, 0, sizeof(*mdres));
1531         pthread_cond_init(&mdres->cond, NULL);
1532         pthread_mutex_init(&mdres->mutex, NULL);
1533         INIT_LIST_HEAD(&mdres->list);
1534         INIT_LIST_HEAD(&mdres->overlapping_chunks);
1535         mdres->in = in;
1536         mdres->out = out;
1537         mdres->old_restore = old_restore;
1538         mdres->chunk_tree.rb_node = NULL;
1539         mdres->fixup_offset = fixup_offset;
1540         mdres->info = info;
1541         mdres->multi_devices = multi_devices;
1542         mdres->clear_space_cache = 0;
1543         mdres->last_physical_offset = 0;
1544         mdres->alloced_chunks = 0;
1545
1546         if (!num_threads)
1547                 return 0;
1548
1549         mdres->num_threads = num_threads;
1550         for (i = 0; i < num_threads; i++) {
1551                 ret = pthread_create(&mdres->threads[i], NULL, restore_worker,
1552                                      mdres);
1553                 if (ret) {
1554                         /* pthread_create returns errno directly */
1555                         ret = -ret;
1556                         break;
1557                 }
1558         }
1559         if (ret)
1560                 mdrestore_destroy(mdres, i + 1);
1561         return ret;
1562 }
1563
1564 static int fill_mdres_info(struct mdrestore_struct *mdres,
1565                            struct async_work *async)
1566 {
1567         struct btrfs_super_block *super;
1568         u8 *buffer = NULL;
1569         u8 *outbuf;
1570         int ret;
1571
1572         /* We've already been initialized */
1573         if (mdres->nodesize)
1574                 return 0;
1575
1576         if (mdres->compress_method == COMPRESS_ZLIB) {
1577                 size_t size = MAX_PENDING_SIZE * 2;
1578
1579                 buffer = malloc(MAX_PENDING_SIZE * 2);
1580                 if (!buffer)
1581                         return -ENOMEM;
1582                 ret = uncompress(buffer, (unsigned long *)&size,
1583                                  async->buffer, async->bufsize);
1584                 if (ret != Z_OK) {
1585                         error("decompression failed with %d", ret);
1586                         free(buffer);
1587                         return -EIO;
1588                 }
1589                 outbuf = buffer;
1590         } else {
1591                 outbuf = async->buffer;
1592         }
1593
1594         super = (struct btrfs_super_block *)outbuf;
1595         mdres->nodesize = btrfs_super_nodesize(super);
1596         memcpy(mdres->fsid, super->fsid, BTRFS_FSID_SIZE);
1597         memcpy(mdres->uuid, super->dev_item.uuid,
1598                        BTRFS_UUID_SIZE);
1599         mdres->devid = le64_to_cpu(super->dev_item.devid);
1600         free(buffer);
1601         return 0;
1602 }
1603
1604 static int add_cluster(struct meta_cluster *cluster,
1605                        struct mdrestore_struct *mdres, u64 *next)
1606 {
1607         struct meta_cluster_item *item;
1608         struct meta_cluster_header *header = &cluster->header;
1609         struct async_work *async;
1610         u64 bytenr;
1611         u32 i, nritems;
1612         int ret;
1613
1614         pthread_mutex_lock(&mdres->mutex);
1615         mdres->compress_method = header->compress;
1616         pthread_mutex_unlock(&mdres->mutex);
1617
1618         bytenr = le64_to_cpu(header->bytenr) + BLOCK_SIZE;
1619         nritems = le32_to_cpu(header->nritems);
1620         for (i = 0; i < nritems; i++) {
1621                 item = &cluster->items[i];
1622                 async = calloc(1, sizeof(*async));
1623                 if (!async) {
1624                         error("not enough memory for async data");
1625                         return -ENOMEM;
1626                 }
1627                 async->start = le64_to_cpu(item->bytenr);
1628                 async->bufsize = le32_to_cpu(item->size);
1629                 async->buffer = malloc(async->bufsize);
1630                 if (!async->buffer) {
1631                         error("not enough memory for async buffer");
1632                         free(async);
1633                         return -ENOMEM;
1634                 }
1635                 ret = fread(async->buffer, async->bufsize, 1, mdres->in);
1636                 if (ret != 1) {
1637                         error("unable to read buffer: %m");
1638                         free(async->buffer);
1639                         free(async);
1640                         return -EIO;
1641                 }
1642                 bytenr += async->bufsize;
1643
1644                 pthread_mutex_lock(&mdres->mutex);
1645                 if (async->start == BTRFS_SUPER_INFO_OFFSET) {
1646                         ret = fill_mdres_info(mdres, async);
1647                         if (ret) {
1648                                 error("unable to set up restore state");
1649                                 pthread_mutex_unlock(&mdres->mutex);
1650                                 free(async->buffer);
1651                                 free(async);
1652                                 return ret;
1653                         }
1654                 }
1655                 list_add_tail(&async->list, &mdres->list);
1656                 mdres->num_items++;
1657                 pthread_cond_signal(&mdres->cond);
1658                 pthread_mutex_unlock(&mdres->mutex);
1659         }
1660         if (bytenr & BLOCK_MASK) {
1661                 char buffer[BLOCK_MASK];
1662                 size_t size = BLOCK_SIZE - (bytenr & BLOCK_MASK);
1663
1664                 bytenr += size;
1665                 ret = fread(buffer, size, 1, mdres->in);
1666                 if (ret != 1) {
1667                         error("failed to read buffer: %m");
1668                         return -EIO;
1669                 }
1670         }
1671         *next = bytenr;
1672         return 0;
1673 }
1674
1675 static int wait_for_worker(struct mdrestore_struct *mdres)
1676 {
1677         int ret = 0;
1678
1679         pthread_mutex_lock(&mdres->mutex);
1680         ret = mdres->error;
1681         while (!ret && mdres->num_items > 0) {
1682                 struct timespec ts = {
1683                         .tv_sec = 0,
1684                         .tv_nsec = 10000000,
1685                 };
1686                 pthread_mutex_unlock(&mdres->mutex);
1687                 nanosleep(&ts, NULL);
1688                 pthread_mutex_lock(&mdres->mutex);
1689                 ret = mdres->error;
1690         }
1691         pthread_mutex_unlock(&mdres->mutex);
1692         return ret;
1693 }
1694
1695 static int read_chunk_block(struct mdrestore_struct *mdres, u8 *buffer,
1696                             u64 bytenr, u64 item_bytenr, u32 bufsize,
1697                             u64 cluster_bytenr)
1698 {
1699         struct extent_buffer *eb;
1700         int ret = 0;
1701         int i;
1702
1703         eb = alloc_dummy_eb(bytenr, mdres->nodesize);
1704         if (!eb) {
1705                 ret = -ENOMEM;
1706                 goto out;
1707         }
1708
1709         while (item_bytenr != bytenr) {
1710                 buffer += mdres->nodesize;
1711                 item_bytenr += mdres->nodesize;
1712         }
1713
1714         memcpy(eb->data, buffer, mdres->nodesize);
1715         if (btrfs_header_bytenr(eb) != bytenr) {
1716                 error("eb bytenr does not match found bytenr: %llu != %llu",
1717                                 (unsigned long long)btrfs_header_bytenr(eb),
1718                                 (unsigned long long)bytenr);
1719                 ret = -EIO;
1720                 goto out;
1721         }
1722
1723         if (memcmp(mdres->fsid, eb->data + offsetof(struct btrfs_header, fsid),
1724                    BTRFS_FSID_SIZE)) {
1725                 error("filesystem UUID of eb %llu does not match",
1726                                 (unsigned long long)bytenr);
1727                 ret = -EIO;
1728                 goto out;
1729         }
1730
1731         if (btrfs_header_owner(eb) != BTRFS_CHUNK_TREE_OBJECTID) {
1732                 error("wrong eb %llu owner %llu",
1733                                 (unsigned long long)bytenr,
1734                                 (unsigned long long)btrfs_header_owner(eb));
1735                 ret = -EIO;
1736                 goto out;
1737         }
1738
1739         for (i = 0; i < btrfs_header_nritems(eb); i++) {
1740                 struct btrfs_chunk *chunk;
1741                 struct fs_chunk *fs_chunk;
1742                 struct btrfs_key key;
1743                 u64 type;
1744
1745                 if (btrfs_header_level(eb)) {
1746                         u64 blockptr = btrfs_node_blockptr(eb, i);
1747
1748                         ret = search_for_chunk_blocks(mdres, blockptr,
1749                                                       cluster_bytenr);
1750                         if (ret)
1751                                 break;
1752                         continue;
1753                 }
1754
1755                 /* Yay a leaf!  We loves leafs! */
1756                 btrfs_item_key_to_cpu(eb, &key, i);
1757                 if (key.type != BTRFS_CHUNK_ITEM_KEY)
1758                         continue;
1759
1760                 fs_chunk = malloc(sizeof(struct fs_chunk));
1761                 if (!fs_chunk) {
1762                         error("not enough memory to allocate chunk");
1763                         ret = -ENOMEM;
1764                         break;
1765                 }
1766                 memset(fs_chunk, 0, sizeof(*fs_chunk));
1767                 chunk = btrfs_item_ptr(eb, i, struct btrfs_chunk);
1768
1769                 fs_chunk->logical = key.offset;
1770                 fs_chunk->physical = btrfs_stripe_offset_nr(eb, chunk, 0);
1771                 fs_chunk->bytes = btrfs_chunk_length(eb, chunk);
1772                 INIT_LIST_HEAD(&fs_chunk->list);
1773                 if (tree_search(&mdres->physical_tree, &fs_chunk->p,
1774                                 physical_cmp, 1) != NULL)
1775                         list_add(&fs_chunk->list, &mdres->overlapping_chunks);
1776                 else
1777                         tree_insert(&mdres->physical_tree, &fs_chunk->p,
1778                                     physical_cmp);
1779
1780                 type = btrfs_chunk_type(eb, chunk);
1781                 if (type & BTRFS_BLOCK_GROUP_DUP) {
1782                         fs_chunk->physical_dup =
1783                                         btrfs_stripe_offset_nr(eb, chunk, 1);
1784                 }
1785
1786                 if (fs_chunk->physical_dup + fs_chunk->bytes >
1787                     mdres->last_physical_offset)
1788                         mdres->last_physical_offset = fs_chunk->physical_dup +
1789                                 fs_chunk->bytes;
1790                 else if (fs_chunk->physical + fs_chunk->bytes >
1791                     mdres->last_physical_offset)
1792                         mdres->last_physical_offset = fs_chunk->physical +
1793                                 fs_chunk->bytes;
1794                 mdres->alloced_chunks += fs_chunk->bytes;
1795                 /* in dup case, fs_chunk->bytes should add twice */
1796                 if (fs_chunk->physical_dup)
1797                         mdres->alloced_chunks += fs_chunk->bytes;
1798                 tree_insert(&mdres->chunk_tree, &fs_chunk->l, chunk_cmp);
1799         }
1800 out:
1801         free(eb);
1802         return ret;
1803 }
1804
1805 /* If you have to ask you aren't worthy */
1806 static int search_for_chunk_blocks(struct mdrestore_struct *mdres,
1807                                    u64 search, u64 cluster_bytenr)
1808 {
1809         struct meta_cluster *cluster;
1810         struct meta_cluster_header *header;
1811         struct meta_cluster_item *item;
1812         u64 current_cluster = cluster_bytenr, bytenr;
1813         u64 item_bytenr;
1814         u32 bufsize, nritems, i;
1815         u32 max_size = MAX_PENDING_SIZE * 2;
1816         u8 *buffer, *tmp = NULL;
1817         int ret = 0;
1818
1819         cluster = malloc(BLOCK_SIZE);
1820         if (!cluster) {
1821                 error("not enough memory for cluster");
1822                 return -ENOMEM;
1823         }
1824
1825         buffer = malloc(max_size);
1826         if (!buffer) {
1827                 error("not enough memory for buffer");
1828                 free(cluster);
1829                 return -ENOMEM;
1830         }
1831
1832         if (mdres->compress_method == COMPRESS_ZLIB) {
1833                 tmp = malloc(max_size);
1834                 if (!tmp) {
1835                         error("not enough memory for buffer");
1836                         free(cluster);
1837                         free(buffer);
1838                         return -ENOMEM;
1839                 }
1840         }
1841
1842         bytenr = current_cluster;
1843         while (1) {
1844                 if (fseek(mdres->in, current_cluster, SEEK_SET)) {
1845                         error("seek failed: %m");
1846                         ret = -EIO;
1847                         break;
1848                 }
1849
1850                 ret = fread(cluster, BLOCK_SIZE, 1, mdres->in);
1851                 if (ret == 0) {
1852                         if (cluster_bytenr != 0) {
1853                                 cluster_bytenr = 0;
1854                                 current_cluster = 0;
1855                                 bytenr = 0;
1856                                 continue;
1857                         }
1858                         error(
1859         "unknown state after reading cluster at %llu, probably corrupted data",
1860                                         cluster_bytenr);
1861                         ret = -EIO;
1862                         break;
1863                 } else if (ret < 0) {
1864                         error("unable to read image at %llu: %m",
1865                                         (unsigned long long)cluster_bytenr);
1866                         break;
1867                 }
1868                 ret = 0;
1869
1870                 header = &cluster->header;
1871                 if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
1872                     le64_to_cpu(header->bytenr) != current_cluster) {
1873                         error("bad header in metadump image");
1874                         ret = -EIO;
1875                         break;
1876                 }
1877
1878                 bytenr += BLOCK_SIZE;
1879                 nritems = le32_to_cpu(header->nritems);
1880                 for (i = 0; i < nritems; i++) {
1881                         size_t size;
1882
1883                         item = &cluster->items[i];
1884                         bufsize = le32_to_cpu(item->size);
1885                         item_bytenr = le64_to_cpu(item->bytenr);
1886
1887                         if (bufsize > max_size) {
1888                                 error("item %u too big: %u > %u", i, bufsize,
1889                                                 max_size);
1890                                 ret = -EIO;
1891                                 break;
1892                         }
1893
1894                         if (mdres->compress_method == COMPRESS_ZLIB) {
1895                                 ret = fread(tmp, bufsize, 1, mdres->in);
1896                                 if (ret != 1) {
1897                                         error("read error: %m");
1898                                         ret = -EIO;
1899                                         break;
1900                                 }
1901
1902                                 size = max_size;
1903                                 ret = uncompress(buffer,
1904                                                  (unsigned long *)&size, tmp,
1905                                                  bufsize);
1906                                 if (ret != Z_OK) {
1907                                         error("decompression failed with %d",
1908                                                         ret);
1909                                         ret = -EIO;
1910                                         break;
1911                                 }
1912                         } else {
1913                                 ret = fread(buffer, bufsize, 1, mdres->in);
1914                                 if (ret != 1) {
1915                                         error("read error: %m");
1916                                         ret = -EIO;
1917                                         break;
1918                                 }
1919                                 size = bufsize;
1920                         }
1921                         ret = 0;
1922
1923                         if (item_bytenr <= search &&
1924                             item_bytenr + size > search) {
1925                                 ret = read_chunk_block(mdres, buffer, search,
1926                                                        item_bytenr, size,
1927                                                        current_cluster);
1928                                 if (!ret)
1929                                         ret = 1;
1930                                 break;
1931                         }
1932                         bytenr += bufsize;
1933                 }
1934                 if (ret) {
1935                         if (ret > 0)
1936                                 ret = 0;
1937                         break;
1938                 }
1939                 if (bytenr & BLOCK_MASK)
1940                         bytenr += BLOCK_SIZE - (bytenr & BLOCK_MASK);
1941                 current_cluster = bytenr;
1942         }
1943
1944         free(tmp);
1945         free(buffer);
1946         free(cluster);
1947         return ret;
1948 }
1949
1950 static int build_chunk_tree(struct mdrestore_struct *mdres,
1951                             struct meta_cluster *cluster)
1952 {
1953         struct btrfs_super_block *super;
1954         struct meta_cluster_header *header;
1955         struct meta_cluster_item *item = NULL;
1956         u64 chunk_root_bytenr = 0;
1957         u32 i, nritems;
1958         u64 bytenr = 0;
1959         u8 *buffer;
1960         int ret;
1961
1962         /* We can't seek with stdin so don't bother doing this */
1963         if (mdres->in == stdin)
1964                 return 0;
1965
1966         ret = fread(cluster, BLOCK_SIZE, 1, mdres->in);
1967         if (ret <= 0) {
1968                 error("unable to read cluster: %m");
1969                 return -EIO;
1970         }
1971         ret = 0;
1972
1973         header = &cluster->header;
1974         if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
1975             le64_to_cpu(header->bytenr) != 0) {
1976                 error("bad header in metadump image");
1977                 return -EIO;
1978         }
1979
1980         bytenr += BLOCK_SIZE;
1981         mdres->compress_method = header->compress;
1982         nritems = le32_to_cpu(header->nritems);
1983         for (i = 0; i < nritems; i++) {
1984                 item = &cluster->items[i];
1985
1986                 if (le64_to_cpu(item->bytenr) == BTRFS_SUPER_INFO_OFFSET)
1987                         break;
1988                 bytenr += le32_to_cpu(item->size);
1989                 if (fseek(mdres->in, le32_to_cpu(item->size), SEEK_CUR)) {
1990                         error("seek failed: %m");
1991                         return -EIO;
1992                 }
1993         }
1994
1995         if (!item || le64_to_cpu(item->bytenr) != BTRFS_SUPER_INFO_OFFSET) {
1996                 error("did not find superblock at %llu",
1997                                 le64_to_cpu(item->bytenr));
1998                 return -EINVAL;
1999         }
2000
2001         buffer = malloc(le32_to_cpu(item->size));
2002         if (!buffer) {
2003                 error("not enough memory to allocate buffer");
2004                 return -ENOMEM;
2005         }
2006
2007         ret = fread(buffer, le32_to_cpu(item->size), 1, mdres->in);
2008         if (ret != 1) {
2009                 error("unable to read buffer: %m");
2010                 free(buffer);
2011                 return -EIO;
2012         }
2013
2014         if (mdres->compress_method == COMPRESS_ZLIB) {
2015                 size_t size = MAX_PENDING_SIZE * 2;
2016                 u8 *tmp;
2017
2018                 tmp = malloc(MAX_PENDING_SIZE * 2);
2019                 if (!tmp) {
2020                         free(buffer);
2021                         return -ENOMEM;
2022                 }
2023                 ret = uncompress(tmp, (unsigned long *)&size,
2024                                  buffer, le32_to_cpu(item->size));
2025                 if (ret != Z_OK) {
2026                         error("decompression failed with %d", ret);
2027                         free(buffer);
2028                         free(tmp);
2029                         return -EIO;
2030                 }
2031                 free(buffer);
2032                 buffer = tmp;
2033         }
2034
2035         pthread_mutex_lock(&mdres->mutex);
2036         super = (struct btrfs_super_block *)buffer;
2037         chunk_root_bytenr = btrfs_super_chunk_root(super);
2038         mdres->nodesize = btrfs_super_nodesize(super);
2039         memcpy(mdres->fsid, super->fsid, BTRFS_FSID_SIZE);
2040         memcpy(mdres->uuid, super->dev_item.uuid,
2041                        BTRFS_UUID_SIZE);
2042         mdres->devid = le64_to_cpu(super->dev_item.devid);
2043         free(buffer);
2044         pthread_mutex_unlock(&mdres->mutex);
2045
2046         return search_for_chunk_blocks(mdres, chunk_root_bytenr, 0);
2047 }
2048
2049 static int range_contains_super(u64 physical, u64 bytes)
2050 {
2051         u64 super_bytenr;
2052         int i;
2053
2054         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2055                 super_bytenr = btrfs_sb_offset(i);
2056                 if (super_bytenr >= physical &&
2057                     super_bytenr < physical + bytes)
2058                         return 1;
2059         }
2060
2061         return 0;
2062 }
2063
2064 static void remap_overlapping_chunks(struct mdrestore_struct *mdres)
2065 {
2066         struct fs_chunk *fs_chunk;
2067
2068         while (!list_empty(&mdres->overlapping_chunks)) {
2069                 fs_chunk = list_first_entry(&mdres->overlapping_chunks,
2070                                             struct fs_chunk, list);
2071                 list_del_init(&fs_chunk->list);
2072                 if (range_contains_super(fs_chunk->physical,
2073                                          fs_chunk->bytes)) {
2074                         warning(
2075 "remapping a chunk that had a super mirror inside of it, clearing space cache so we don't end up with corruption");
2076                         mdres->clear_space_cache = 1;
2077                 }
2078                 fs_chunk->physical = mdres->last_physical_offset;
2079                 tree_insert(&mdres->physical_tree, &fs_chunk->p, physical_cmp);
2080                 mdres->last_physical_offset += fs_chunk->bytes;
2081         }
2082 }
2083
2084 static int fixup_devices(struct btrfs_fs_info *fs_info,
2085                          struct mdrestore_struct *mdres, off_t dev_size)
2086 {
2087         struct btrfs_trans_handle *trans;
2088         struct btrfs_dev_item *dev_item;
2089         struct btrfs_path path;
2090         struct extent_buffer *leaf;
2091         struct btrfs_root *root = fs_info->chunk_root;
2092         struct btrfs_key key;
2093         u64 devid, cur_devid;
2094         int ret;
2095
2096         trans = btrfs_start_transaction(fs_info->tree_root, 1);
2097         if (IS_ERR(trans)) {
2098                 error("cannot starting transaction %ld", PTR_ERR(trans));
2099                 return PTR_ERR(trans);
2100         }
2101
2102         dev_item = &fs_info->super_copy->dev_item;
2103
2104         devid = btrfs_stack_device_id(dev_item);
2105
2106         btrfs_set_stack_device_total_bytes(dev_item, dev_size);
2107         btrfs_set_stack_device_bytes_used(dev_item, mdres->alloced_chunks);
2108
2109         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2110         key.type = BTRFS_DEV_ITEM_KEY;
2111         key.offset = 0;
2112
2113         btrfs_init_path(&path);
2114
2115 again:
2116         ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
2117         if (ret < 0) {
2118                 error("search failed: %d", ret);
2119                 exit(1);
2120         }
2121
2122         while (1) {
2123                 leaf = path.nodes[0];
2124                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
2125                         ret = btrfs_next_leaf(root, &path);
2126                         if (ret < 0) {
2127                                 error("cannot go to next leaf %d", ret);
2128                                 exit(1);
2129                         }
2130                         if (ret > 0) {
2131                                 ret = 0;
2132                                 break;
2133                         }
2134                         leaf = path.nodes[0];
2135                 }
2136
2137                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
2138                 if (key.type > BTRFS_DEV_ITEM_KEY)
2139                         break;
2140                 if (key.type != BTRFS_DEV_ITEM_KEY) {
2141                         path.slots[0]++;
2142                         continue;
2143                 }
2144
2145                 dev_item = btrfs_item_ptr(leaf, path.slots[0],
2146                                           struct btrfs_dev_item);
2147                 cur_devid = btrfs_device_id(leaf, dev_item);
2148                 if (devid != cur_devid) {
2149                         ret = btrfs_del_item(trans, root, &path);
2150                         if (ret) {
2151                                 error("cannot delete item: %d", ret);
2152                                 exit(1);
2153                         }
2154                         btrfs_release_path(&path);
2155                         goto again;
2156                 }
2157
2158                 btrfs_set_device_total_bytes(leaf, dev_item, dev_size);
2159                 btrfs_set_device_bytes_used(leaf, dev_item,
2160                                             mdres->alloced_chunks);
2161                 btrfs_mark_buffer_dirty(leaf);
2162                 path.slots[0]++;
2163         }
2164
2165         btrfs_release_path(&path);
2166         ret = btrfs_commit_transaction(trans, fs_info->tree_root);
2167         if (ret) {
2168                 error("unable to commit transaction: %d", ret);
2169                 return ret;
2170         }
2171         return 0;
2172 }
2173
2174 static int restore_metadump(const char *input, FILE *out, int old_restore,
2175                             int num_threads, int fixup_offset,
2176                             const char *target, int multi_devices)
2177 {
2178         struct meta_cluster *cluster = NULL;
2179         struct meta_cluster_header *header;
2180         struct mdrestore_struct mdrestore;
2181         struct btrfs_fs_info *info = NULL;
2182         u64 bytenr = 0;
2183         FILE *in = NULL;
2184         int ret = 0;
2185
2186         if (!strcmp(input, "-")) {
2187                 in = stdin;
2188         } else {
2189                 in = fopen(input, "r");
2190                 if (!in) {
2191                         error("unable to open metadump image: %m");
2192                         return 1;
2193                 }
2194         }
2195
2196         /* NOTE: open with write mode */
2197         if (fixup_offset) {
2198                 info = open_ctree_fs_info(target, 0, 0, 0,
2199                                           OPEN_CTREE_WRITES |
2200                                           OPEN_CTREE_RESTORE |
2201                                           OPEN_CTREE_PARTIAL);
2202                 if (!info) {
2203                         error("open ctree failed");
2204                         ret = -EIO;
2205                         goto failed_open;
2206                 }
2207         }
2208
2209         cluster = malloc(BLOCK_SIZE);
2210         if (!cluster) {
2211                 error("not enough memory for cluster");
2212                 ret = -ENOMEM;
2213                 goto failed_info;
2214         }
2215
2216         ret = mdrestore_init(&mdrestore, in, out, old_restore, num_threads,
2217                              fixup_offset, info, multi_devices);
2218         if (ret) {
2219                 error("failed to initialize metadata restore state: %d", ret);
2220                 goto failed_cluster;
2221         }
2222
2223         if (!multi_devices && !old_restore) {
2224                 ret = build_chunk_tree(&mdrestore, cluster);
2225                 if (ret)
2226                         goto out;
2227                 if (!list_empty(&mdrestore.overlapping_chunks))
2228                         remap_overlapping_chunks(&mdrestore);
2229         }
2230
2231         if (in != stdin && fseek(in, 0, SEEK_SET)) {
2232                 error("seek failed: %m");
2233                 goto out;
2234         }
2235
2236         while (!mdrestore.error) {
2237                 ret = fread(cluster, BLOCK_SIZE, 1, in);
2238                 if (!ret)
2239                         break;
2240
2241                 header = &cluster->header;
2242                 if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
2243                     le64_to_cpu(header->bytenr) != bytenr) {
2244                         error("bad header in metadump image");
2245                         ret = -EIO;
2246                         break;
2247                 }
2248                 ret = add_cluster(cluster, &mdrestore, &bytenr);
2249                 if (ret) {
2250                         error("failed to add cluster: %d", ret);
2251                         break;
2252                 }
2253         }
2254         ret = wait_for_worker(&mdrestore);
2255
2256         if (!ret && !multi_devices && !old_restore) {
2257                 struct btrfs_root *root;
2258                 struct stat st;
2259
2260                 root = open_ctree_fd(fileno(out), target, 0,
2261                                           OPEN_CTREE_PARTIAL |
2262                                           OPEN_CTREE_WRITES |
2263                                           OPEN_CTREE_NO_DEVICES);
2264                 if (!root) {
2265                         error("open ctree failed in %s", target);
2266                         ret = -EIO;
2267                         goto out;
2268                 }
2269                 info = root->fs_info;
2270
2271                 if (stat(target, &st)) {
2272                         error("stat %s failed: %m", target);
2273                         close_ctree(info->chunk_root);
2274                         free(cluster);
2275                         return 1;
2276                 }
2277
2278                 ret = fixup_devices(info, &mdrestore, st.st_size);
2279                 close_ctree(info->chunk_root);
2280                 if (ret)
2281                         goto out;
2282         }
2283 out:
2284         mdrestore_destroy(&mdrestore, num_threads);
2285 failed_cluster:
2286         free(cluster);
2287 failed_info:
2288         if (fixup_offset && info)
2289                 close_ctree(info->chunk_root);
2290 failed_open:
2291         if (in != stdin)
2292                 fclose(in);
2293         return ret;
2294 }
2295
2296 static int update_disk_super_on_device(struct btrfs_fs_info *info,
2297                                        const char *other_dev, u64 cur_devid)
2298 {
2299         struct btrfs_key key;
2300         struct extent_buffer *leaf;
2301         struct btrfs_path path;
2302         struct btrfs_dev_item *dev_item;
2303         struct btrfs_super_block *disk_super;
2304         char dev_uuid[BTRFS_UUID_SIZE];
2305         char fs_uuid[BTRFS_UUID_SIZE];
2306         u64 devid, type, io_align, io_width;
2307         u64 sector_size, total_bytes, bytes_used;
2308         char buf[BTRFS_SUPER_INFO_SIZE];
2309         int fp = -1;
2310         int ret;
2311
2312         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2313         key.type = BTRFS_DEV_ITEM_KEY;
2314         key.offset = cur_devid;
2315
2316         btrfs_init_path(&path);
2317         ret = btrfs_search_slot(NULL, info->chunk_root, &key, &path, 0, 0); 
2318         if (ret) {
2319                 error("search key failed: %d", ret);
2320                 ret = -EIO;
2321                 goto out;
2322         }
2323
2324         leaf = path.nodes[0];
2325         dev_item = btrfs_item_ptr(leaf, path.slots[0],
2326                                   struct btrfs_dev_item);
2327
2328         devid = btrfs_device_id(leaf, dev_item);
2329         if (devid != cur_devid) {
2330                 error("devid mismatch: %llu != %llu",
2331                                 (unsigned long long)devid,
2332                                 (unsigned long long)cur_devid);
2333                 ret = -EIO;
2334                 goto out;
2335         }
2336
2337         type = btrfs_device_type(leaf, dev_item);
2338         io_align = btrfs_device_io_align(leaf, dev_item);
2339         io_width = btrfs_device_io_width(leaf, dev_item);
2340         sector_size = btrfs_device_sector_size(leaf, dev_item);
2341         total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2342         bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2343         read_extent_buffer(leaf, dev_uuid, (unsigned long)btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE);
2344         read_extent_buffer(leaf, fs_uuid, (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE);
2345
2346         btrfs_release_path(&path);
2347
2348         printf("update disk super on %s devid=%llu\n", other_dev, devid);
2349
2350         /* update other devices' super block */
2351         fp = open(other_dev, O_CREAT | O_RDWR, 0600);
2352         if (fp < 0) {
2353                 error("could not open %s: %m", other_dev);
2354                 ret = -EIO;
2355                 goto out;
2356         }
2357
2358         memcpy(buf, info->super_copy, BTRFS_SUPER_INFO_SIZE);
2359
2360         disk_super = (struct btrfs_super_block *)buf;
2361         dev_item = &disk_super->dev_item;
2362
2363         btrfs_set_stack_device_type(dev_item, type);
2364         btrfs_set_stack_device_id(dev_item, devid);
2365         btrfs_set_stack_device_total_bytes(dev_item, total_bytes);
2366         btrfs_set_stack_device_bytes_used(dev_item, bytes_used);
2367         btrfs_set_stack_device_io_align(dev_item, io_align);
2368         btrfs_set_stack_device_io_width(dev_item, io_width);
2369         btrfs_set_stack_device_sector_size(dev_item, sector_size);
2370         memcpy(dev_item->uuid, dev_uuid, BTRFS_UUID_SIZE);
2371         memcpy(dev_item->fsid, fs_uuid, BTRFS_UUID_SIZE);
2372         csum_block((u8 *)buf, BTRFS_SUPER_INFO_SIZE);
2373
2374         ret = pwrite64(fp, buf, BTRFS_SUPER_INFO_SIZE, BTRFS_SUPER_INFO_OFFSET);
2375         if (ret != BTRFS_SUPER_INFO_SIZE) {
2376                 if (ret < 0)
2377                         error("cannot write superblock: %s", strerror(ret));
2378                 else
2379                         error("cannot write superblock");
2380                 ret = -EIO;
2381                 goto out;
2382         }
2383
2384         write_backup_supers(fp, (u8 *)buf);
2385
2386 out:
2387         if (fp != -1)
2388                 close(fp);
2389         return ret;
2390 }
2391
/*
 * Print the btrfs-image usage text to stdout and terminate the process.
 *
 * @ret: exit status passed to exit()
 */
static void print_usage(int ret)
{
	static const char * const usage_lines[] = {
		"usage: btrfs-image [options] source target\n",
		"\t-r      \trestore metadump image\n",
		"\t-c value\tcompression level (0 ~ 9)\n",
		"\t-t value\tnumber of threads (1 ~ 32)\n",
		"\t-o      \tdon't mess with the chunk tree when restoring\n",
		"\t-s      \tsanitize file names, use once to just use garbage, use twice if you want crc collisions\n",
		"\t-w      \twalk all trees instead of using extent tree, do this if your extent tree is broken\n",
		"\t-m       \trestore for multiple devices\n",
		"\n",
		"\tIn the dump mode, source is the btrfs device and target is the output file (use '-' for stdout).\n",
		"\tIn the restore mode, source is the dumped image and target is the btrfs device/file.\n",
	};
	size_t line;

	for (line = 0; line < sizeof(usage_lines) / sizeof(usage_lines[0]);
	     line++)
		fputs(usage_lines[line], stdout);

	exit(ret);
}
2407
2408 int main(int argc, char *argv[])
2409 {
2410         char *source;
2411         char *target;
2412         u64 num_threads = 0;
2413         u64 compress_level = 0;
2414         int create = 1;
2415         int old_restore = 0;
2416         int walk_trees = 0;
2417         int multi_devices = 0;
2418         int ret;
2419         enum sanitize_mode sanitize = SANITIZE_NONE;
2420         int dev_cnt = 0;
2421         int usage_error = 0;
2422         FILE *out;
2423
2424         while (1) {
2425                 static const struct option long_options[] = {
2426                         { "help", no_argument, NULL, GETOPT_VAL_HELP},
2427                         { NULL, 0, NULL, 0 }
2428                 };
2429                 int c = getopt_long(argc, argv, "rc:t:oswm", long_options, NULL);
2430                 if (c < 0)
2431                         break;
2432                 switch (c) {
2433                 case 'r':
2434                         create = 0;
2435                         break;
2436                 case 't':
2437                         num_threads = arg_strtou64(optarg);
2438                         if (num_threads > MAX_WORKER_THREADS) {
2439                                 error("number of threads out of range: %llu > %d",
2440                                         (unsigned long long)num_threads,
2441                                         MAX_WORKER_THREADS);
2442                                 return 1;
2443                         }
2444                         break;
2445                 case 'c':
2446                         compress_level = arg_strtou64(optarg);
2447                         if (compress_level > 9) {
2448                                 error("compression level out of range: %llu",
2449                                         (unsigned long long)compress_level);
2450                                 return 1;
2451                         }
2452                         break;
2453                 case 'o':
2454                         old_restore = 1;
2455                         break;
2456                 case 's':
2457                         if (sanitize == SANITIZE_NONE)
2458                                 sanitize = SANITIZE_NAMES;
2459                         else if (sanitize == SANITIZE_NAMES)
2460                                 sanitize = SANITIZE_COLLISIONS;
2461                         break;
2462                 case 'w':
2463                         walk_trees = 1;
2464                         break;
2465                 case 'm':
2466                         create = 0;
2467                         multi_devices = 1;
2468                         break;
2469                         case GETOPT_VAL_HELP:
2470                 default:
2471                         print_usage(c != GETOPT_VAL_HELP);
2472                 }
2473         }
2474
2475         set_argv0(argv);
2476         if (check_argc_min(argc - optind, 2))
2477                 print_usage(1);
2478
2479         dev_cnt = argc - optind - 1;
2480
2481         if (create) {
2482                 if (old_restore) {
2483                         error(
2484                         "create and restore cannot be used at the same time");
2485                         usage_error++;
2486                 }
2487         } else {
2488                 if (walk_trees || sanitize != SANITIZE_NONE || compress_level) {
2489                         error(
2490                         "useing -w, -s, -c options for restore makes no sense");
2491                         usage_error++;
2492                 }
2493                 if (multi_devices && dev_cnt < 2) {
2494                         error("not enough devices specified for -m option");
2495                         usage_error++;
2496                 }
2497                 if (!multi_devices && dev_cnt != 1) {
2498                         error("accepts only 1 device without -m option");
2499                         usage_error++;
2500                 }
2501         }
2502
2503         if (usage_error)
2504                 print_usage(1);
2505
2506         source = argv[optind];
2507         target = argv[optind + 1];
2508
2509         if (create && !strcmp(target, "-")) {
2510                 out = stdout;
2511         } else {
2512                 out = fopen(target, "w+");
2513                 if (!out) {
2514                         error("unable to create target file %s", target);
2515                         exit(1);
2516                 }
2517         }
2518
2519         if (compress_level > 0 || create == 0) {
2520                 if (num_threads == 0) {
2521                         long tmp = sysconf(_SC_NPROCESSORS_ONLN);
2522
2523                         if (tmp <= 0)
2524                                 tmp = 1;
2525                         num_threads = tmp;
2526                 }
2527         } else {
2528                 num_threads = 0;
2529         }
2530
2531         if (create) {
2532                 ret = check_mounted(source);
2533                 if (ret < 0) {
2534                         warning("unable to check mount status of: %s",
2535                                         strerror(-ret));
2536                 } else if (ret) {
2537                         warning("%s already mounted, results may be inaccurate",
2538                                         source);
2539                 }
2540
2541                 ret = create_metadump(source, out, num_threads,
2542                                       compress_level, sanitize, walk_trees);
2543         } else {
2544                 ret = restore_metadump(source, out, old_restore, num_threads,
2545                                        0, target, multi_devices);
2546         }
2547         if (ret) {
2548                 error("%s failed: %m", (create) ? "create" : "restore");
2549                 goto out;
2550         }
2551
2552          /* extended support for multiple devices */
2553         if (!create && multi_devices) {
2554                 struct btrfs_fs_info *info;
2555                 u64 total_devs;
2556                 int i;
2557
2558                 info = open_ctree_fs_info(target, 0, 0, 0,
2559                                           OPEN_CTREE_PARTIAL |
2560                                           OPEN_CTREE_RESTORE);
2561                 if (!info) {
2562                         error("open ctree failed at %s", target);
2563                         return 1;
2564                 }
2565
2566                 total_devs = btrfs_super_num_devices(info->super_copy);
2567                 if (total_devs != dev_cnt) {
2568                         error("it needs %llu devices but has only %d",
2569                                 total_devs, dev_cnt);
2570                         close_ctree(info->chunk_root);
2571                         goto out;
2572                 }
2573
2574                 /* update super block on other disks */
2575                 for (i = 2; i <= dev_cnt; i++) {
2576                         ret = update_disk_super_on_device(info,
2577                                         argv[optind + i], (u64)i);
2578                         if (ret) {
2579                                 error("update disk superblock failed devid %d: %d",
2580                                         i, ret);
2581                                 close_ctree(info->chunk_root);
2582                                 exit(1);
2583                         }
2584                 }
2585
2586                 close_ctree(info->chunk_root);
2587
2588                 /* fix metadata block to map correct chunk */
2589                 ret = restore_metadump(source, out, 0, num_threads, 1,
2590                                        target, 1);
2591                 if (ret) {
2592                         error("unable to fixup metadump: %d", ret);
2593                         exit(1);
2594                 }
2595         }
2596 out:
2597         if (out == stdout) {
2598                 fflush(out);
2599         } else {
2600                 fclose(out);
2601                 if (ret && create) {
2602                         int unlink_ret;
2603
2604                         unlink_ret = unlink(target);
2605                         if (unlink_ret)
2606                                 error("unlink output file %s failed: %m",
2607                                                 target);
2608                 }
2609         }
2610
2611         btrfs_close_all_devices();
2612
2613         return !!ret;
2614 }