btrfs-progs: build: mention library dependency for reiserfs
[platform/upstream/btrfs-progs.git] / image / main.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
17  */
18
19 #include <pthread.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <fcntl.h>
25 #include <unistd.h>
26 #include <dirent.h>
27 #include <zlib.h>
28 #include <getopt.h>
29
30 #include "kerncompat.h"
31 #include "crc32c.h"
32 #include "ctree.h"
33 #include "disk-io.h"
34 #include "transaction.h"
35 #include "utils.h"
36 #include "volumes.h"
37 #include "extent_io.h"
38 #include "help.h"
39 #include "image/metadump.h"
40 #include "image/sanitize.h"
41
/* Upper bound on worker threads for both the dump and restore paths. */
#define MAX_WORKER_THREADS      (32)
/*
 * One unit of work handed to the worker threads: a byte range
 * [start, start + size) of the filesystem plus the buffer holding it.
 */
struct async_work {
        struct list_head list;          /* on metadump_struct::list (compression queue) */
        struct list_head ordered;       /* on metadump_struct::ordered (output order) */
        u64 start;                      /* logical start of the range */
        u64 size;                       /* uncompressed length of the range */
        u8 *buffer;                     /* data; replaced by compressed copy when zlib is used */
        size_t bufsize;                 /* current length of @buffer */
        int error;                      /* set when compression failed */
};
53
/*
 * State shared between the main dump loop and the worker threads.
 * Queue/counter fields are protected by @mutex; workers wait on @cond.
 */
struct metadump_struct {
        struct btrfs_root *root;
        FILE *out;                      /* destination image stream */

        /* current cluster header + item index, padded to one block */
        union {
                struct meta_cluster cluster;
                char meta_cluster_bytes[BLOCK_SIZE];
        };

        pthread_t threads[MAX_WORKER_THREADS];
        size_t num_threads;
        pthread_mutex_t mutex;
        pthread_cond_t cond;
        struct rb_root name_tree;       /* cache of sanitized names (struct name) */

        struct list_head list;          /* work queued for compression */
        struct list_head ordered;       /* all items of the cluster, in output order */
        size_t num_items;               /* items in the current cluster */
        size_t num_ready;               /* items the workers have finished */

        /* byte range accumulated by add_extent(), emitted by flush_pending() */
        u64 pending_start;
        u64 pending_size;

        int compress_level;             /* > 0 enables zlib compression */
        int done;                       /* tells worker threads to exit */
        int data;                       /* pending range is data, not metadata */
        enum sanitize_mode sanitize_names;

        int error;                      /* first worker error (negative errno) */
};
84
/*
 * State for restoring an image.  Several mode flags (old_restore,
 * fixup_offset, multi_devices, clear_space_cache) select behavior in
 * restore paths that are largely outside this chunk.
 */
struct mdrestore_struct {
        FILE *in;                       /* image being read; may be stdin */
        FILE *out;                      /* restore target */

        pthread_t threads[MAX_WORKER_THREADS];
        size_t num_threads;
        pthread_mutex_t mutex;
        pthread_cond_t cond;

        struct rb_root chunk_tree;      /* fs_chunk nodes keyed by logical address */
        /* NOTE(review): presumably fs_chunk keyed by physical address — confirm */
        struct rb_root physical_tree;
        struct list_head list;
        struct list_head overlapping_chunks;
        size_t num_items;
        u32 nodesize;
        u64 devid;
        u64 alloced_chunks;
        u64 last_physical_offset;
        u8 uuid[BTRFS_UUID_SIZE];
        u8 fsid[BTRFS_FSID_SIZE];

        int compress_method;
        int done;
        int error;
        int old_restore;
        int fixup_offset;
        int multi_devices;
        int clear_space_cache;
        struct btrfs_fs_info *info;
};
115
116 static int search_for_chunk_blocks(struct mdrestore_struct *mdres,
117                                    u64 search, u64 cluster_bytenr);
118 static struct extent_buffer *alloc_dummy_eb(u64 bytenr, u32 size);
119
/*
 * Recompute the crc32c checksum of a metadata block in place.
 *
 * The checksum covers everything after the leading BTRFS_CSUM_SIZE bytes;
 * the finalized result is written at the start of @buf.
 */
static void csum_block(u8 *buf, size_t len)
{
        u8 result[BTRFS_CRC32_SIZE];
        u32 crc = ~(u32)0;
        crc = crc32c(crc, buf + BTRFS_CSUM_SIZE, len - BTRFS_CSUM_SIZE);
        btrfs_csum_final(crc, result);
        memcpy(buf, result, BTRFS_CRC32_SIZE);
}
128
129 static int has_name(struct btrfs_key *key)
130 {
131         switch (key->type) {
132         case BTRFS_DIR_ITEM_KEY:
133         case BTRFS_DIR_INDEX_KEY:
134         case BTRFS_INODE_REF_KEY:
135         case BTRFS_INODE_EXTREF_KEY:
136         case BTRFS_XATTR_ITEM_KEY:
137                 return 1;
138         default:
139                 break;
140         }
141
142         return 0;
143 }
144
145 static int chunk_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
146 {
147         struct fs_chunk *entry = rb_entry(a, struct fs_chunk, l);
148         struct fs_chunk *ins = rb_entry(b, struct fs_chunk, l);
149
150         if (fuzz && ins->logical >= entry->logical &&
151             ins->logical < entry->logical + entry->bytes)
152                 return 0;
153
154         if (ins->logical < entry->logical)
155                 return -1;
156         else if (ins->logical > entry->logical)
157                 return 1;
158         return 0;
159 }
160
161 static int physical_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
162 {
163         struct fs_chunk *entry = rb_entry(a, struct fs_chunk, p);
164         struct fs_chunk *ins = rb_entry(b, struct fs_chunk, p);
165
166         if (fuzz && ins->physical >= entry->physical &&
167             ins->physical < entry->physical + entry->bytes)
168                 return 0;
169
170         if (fuzz && entry->physical >= ins->physical &&
171             entry->physical < ins->physical + ins->bytes)
172                 return 0;
173
174         if (ins->physical < entry->physical)
175                 return -1;
176         else if (ins->physical > entry->physical)
177                 return 1;
178         return 0;
179 }
180
181 static void tree_insert(struct rb_root *root, struct rb_node *ins,
182                         int (*cmp)(struct rb_node *a, struct rb_node *b,
183                                    int fuzz))
184 {
185         struct rb_node ** p = &root->rb_node;
186         struct rb_node * parent = NULL;
187         int dir;
188
189         while(*p) {
190                 parent = *p;
191
192                 dir = cmp(*p, ins, 1);
193                 if (dir < 0)
194                         p = &(*p)->rb_left;
195                 else if (dir > 0)
196                         p = &(*p)->rb_right;
197                 else
198                         BUG();
199         }
200
201         rb_link_node(ins, parent, p);
202         rb_insert_color(ins, root);
203 }
204
205 static struct rb_node *tree_search(struct rb_root *root,
206                                    struct rb_node *search,
207                                    int (*cmp)(struct rb_node *a,
208                                               struct rb_node *b, int fuzz),
209                                    int fuzz)
210 {
211         struct rb_node *n = root->rb_node;
212         int dir;
213
214         while (n) {
215                 dir = cmp(n, search, fuzz);
216                 if (dir < 0)
217                         n = n->rb_left;
218                 else if (dir > 0)
219                         n = n->rb_right;
220                 else
221                         return n;
222         }
223
224         return NULL;
225 }
226
/*
 * Map @logical (an address in the dumped filesystem) to the physical
 * offset it should be restored to, using mdres->chunk_tree.
 *
 * *@size is clamped so the returned range does not extend past the end of
 * the containing chunk.  If @physical_dup is non-NULL it is set to the
 * offset of the second DUP copy, or 0 when the chunk is not DUP.
 *
 * The super block offset maps to itself; an address with no chunk is
 * returned unchanged (with a warning unless the image comes from stdin).
 */
static u64 logical_to_physical(struct mdrestore_struct *mdres, u64 logical,
                               u64 *size, u64 *physical_dup)
{
        struct fs_chunk *fs_chunk;
        struct rb_node *entry;
        struct fs_chunk search;
        u64 offset;

        if (logical == BTRFS_SUPER_INFO_OFFSET)
                return logical;

        search.logical = logical;
        entry = tree_search(&mdres->chunk_tree, &search.l, chunk_cmp, 1);
        if (!entry) {
                if (mdres->in != stdin)
                        warning("cannot find a chunk, using logical");
                return logical;
        }
        fs_chunk = rb_entry(entry, struct fs_chunk, l);
        /* fuzzy match must still contain @logical */
        if (fs_chunk->logical > logical || fs_chunk->logical + fs_chunk->bytes < logical)
                BUG();
        offset = search.logical - fs_chunk->logical;

        if (physical_dup) {
                /* Only in dup case, physical_dup is not equal to 0 */
                if (fs_chunk->physical_dup)
                        *physical_dup = fs_chunk->physical_dup + offset;
                else
                        *physical_dup = 0;
        }

        *size = min(*size, fs_chunk->bytes + fs_chunk->logical - logical);
        return fs_chunk->physical + offset;
}
261
/*
 * Zero the payload of items in the output copy @dst of leaf @src that
 * would leak user data into the image: checksum items and inline file
 * extents.  When name sanitizing is enabled, name-bearing items are
 * rewritten via sanitize_name() instead.
 */
static void zero_items(struct metadump_struct *md, u8 *dst,
                       struct extent_buffer *src)
{
        struct btrfs_file_extent_item *fi;
        struct btrfs_item *item;
        struct btrfs_key key;
        u32 nritems = btrfs_header_nritems(src);
        size_t size;
        unsigned long ptr;
        int i, extent_type;

        for (i = 0; i < nritems; i++) {
                item = btrfs_item_nr(i);
                btrfs_item_key_to_cpu(src, &key, i);
                /* csum items: zero the whole payload */
                if (key.type == BTRFS_CSUM_ITEM_KEY) {
                        size = btrfs_item_size_nr(src, i);
                        memset(dst + btrfs_leaf_data(src) +
                               btrfs_item_offset_nr(src, i), 0, size);
                        continue;
                }

                if (md->sanitize_names && has_name(&key)) {
                        sanitize_name(md->sanitize_names, &md->name_tree, dst,
                                        src, &key, i);
                        continue;
                }

                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;

                /* only inline extents carry file data inside the leaf */
                fi = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
                extent_type = btrfs_file_extent_type(src, fi);
                if (extent_type != BTRFS_FILE_EXTENT_INLINE)
                        continue;

                ptr = btrfs_file_extent_inline_start(fi);
                size = btrfs_file_extent_inline_item_len(src, item);
                memset(dst + ptr, 0, size);
        }
}
305
/*
 * Copy tree block @src into @dst, zero the unused regions (the gap
 * between the item array and item data in a leaf, the tail of a node)
 * and sensitive item payloads, then recompute the block checksum.
 *
 * The super block is copied verbatim.
 */
static void copy_buffer(struct metadump_struct *md, u8 *dst,
                        struct extent_buffer *src)
{
        int level;
        size_t size;
        u32 nritems;

        memcpy(dst, src->data, src->len);
        if (src->start == BTRFS_SUPER_INFO_OFFSET)
                return;

        level = btrfs_header_level(src);
        nritems = btrfs_header_nritems(src);

        if (nritems == 0) {
                /* empty block: keep only the header */
                size = sizeof(struct btrfs_header);
                memset(dst + size, 0, src->len - size);
        } else if (level == 0) {
                /* leaf: zero the gap between item headers and item data */
                size = btrfs_leaf_data(src) +
                        btrfs_item_offset_nr(src, nritems - 1) -
                        btrfs_item_nr_offset(nritems);
                memset(dst + btrfs_item_nr_offset(nritems), 0, size);
                zero_items(md, dst, src);
        } else {
                /* node: zero everything after the last key pointer */
                size = offsetof(struct btrfs_node, ptrs) +
                        sizeof(struct btrfs_key_ptr) * nritems;
                memset(dst + size, 0, src->len - size);
        }
        csum_block(dst, src->len);
}
339
340 static void *dump_worker(void *data)
341 {
342         struct metadump_struct *md = (struct metadump_struct *)data;
343         struct async_work *async;
344         int ret;
345
346         while (1) {
347                 pthread_mutex_lock(&md->mutex);
348                 while (list_empty(&md->list)) {
349                         if (md->done) {
350                                 pthread_mutex_unlock(&md->mutex);
351                                 goto out;
352                         }
353                         pthread_cond_wait(&md->cond, &md->mutex);
354                 }
355                 async = list_entry(md->list.next, struct async_work, list);
356                 list_del_init(&async->list);
357                 pthread_mutex_unlock(&md->mutex);
358
359                 if (md->compress_level > 0) {
360                         u8 *orig = async->buffer;
361
362                         async->bufsize = compressBound(async->size);
363                         async->buffer = malloc(async->bufsize);
364                         if (!async->buffer) {
365                                 error("not enough memory for async buffer");
366                                 pthread_mutex_lock(&md->mutex);
367                                 if (!md->error)
368                                         md->error = -ENOMEM;
369                                 pthread_mutex_unlock(&md->mutex);
370                                 pthread_exit(NULL);
371                         }
372
373                         ret = compress2(async->buffer,
374                                          (unsigned long *)&async->bufsize,
375                                          orig, async->size, md->compress_level);
376
377                         if (ret != Z_OK)
378                                 async->error = 1;
379
380                         free(orig);
381                 }
382
383                 pthread_mutex_lock(&md->mutex);
384                 md->num_ready++;
385                 pthread_mutex_unlock(&md->mutex);
386         }
387 out:
388         pthread_exit(NULL);
389 }
390
/*
 * Reset the in-memory cluster to an empty one starting at image offset
 * @start, and reset the per-cluster item counters.
 */
static void meta_cluster_init(struct metadump_struct *md, u64 start)
{
        struct meta_cluster_header *header;

        md->num_items = 0;
        md->num_ready = 0;
        header = &md->cluster.header;
        header->magic = cpu_to_le64(HEADER_MAGIC);
        header->bytenr = cpu_to_le64(start);
        header->nritems = cpu_to_le32(0);
        header->compress = md->compress_level > 0 ?
                           COMPRESS_ZLIB : COMPRESS_NONE;
}
404
/*
 * Stop and join @num_threads worker threads, destroy the synchronization
 * primitives and free the sanitized-name cache.
 */
static void metadump_destroy(struct metadump_struct *md, int num_threads)
{
        int i;
        struct rb_node *n;

        /* wake all workers so they observe md->done and exit */
        pthread_mutex_lock(&md->mutex);
        md->done = 1;
        pthread_cond_broadcast(&md->cond);
        pthread_mutex_unlock(&md->mutex);

        for (i = 0; i < num_threads; i++)
                pthread_join(md->threads[i], NULL);

        pthread_cond_destroy(&md->cond);
        pthread_mutex_destroy(&md->mutex);

        while ((n = rb_first(&md->name_tree))) {
                struct name *name;

                name = rb_entry(n, struct name, n);
                rb_erase(n, &md->name_tree);
                free(name->val);
                free(name->sub);
                free(name);
        }
}
431
432 static int metadump_init(struct metadump_struct *md, struct btrfs_root *root,
433                          FILE *out, int num_threads, int compress_level,
434                          enum sanitize_mode sanitize_names)
435 {
436         int i, ret = 0;
437
438         memset(md, 0, sizeof(*md));
439         INIT_LIST_HEAD(&md->list);
440         INIT_LIST_HEAD(&md->ordered);
441         md->root = root;
442         md->out = out;
443         md->pending_start = (u64)-1;
444         md->compress_level = compress_level;
445         md->sanitize_names = sanitize_names;
446         if (sanitize_names == SANITIZE_COLLISIONS)
447                 crc32c_optimization_init();
448
449         md->name_tree.rb_node = NULL;
450         md->num_threads = num_threads;
451         pthread_cond_init(&md->cond, NULL);
452         pthread_mutex_init(&md->mutex, NULL);
453         meta_cluster_init(md, 0);
454
455         if (!num_threads)
456                 return 0;
457
458         for (i = 0; i < num_threads; i++) {
459                 ret = pthread_create(md->threads + i, NULL, dump_worker, md);
460                 if (ret)
461                         break;
462         }
463
464         if (ret)
465                 metadump_destroy(md, i + 1);
466
467         return ret;
468 }
469
470 static int write_zero(FILE *out, size_t size)
471 {
472         static char zero[BLOCK_SIZE];
473         return fwrite(zero, size, 1, out);
474 }
475
/*
 * Write the current cluster: wait until the workers have processed every
 * queued item, write the cluster header/index block, then each item's
 * buffer in order, then pad the output up to a block boundary.
 *
 * Called with md->mutex held; the wait loop drops it while sleeping.
 * *@next receives the image offset of the next cluster.  Returns 0 or a
 * negative errno.
 *
 * NOTE(review): on a failed header fwrite() this returns without freeing
 * the queued async buffers — acceptable only because callers treat it as
 * fatal; confirm before reusing this path.
 */
static int write_buffers(struct metadump_struct *md, u64 *next)
{
        struct meta_cluster_header *header = &md->cluster.header;
        struct meta_cluster_item *item;
        struct async_work *async;
        u64 bytenr = 0;
        u32 nritems = 0;
        int ret;
        int err = 0;

        if (list_empty(&md->ordered))
                goto out;

        /* wait until all buffers are compressed */
        while (!err && md->num_items > md->num_ready) {
                struct timespec ts = {
                        .tv_sec = 0,
                        .tv_nsec = 10000000,
                };
                pthread_mutex_unlock(&md->mutex);
                nanosleep(&ts, NULL);
                pthread_mutex_lock(&md->mutex);
                err = md->error;
        }

        if (err) {
                error("one of the threads failed: %s", strerror(-err));
                goto out;
        }

        /* setup and write index block */
        list_for_each_entry(async, &md->ordered, ordered) {
                item = &md->cluster.items[nritems];
                item->bytenr = cpu_to_le64(async->start);
                item->size = cpu_to_le32(async->bufsize);
                nritems++;
        }
        header->nritems = cpu_to_le32(nritems);

        ret = fwrite(&md->cluster, BLOCK_SIZE, 1, md->out);
        if (ret != 1) {
                error("unable to write out cluster: %s", strerror(errno));
                return -errno;
        }

        /* write buffers */
        bytenr += le64_to_cpu(header->bytenr) + BLOCK_SIZE;
        while (!list_empty(&md->ordered)) {
                async = list_entry(md->ordered.next, struct async_work,
                                   ordered);
                list_del_init(&async->ordered);

                /* after an error keep draining the list to free buffers */
                bytenr += async->bufsize;
                if (!err)
                        ret = fwrite(async->buffer, async->bufsize, 1,
                                     md->out);
                if (ret != 1) {
                        error("unable to write out cluster: %s",
                                strerror(errno));
                        err = -errno;
                        ret = 0;
                }

                free(async->buffer);
                free(async);
        }

        /* zero unused space in the last block */
        if (!err && bytenr & BLOCK_MASK) {
                size_t size = BLOCK_SIZE - (bytenr & BLOCK_MASK);

                bytenr += size;
                ret = write_zero(md->out, size);
                if (ret != 1) {
                        error("unable to zero out buffer: %s",
                                strerror(errno));
                        err = -errno;
                }
        }
out:
        *next = bytenr;
        return err;
}
559
/*
 * Read the data extent [async->start, async->start + async->size) into
 * async->buffer.
 *
 * Mirrors are tried in order; data read from an earlier mirror is kept
 * and the next mirror continues at the first unread byte.  Returns -EIO
 * when no combination of mirrors could supply the full range.
 */
static int read_data_extent(struct metadump_struct *md,
                            struct async_work *async)
{
        struct btrfs_root *root = md->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytes_left = async->size;
        u64 logical = async->start;
        u64 offset = 0;
        u64 read_len;
        int num_copies;
        int cur_mirror;
        int ret;

        num_copies = btrfs_num_copies(root->fs_info, logical, bytes_left);

        /* Try our best to read data, just like read_tree_block() */
        for (cur_mirror = 0; cur_mirror < num_copies; cur_mirror++) {
                while (bytes_left) {
                        /* read_extent_data() trims read_len to what it got */
                        read_len = bytes_left;
                        ret = read_extent_data(fs_info,
                                        (char *)(async->buffer + offset),
                                        logical, &read_len, cur_mirror);
                        if (ret < 0)
                                break;
                        offset += read_len;
                        logical += read_len;
                        bytes_left -= read_len;
                }
        }
        if (bytes_left)
                return -EIO;
        return 0;
}
593
594 static int get_dev_fd(struct btrfs_root *root)
595 {
596         struct btrfs_device *dev;
597
598         dev = list_first_entry(&root->fs_info->fs_devices->devices,
599                                struct btrfs_device, dev_list);
600         return dev->fd;
601 }
602
603 static int flush_pending(struct metadump_struct *md, int done)
604 {
605         struct async_work *async = NULL;
606         struct extent_buffer *eb;
607         u64 start = 0;
608         u64 size;
609         size_t offset;
610         int ret = 0;
611
612         if (md->pending_size) {
613                 async = calloc(1, sizeof(*async));
614                 if (!async)
615                         return -ENOMEM;
616
617                 async->start = md->pending_start;
618                 async->size = md->pending_size;
619                 async->bufsize = async->size;
620                 async->buffer = malloc(async->bufsize);
621                 if (!async->buffer) {
622                         free(async);
623                         return -ENOMEM;
624                 }
625                 offset = 0;
626                 start = async->start;
627                 size = async->size;
628
629                 if (md->data) {
630                         ret = read_data_extent(md, async);
631                         if (ret) {
632                                 free(async->buffer);
633                                 free(async);
634                                 return ret;
635                         }
636                 }
637
638                 /*
639                  * Balance can make the mapping not cover the super block, so
640                  * just copy directly from one of the devices.
641                  */
642                 if (start == BTRFS_SUPER_INFO_OFFSET) {
643                         int fd = get_dev_fd(md->root);
644
645                         ret = pread64(fd, async->buffer, size, start);
646                         if (ret < size) {
647                                 free(async->buffer);
648                                 free(async);
649                                 error("unable to read superblock at %llu: %s",
650                                                 (unsigned long long)start,
651                                                 strerror(errno));
652                                 return -errno;
653                         }
654                         size = 0;
655                         ret = 0;
656                 }
657
658                 while (!md->data && size > 0) {
659                         u64 this_read = min((u64)md->root->fs_info->nodesize,
660                                         size);
661
662                         eb = read_tree_block(md->root->fs_info, start, 0);
663                         if (!extent_buffer_uptodate(eb)) {
664                                 free(async->buffer);
665                                 free(async);
666                                 error("unable to read metadata block %llu",
667                                         (unsigned long long)start);
668                                 return -EIO;
669                         }
670                         copy_buffer(md, async->buffer + offset, eb);
671                         free_extent_buffer(eb);
672                         start += this_read;
673                         offset += this_read;
674                         size -= this_read;
675                 }
676
677                 md->pending_start = (u64)-1;
678                 md->pending_size = 0;
679         } else if (!done) {
680                 return 0;
681         }
682
683         pthread_mutex_lock(&md->mutex);
684         if (async) {
685                 list_add_tail(&async->ordered, &md->ordered);
686                 md->num_items++;
687                 if (md->compress_level > 0) {
688                         list_add_tail(&async->list, &md->list);
689                         pthread_cond_signal(&md->cond);
690                 } else {
691                         md->num_ready++;
692                 }
693         }
694         if (md->num_items >= ITEMS_PER_CLUSTER || done) {
695                 ret = write_buffers(md, &start);
696                 if (ret)
697                         error("unable to write buffers: %s", strerror(-ret));
698                 else
699                         meta_cluster_init(md, start);
700         }
701         pthread_mutex_unlock(&md->mutex);
702         return ret;
703 }
704
705 static int add_extent(u64 start, u64 size, struct metadump_struct *md,
706                       int data)
707 {
708         int ret;
709         if (md->data != data ||
710             md->pending_size + size > MAX_PENDING_SIZE ||
711             md->pending_start + md->pending_size != start) {
712                 ret = flush_pending(md, 0);
713                 if (ret)
714                         return ret;
715                 md->pending_start = start;
716         }
717         readahead_tree_block(md->root->fs_info, start, 0);
718         md->pending_size += size;
719         md->data = data;
720         return 0;
721 }
722
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * On an extent-tree-v0 filesystem, decide whether the extent at @bytenr
 * is a tree block by scanning its EXTENT_REF_V0 items: a ref whose
 * objectid is below BTRFS_FIRST_FREE_OBJECTID belongs to a tree.
 *
 * @path must be positioned on the extent's item; the slot is advanced.
 * Returns 1 for a tree block, 0 otherwise, negative on search error.
 */
static int is_tree_block(struct btrfs_root *extent_root,
                         struct btrfs_path *path, u64 bytenr)
{
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 ref_objectid;
        int ret;

        leaf = path->nodes[0];
        while (1) {
                struct btrfs_extent_ref_v0 *ref_item;
                path->slots[0]++;
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                return ret;
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                /* ran past the refs belonging to this extent */
                if (key.objectid != bytenr)
                        break;
                if (key.type != BTRFS_EXTENT_REF_V0_KEY)
                        continue;
                ref_item = btrfs_item_ptr(leaf, path->slots[0],
                                          struct btrfs_extent_ref_v0);
                ref_objectid = btrfs_ref_objectid_v0(leaf, ref_item);
                if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID)
                        return 1;
                break;
        }
        return 0;
}
#endif
759
/*
 * Recursively add the tree rooted at @eb to the dump.
 *
 * With @root_tree set, leaves are also scanned for ROOT_ITEMs and each
 * referenced subtree is copied (used when walking the log root tree);
 * otherwise only the node levels are followed down to the leaves.
 *
 * NOTE(review): when read_tree_block() yields a non-uptodate buffer the
 * buffer is not released here — confirm whether free_extent_buffer() is
 * safe on that error path before changing it.
 */
static int copy_tree_blocks(struct btrfs_root *root, struct extent_buffer *eb,
                            struct metadump_struct *metadump, int root_tree)
{
        struct extent_buffer *tmp;
        struct btrfs_root_item *ri;
        struct btrfs_key key;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 bytenr;
        int level;
        int nritems = 0;
        int i = 0;
        int ret;

        ret = add_extent(btrfs_header_bytenr(eb), fs_info->nodesize,
                         metadump, 0);
        if (ret) {
                error("unable to add metadata block %llu: %d",
                                btrfs_header_bytenr(eb), ret);
                return ret;
        }

        /* plain leaf: nothing further to descend into */
        if (btrfs_header_level(eb) == 0 && !root_tree)
                return 0;

        level = btrfs_header_level(eb);
        nritems = btrfs_header_nritems(eb);
        for (i = 0; i < nritems; i++) {
                if (level == 0) {
                        btrfs_item_key_to_cpu(eb, &key, i);
                        if (key.type != BTRFS_ROOT_ITEM_KEY)
                                continue;
                        ri = btrfs_item_ptr(eb, i, struct btrfs_root_item);
                        bytenr = btrfs_disk_root_bytenr(eb, ri);
                        tmp = read_tree_block(fs_info, bytenr, 0);
                        if (!extent_buffer_uptodate(tmp)) {
                                error("unable to read log root block");
                                return -EIO;
                        }
                        ret = copy_tree_blocks(root, tmp, metadump, 0);
                        free_extent_buffer(tmp);
                        if (ret)
                                return ret;
                } else {
                        bytenr = btrfs_node_blockptr(eb, i);
                        tmp = read_tree_block(fs_info, bytenr, 0);
                        if (!extent_buffer_uptodate(tmp)) {
                                error("unable to read log root block");
                                return -EIO;
                        }
                        ret = copy_tree_blocks(root, tmp, metadump, root_tree);
                        free_extent_buffer(tmp);
                        if (ret)
                                return ret;
                }
        }

        return 0;
}
818
819 static int copy_log_trees(struct btrfs_root *root,
820                           struct metadump_struct *metadump)
821 {
822         u64 blocknr = btrfs_super_log_root(root->fs_info->super_copy);
823
824         if (blocknr == 0)
825                 return 0;
826
827         if (!root->fs_info->log_root_tree ||
828             !root->fs_info->log_root_tree->node) {
829                 error("unable to copy tree log, it has not been setup");
830                 return -EIO;
831         }
832
833         return copy_tree_blocks(root, root->fs_info->log_root_tree->node,
834                                 metadump, 1);
835 }
836
/*
 * Add the data extents backing the v1 free space cache inodes (stored as
 * regular EXTENT_DATA items in the tree root) to the dump, so the image
 * preserves the space cache contents.
 */
static int copy_space_cache(struct btrfs_root *root,
                            struct metadump_struct *metadump,
                            struct btrfs_path *path)
{
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 bytenr, num_bytes;
        int ret;

        /* v1 space cache inodes live in the tree root */
        root = root->fs_info->tree_root;

        key.objectid = 0;
        key.type = BTRFS_EXTENT_DATA_KEY;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0) {
                error("free space inode not found: %d", ret);
                return ret;
        }

        leaf = path->nodes[0];

        while (1) {
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret < 0) {
                                error("cannot go to next leaf %d", ret);
                                return ret;
                        }
                        if (ret > 0)
                                break;
                        leaf = path->nodes[0];
                }

                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.type != BTRFS_EXTENT_DATA_KEY) {
                        path->slots[0]++;
                        continue;
                }

                fi = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_file_extent_item);
                /* cache payload is stored as regular extents only */
                if (btrfs_file_extent_type(leaf, fi) !=
                    BTRFS_FILE_EXTENT_REG) {
                        path->slots[0]++;
                        continue;
                }

                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
                ret = add_extent(bytenr, num_bytes, metadump, 1);
                if (ret) {
                        error("unable to add space cache blocks %d", ret);
                        btrfs_release_path(path);
                        return ret;
                }
                path->slots[0]++;
        }

        return 0;
}
900
/*
 * Collect all tree blocks of the filesystem by scanning the extent tree.
 *
 * Iterates EXTENT_ITEM/METADATA_ITEM entries starting just past the primary
 * superblock and queues every extent flagged as a tree block via
 * add_extent().  Returns 0 on success, a negative error code on failure.
 */
static int copy_from_extent_tree(struct metadump_struct *metadump,
				 struct btrfs_path *path)
{
	struct btrfs_root *extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u64 bytenr;
	u64 num_bytes;
	int ret;

	extent_root = metadump->root->fs_info->extent_root;
	/* Start scanning right after the primary superblock. */
	bytenr = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0) {
		error("extent root not found: %d", ret);
		return ret;
	}
	ret = 0;

	leaf = path->nodes[0];

	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0) {
				error("cannot go to next leaf %d", ret);
				break;
			}
			if (ret > 0) {
				/* Reached the end of the extent tree. */
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* Skip items before our cursor and non-extent item types. */
		if (key.objectid < bytenr ||
		    (key.type != BTRFS_EXTENT_ITEM_KEY &&
		     key.type != BTRFS_METADATA_ITEM_KEY)) {
			path->slots[0]++;
			continue;
		}

		bytenr = key.objectid;
		if (key.type == BTRFS_METADATA_ITEM_KEY) {
			/*
			 * Skinny metadata items do not carry the extent
			 * length in the key; tree blocks are nodesize long.
			 */
			num_bytes = extent_root->fs_info->nodesize;
		} else {
			num_bytes = key.offset;
		}

		if (num_bytes == 0) {
			error("extent length 0 at bytenr %llu key type %d",
					(unsigned long long)bytenr, key.type);
			ret = -EIO;
			break;
		}

		/*
		 * A full (non-v0) extent item carries flags telling whether
		 * this extent is a tree block; v0 items are smaller and need
		 * a separate lookup.
		 */
		if (btrfs_item_size_nr(leaf, path->slots[0]) > sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			if (btrfs_extent_flags(leaf, ei) &
			    BTRFS_EXTENT_FLAG_TREE_BLOCK) {
				ret = add_extent(bytenr, num_bytes, metadump,
						 0);
				if (ret) {
					error("unable to add block %llu: %d",
						(unsigned long long)bytenr, ret);
					break;
				}
			}
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			ret = is_tree_block(extent_root, path, bytenr);
			if (ret < 0) {
				error("failed to check tree block %llu: %d",
					(unsigned long long)bytenr, ret);
				break;
			}

			if (ret) {
				ret = add_extent(bytenr, num_bytes, metadump,
						 0);
				if (ret) {
					error("unable to add block %llu: %d",
						(unsigned long long)bytenr, ret);
					break;
				}
			}
			ret = 0;
#else
			error(
	"either extent tree is corrupted or you haven't built with V0 support");
			ret = -EIO;
			break;
#endif
		}
		/* Advance the cursor past this extent. */
		bytenr += num_bytes;
	}

	btrfs_release_path(path);

	return ret;
}
1009
1010 static int create_metadump(const char *input, FILE *out, int num_threads,
1011                            int compress_level, enum sanitize_mode sanitize,
1012                            int walk_trees)
1013 {
1014         struct btrfs_root *root;
1015         struct btrfs_path path;
1016         struct metadump_struct metadump;
1017         int ret;
1018         int err = 0;
1019
1020         root = open_ctree(input, 0, 0);
1021         if (!root) {
1022                 error("open ctree failed");
1023                 return -EIO;
1024         }
1025
1026         ret = metadump_init(&metadump, root, out, num_threads,
1027                             compress_level, sanitize);
1028         if (ret) {
1029                 error("failed to initialize metadump: %d", ret);
1030                 close_ctree(root);
1031                 return ret;
1032         }
1033
1034         ret = add_extent(BTRFS_SUPER_INFO_OFFSET, BTRFS_SUPER_INFO_SIZE,
1035                         &metadump, 0);
1036         if (ret) {
1037                 error("unable to add metadata: %d", ret);
1038                 err = ret;
1039                 goto out;
1040         }
1041
1042         btrfs_init_path(&path);
1043
1044         if (walk_trees) {
1045                 ret = copy_tree_blocks(root, root->fs_info->chunk_root->node,
1046                                        &metadump, 1);
1047                 if (ret) {
1048                         err = ret;
1049                         goto out;
1050                 }
1051
1052                 ret = copy_tree_blocks(root, root->fs_info->tree_root->node,
1053                                        &metadump, 1);
1054                 if (ret) {
1055                         err = ret;
1056                         goto out;
1057                 }
1058         } else {
1059                 ret = copy_from_extent_tree(&metadump, &path);
1060                 if (ret) {
1061                         err = ret;
1062                         goto out;
1063                 }
1064         }
1065
1066         ret = copy_log_trees(root, &metadump);
1067         if (ret) {
1068                 err = ret;
1069                 goto out;
1070         }
1071
1072         ret = copy_space_cache(root, &metadump, &path);
1073 out:
1074         ret = flush_pending(&metadump, 1);
1075         if (ret) {
1076                 if (!err)
1077                         err = ret;
1078                 error("failed to flush pending data: %d", ret);
1079         }
1080
1081         metadump_destroy(&metadump, num_threads);
1082
1083         btrfs_release_path(&path);
1084         ret = close_ctree(root);
1085         return err ? err : ret;
1086 }
1087
1088 static void update_super_old(u8 *buffer)
1089 {
1090         struct btrfs_super_block *super = (struct btrfs_super_block *)buffer;
1091         struct btrfs_chunk *chunk;
1092         struct btrfs_disk_key *key;
1093         u32 sectorsize = btrfs_super_sectorsize(super);
1094         u64 flags = btrfs_super_flags(super);
1095
1096         flags |= BTRFS_SUPER_FLAG_METADUMP;
1097         btrfs_set_super_flags(super, flags);
1098
1099         key = (struct btrfs_disk_key *)(super->sys_chunk_array);
1100         chunk = (struct btrfs_chunk *)(super->sys_chunk_array +
1101                                        sizeof(struct btrfs_disk_key));
1102
1103         btrfs_set_disk_key_objectid(key, BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1104         btrfs_set_disk_key_type(key, BTRFS_CHUNK_ITEM_KEY);
1105         btrfs_set_disk_key_offset(key, 0);
1106
1107         btrfs_set_stack_chunk_length(chunk, (u64)-1);
1108         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
1109         btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
1110         btrfs_set_stack_chunk_type(chunk, BTRFS_BLOCK_GROUP_SYSTEM);
1111         btrfs_set_stack_chunk_io_align(chunk, sectorsize);
1112         btrfs_set_stack_chunk_io_width(chunk, sectorsize);
1113         btrfs_set_stack_chunk_sector_size(chunk, sectorsize);
1114         btrfs_set_stack_chunk_num_stripes(chunk, 1);
1115         btrfs_set_stack_chunk_sub_stripes(chunk, 0);
1116         chunk->stripe.devid = super->dev_item.devid;
1117         btrfs_set_stack_stripe_offset(&chunk->stripe, 0);
1118         memcpy(chunk->stripe.dev_uuid, super->dev_item.uuid, BTRFS_UUID_SIZE);
1119         btrfs_set_super_sys_array_size(super, sizeof(*key) + sizeof(*chunk));
1120         csum_block(buffer, BTRFS_SUPER_INFO_SIZE);
1121 }
1122
/*
 * Rewrite the superblock of a restored image for single-device use.
 *
 * Compacts the sys chunk array in place: every chunk is remapped to its
 * physical location inside the image file and reduced to one stripe (DUP
 * chunks keep a second stripe).  Marks the super with METADUMP_V2, forces
 * num_devices to 1 and re-checksums the block.
 *
 * Returns 0 on success, -EIO if the sys array contains a non-chunk key.
 */
static int update_super(struct mdrestore_struct *mdres, u8 *buffer)
{
	struct btrfs_super_block *super = (struct btrfs_super_block *)buffer;
	struct btrfs_chunk *chunk;
	struct btrfs_disk_key *disk_key;
	struct btrfs_key key;
	u64 flags = btrfs_super_flags(super);
	u32 new_array_size = 0;
	u32 array_size;
	u32 cur = 0;
	u8 *ptr, *write_ptr;
	int old_num_stripes;

	/* ptr reads the old array; write_ptr emits the compacted one. */
	write_ptr = ptr = super->sys_chunk_array;
	array_size = btrfs_super_sys_array_size(super);

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		/* The key is copied through unchanged. */
		new_array_size += sizeof(*disk_key);
		memmove(write_ptr, ptr, sizeof(*disk_key));

		write_ptr += sizeof(*disk_key);
		ptr += sizeof(*disk_key);
		cur += sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			u64 type, physical, physical_dup, size = 0;

			/* Remember the stripe count before compaction. */
			chunk = (struct btrfs_chunk *)ptr;
			old_num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			chunk = (struct btrfs_chunk *)write_ptr;

			/* sizeof(*chunk) includes the first stripe. */
			memmove(write_ptr, ptr, sizeof(*chunk));
			btrfs_set_stack_chunk_sub_stripes(chunk, 0);
			type = btrfs_stack_chunk_type(chunk);
			if (type & BTRFS_BLOCK_GROUP_DUP) {
				/* DUP keeps its second stripe in the array. */
				new_array_size += sizeof(struct btrfs_stripe);
				write_ptr += sizeof(struct btrfs_stripe);
			} else {
				btrfs_set_stack_chunk_num_stripes(chunk, 1);
				btrfs_set_stack_chunk_type(chunk,
						BTRFS_BLOCK_GROUP_SYSTEM);
			}
			chunk->stripe.devid = super->dev_item.devid;
			/* Remap the chunk to its offset inside the image. */
			physical = logical_to_physical(mdres, key.offset,
						       &size, &physical_dup);
			if (size != (u64)-1)
				btrfs_set_stack_stripe_offset(&chunk->stripe,
							      physical);
			memcpy(chunk->stripe.dev_uuid, super->dev_item.uuid,
			       BTRFS_UUID_SIZE);
			new_array_size += sizeof(*chunk);
		} else {
			error("bogus key in the sys array %d", key.type);
			return -EIO;
		}
		/* Advance past the original (possibly multi-stripe) item. */
		write_ptr += sizeof(*chunk);
		ptr += btrfs_chunk_item_size(old_num_stripes);
		cur += btrfs_chunk_item_size(old_num_stripes);
	}

	if (mdres->clear_space_cache)
		btrfs_set_super_cache_generation(super, 0);

	flags |= BTRFS_SUPER_FLAG_METADUMP_V2;
	btrfs_set_super_flags(super, flags);
	btrfs_set_super_sys_array_size(super, new_array_size);
	btrfs_set_super_num_devices(super, 1);
	csum_block(buffer, BTRFS_SUPER_INFO_SIZE);

	return 0;
}
1197
1198 static struct extent_buffer *alloc_dummy_eb(u64 bytenr, u32 size)
1199 {
1200         struct extent_buffer *eb;
1201
1202         eb = calloc(1, sizeof(struct extent_buffer) + size);
1203         if (!eb)
1204                 return NULL;
1205
1206         eb->start = bytenr;
1207         eb->len = size;
1208         return eb;
1209 }
1210
/*
 * Shrink the item at @slot in leaf @eb to @new_size bytes.
 *
 * Item data is packed at the end of the leaf growing downward, so shrinking
 * an item means shifting the data of this and all following items toward
 * the end of the leaf by the size difference and raising their recorded
 * offsets accordingly.  A no-op when the size is unchanged.
 *
 * NOTE(review): only shrinking is handled; size_diff would wrap if
 * new_size > old_size — callers are expected never to grow an item here.
 */
static void truncate_item(struct extent_buffer *eb, int slot, u32 new_size)
{
	struct btrfs_item *item;
	u32 nritems;
	u32 old_size;
	u32 old_data_start;
	u32 size_diff;
	u32 data_end;
	int i;

	old_size = btrfs_item_size_nr(eb, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(eb);
	/* The last item's offset is the lowest used data offset. */
	data_end = btrfs_item_offset_nr(eb, nritems - 1);

	old_data_start = btrfs_item_offset_nr(eb, slot);
	size_diff = old_size - new_size;

	/* Raise the offsets of this and all later items by size_diff. */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);
		ioff = btrfs_item_offset(eb, item);
		btrfs_set_item_offset(eb, item, ioff + size_diff);
	}

	/* Shift the packed data; ranges can overlap, hence memmove. */
	memmove_extent_buffer(eb, btrfs_leaf_data(eb) + data_end + size_diff,
			      btrfs_leaf_data(eb) + data_end,
			      old_data_start + new_size - data_end);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(eb, item, new_size);
}
1244
/*
 * Rewrite chunk tree leaves contained in @buffer so their chunk items point
 * at the restore target device.
 *
 * @buffer holds @size bytes of metadata starting at logical address
 * async->start.  It is scanned one node at a time; scanning stops early if
 * a node's bytenr or fsid does not match what we expect at that address.
 * Nodes not owned by the chunk tree, and non-leaf nodes, are skipped.
 *
 * For each chunk item in a chunk tree leaf the stripe list is truncated to
 * a single stripe (two when a DUP copy exists), the RAID profile bits are
 * cleared from the chunk type, and stripe devid/offset/uuid are pointed at
 * the target device.  Modified nodes are copied back and re-checksummed.
 */
static int fixup_chunk_tree_block(struct mdrestore_struct *mdres,
				  struct async_work *async, u8 *buffer,
				  size_t size)
{
	struct extent_buffer *eb;
	size_t size_left = size;
	u64 bytenr = async->start;
	int i;

	/* Not a whole number of nodes: cannot be tree blocks, nothing to do. */
	if (size_left % mdres->nodesize)
		return 0;

	/* Scratch buffer we copy each node into for safe item accessors. */
	eb = alloc_dummy_eb(bytenr, mdres->nodesize);
	if (!eb)
		return -ENOMEM;

	while (size_left) {
		eb->start = bytenr;
		memcpy(eb->data, buffer, mdres->nodesize);

		/* Stop if this data is not the tree block we expect here. */
		if (btrfs_header_bytenr(eb) != bytenr)
			break;
		if (memcmp(mdres->fsid,
			   eb->data + offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE))
			break;

		if (btrfs_header_owner(eb) != BTRFS_CHUNK_TREE_OBJECTID)
			goto next;

		if (btrfs_header_level(eb) != 0)
			goto next;

		for (i = 0; i < btrfs_header_nritems(eb); i++) {
			struct btrfs_chunk *chunk;
			struct btrfs_key key;
			u64 type, physical, physical_dup, size = (u64)-1;

			btrfs_item_key_to_cpu(eb, &key, i);
			if (key.type != BTRFS_CHUNK_ITEM_KEY)
				continue;

			size = 0;
			physical = logical_to_physical(mdres, key.offset,
						       &size, &physical_dup);

			/* Single copy: drop all stripes past the first. */
			if (!physical_dup)
				truncate_item(eb, i, sizeof(*chunk));
			chunk = btrfs_item_ptr(eb, i, struct btrfs_chunk);


			/* Zero out the RAID profile */
			type = btrfs_chunk_type(eb, chunk);
			type &= (BTRFS_BLOCK_GROUP_DATA |
				 BTRFS_BLOCK_GROUP_SYSTEM |
				 BTRFS_BLOCK_GROUP_METADATA |
				 BTRFS_BLOCK_GROUP_DUP);
			btrfs_set_chunk_type(eb, chunk, type);

			if (!physical_dup)
				btrfs_set_chunk_num_stripes(eb, chunk, 1);
			btrfs_set_chunk_sub_stripes(eb, chunk, 0);
			btrfs_set_stripe_devid_nr(eb, chunk, 0, mdres->devid);
			if (size != (u64)-1)
				btrfs_set_stripe_offset_nr(eb, chunk, 0,
							   physical);
			/* update stripe 2 offset */
			if (physical_dup)
				btrfs_set_stripe_offset_nr(eb, chunk, 1,
							   physical_dup);

			write_extent_buffer(eb, mdres->uuid,
					(unsigned long)btrfs_stripe_dev_uuid_nr(
						chunk, 0),
					BTRFS_UUID_SIZE);
		}
		/* Copy the fixed node back and refresh its checksum. */
		memcpy(buffer, eb->data, eb->len);
		csum_block(buffer, eb->len);
next:
		size_left -= mdres->nodesize;
		buffer += mdres->nodesize;
		bytenr += mdres->nodesize;
	}

	free(eb);
	return 0;
}
1332
1333 static void write_backup_supers(int fd, u8 *buf)
1334 {
1335         struct btrfs_super_block *super = (struct btrfs_super_block *)buf;
1336         struct stat st;
1337         u64 size;
1338         u64 bytenr;
1339         int i;
1340         int ret;
1341
1342         if (fstat(fd, &st)) {
1343                 error(
1344         "cannot stat restore point, won't be able to write backup supers: %s",
1345                         strerror(errno));
1346                 return;
1347         }
1348
1349         size = btrfs_device_size(fd, &st);
1350
1351         for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1352                 bytenr = btrfs_sb_offset(i);
1353                 if (bytenr + BTRFS_SUPER_INFO_SIZE > size)
1354                         break;
1355                 btrfs_set_super_bytenr(super, bytenr);
1356                 csum_block(buf, BTRFS_SUPER_INFO_SIZE);
1357                 ret = pwrite64(fd, buf, BTRFS_SUPER_INFO_SIZE, bytenr);
1358                 if (ret < BTRFS_SUPER_INFO_SIZE) {
1359                         if (ret < 0)
1360                                 error(
1361                                 "problem writing out backup super block %d: %s",
1362                                                 i, strerror(errno));
1363                         else
1364                                 error("short write writing out backup super block");
1365                         break;
1366                 }
1367         }
1368 }
1369
/*
 * Worker thread for restore: pulls async_work items off mdres->list,
 * decompresses them if needed, fixes up superblock / chunk tree blocks for
 * single-device restore, and writes the data to the output device.
 *
 * Errors are recorded in mdres->error (first error wins) rather than
 * aborting the thread, except for write_data_to_disk failures which exit.
 */
static void *restore_worker(void *data)
{
	struct mdrestore_struct *mdres = (struct mdrestore_struct *)data;
	struct async_work *async;
	size_t size;
	u8 *buffer;
	u8 *outbuf;
	int outfd;
	int ret;
	int compress_size = MAX_PENDING_SIZE * 4;

	outfd = fileno(mdres->out);
	buffer = malloc(compress_size);
	if (!buffer) {
		error("not enough memory for restore worker buffer");
		pthread_mutex_lock(&mdres->mutex);
		if (!mdres->error)
			mdres->error = -ENOMEM;
		pthread_mutex_unlock(&mdres->mutex);
		pthread_exit(NULL);
	}

	while (1) {
		u64 bytenr, physical_dup;
		off_t offset = 0;
		int err = 0;

		pthread_mutex_lock(&mdres->mutex);
		/* Wait until the super was parsed (nodesize set) and work queued. */
		while (!mdres->nodesize || list_empty(&mdres->list)) {
			if (mdres->done) {
				pthread_mutex_unlock(&mdres->mutex);
				goto out;
			}
			pthread_cond_wait(&mdres->cond, &mdres->mutex);
		}
		async = list_entry(mdres->list.next, struct async_work, list);
		list_del_init(&async->list);

		if (mdres->compress_method == COMPRESS_ZLIB) {
			size = compress_size;
			/* Drop the lock for the potentially slow inflate. */
			pthread_mutex_unlock(&mdres->mutex);
			ret = uncompress(buffer, (unsigned long *)&size,
					 async->buffer, async->bufsize);
			pthread_mutex_lock(&mdres->mutex);
			if (ret != Z_OK) {
				error("decompression failed with %d", ret);
				err = -EIO;
			}
			outbuf = buffer;
		} else {
			outbuf = async->buffer;
			size = async->bufsize;
		}

		if (!mdres->multi_devices) {
			if (async->start == BTRFS_SUPER_INFO_OFFSET) {
				if (mdres->old_restore) {
					update_super_old(outbuf);
				} else {
					ret = update_super(mdres, outbuf);
					if (ret)
						err = ret;
				}
			} else if (!mdres->old_restore) {
				ret = fixup_chunk_tree_block(mdres, async, outbuf, size);
				if (ret)
					err = ret;
			}
		}

		if (!mdres->fixup_offset) {
			while (size) {
				u64 chunk_size = size;

				physical_dup = 0;
				if (!mdres->multi_devices && !mdres->old_restore)
					bytenr = logical_to_physical(mdres,
						     async->start + offset,
						     &chunk_size,
						     &physical_dup);
				else
					bytenr = async->start + offset;

				ret = pwrite64(outfd, outbuf + offset,
					       chunk_size, bytenr);
				if (ret != chunk_size)
					goto error;

				if (physical_dup) {
					/* Second copy of a DUP chunk. */
					ret = pwrite64(outfd, outbuf + offset,
						       chunk_size,
						       physical_dup);
					if (ret != chunk_size)
						goto error;
				}

				size -= chunk_size;
				offset += chunk_size;
				continue;

error:
				if (ret < 0) {
					error("unable to write to device: %s",
							strerror(errno));
					err = errno;
				} else {
					error("short write");
					err = -EIO;
				}
				/*
				 * Give up on this item: without the break the
				 * loop would retry the same failing write
				 * forever since size never shrinks on error.
				 */
				break;
			}
		} else if (async->start != BTRFS_SUPER_INFO_OFFSET) {
			ret = write_data_to_disk(mdres->info, outbuf, async->start, size, 0);
			if (ret) {
				error("failed to write data");
				exit(1);
			}
		}


		/* backup super blocks are already there at fixup_offset stage */
		if (!mdres->multi_devices && async->start == BTRFS_SUPER_INFO_OFFSET)
			write_backup_supers(outfd, outbuf);

		if (err && !mdres->error)
			mdres->error = err;
		mdres->num_items--;
		pthread_mutex_unlock(&mdres->mutex);

		free(async->buffer);
		free(async);
	}
out:
	free(buffer);
	pthread_exit(NULL);
}
1503
1504 static void mdrestore_destroy(struct mdrestore_struct *mdres, int num_threads)
1505 {
1506         struct rb_node *n;
1507         int i;
1508
1509         while ((n = rb_first(&mdres->chunk_tree))) {
1510                 struct fs_chunk *entry;
1511
1512                 entry = rb_entry(n, struct fs_chunk, l);
1513                 rb_erase(n, &mdres->chunk_tree);
1514                 rb_erase(&entry->p, &mdres->physical_tree);
1515                 free(entry);
1516         }
1517         pthread_mutex_lock(&mdres->mutex);
1518         mdres->done = 1;
1519         pthread_cond_broadcast(&mdres->cond);
1520         pthread_mutex_unlock(&mdres->mutex);
1521
1522         for (i = 0; i < num_threads; i++)
1523                 pthread_join(mdres->threads[i], NULL);
1524
1525         pthread_cond_destroy(&mdres->cond);
1526         pthread_mutex_destroy(&mdres->mutex);
1527 }
1528
1529 static int mdrestore_init(struct mdrestore_struct *mdres,
1530                           FILE *in, FILE *out, int old_restore,
1531                           int num_threads, int fixup_offset,
1532                           struct btrfs_fs_info *info, int multi_devices)
1533 {
1534         int i, ret = 0;
1535
1536         memset(mdres, 0, sizeof(*mdres));
1537         pthread_cond_init(&mdres->cond, NULL);
1538         pthread_mutex_init(&mdres->mutex, NULL);
1539         INIT_LIST_HEAD(&mdres->list);
1540         INIT_LIST_HEAD(&mdres->overlapping_chunks);
1541         mdres->in = in;
1542         mdres->out = out;
1543         mdres->old_restore = old_restore;
1544         mdres->chunk_tree.rb_node = NULL;
1545         mdres->fixup_offset = fixup_offset;
1546         mdres->info = info;
1547         mdres->multi_devices = multi_devices;
1548         mdres->clear_space_cache = 0;
1549         mdres->last_physical_offset = 0;
1550         mdres->alloced_chunks = 0;
1551
1552         if (!num_threads)
1553                 return 0;
1554
1555         mdres->num_threads = num_threads;
1556         for (i = 0; i < num_threads; i++) {
1557                 ret = pthread_create(&mdres->threads[i], NULL, restore_worker,
1558                                      mdres);
1559                 if (ret) {
1560                         /* pthread_create returns errno directly */
1561                         ret = -ret;
1562                         break;
1563                 }
1564         }
1565         if (ret)
1566                 mdrestore_destroy(mdres, i + 1);
1567         return ret;
1568 }
1569
/*
 * Populate mdres with the filesystem-wide parameters (nodesize, fsid,
 * device uuid/devid) taken from the superblock item in @async, inflating
 * it first when the image is zlib-compressed.  Runs once; subsequent calls
 * return 0 immediately because nodesize is already set.
 */
static int fill_mdres_info(struct mdrestore_struct *mdres,
			   struct async_work *async)
{
	struct btrfs_super_block *super;
	u8 *decomp = NULL;
	u8 *sb_data;
	int ret;

	/* We've already been initialized */
	if (mdres->nodesize)
		return 0;

	if (mdres->compress_method != COMPRESS_ZLIB) {
		sb_data = async->buffer;
	} else {
		size_t size = MAX_PENDING_SIZE * 2;

		decomp = malloc(MAX_PENDING_SIZE * 2);
		if (!decomp)
			return -ENOMEM;
		ret = uncompress(decomp, (unsigned long *)&size,
				 async->buffer, async->bufsize);
		if (ret != Z_OK) {
			error("decompression failed with %d", ret);
			free(decomp);
			return -EIO;
		}
		sb_data = decomp;
	}

	super = (struct btrfs_super_block *)sb_data;
	mdres->nodesize = btrfs_super_nodesize(super);
	memcpy(mdres->fsid, super->fsid, BTRFS_FSID_SIZE);
	memcpy(mdres->uuid, super->dev_item.uuid, BTRFS_UUID_SIZE);
	mdres->devid = le64_to_cpu(super->dev_item.devid);
	free(decomp);
	return 0;
}
1609
/*
 * Read all items of one cluster from the image stream and queue them for
 * the restore worker threads.
 *
 * @cluster: cluster header and item table, already read by the caller
 * @mdres:   restore state; the pending-item list, compression method and
 *           the worker condition variable live here
 * @next:    on success, set to the image offset right after this cluster,
 *           including the padding up to the next BLOCK_SIZE boundary
 *
 * The stream position of mdres->in must be just past the cluster header
 * when this is called; item payloads are read sequentially from there.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int add_cluster(struct meta_cluster *cluster,
                       struct mdrestore_struct *mdres, u64 *next)
{
	struct meta_cluster_item *item;
	struct meta_cluster_header *header = &cluster->header;
	struct async_work *async;
	u64 bytenr;
	u32 i, nritems;
	int ret;

	/*
	 * NOTE(review): compress_method is presumably also read by the
	 * worker threads, hence publishing it under the mutex — confirm
	 * against the worker loop.
	 */
	pthread_mutex_lock(&mdres->mutex);
	mdres->compress_method = header->compress;
	pthread_mutex_unlock(&mdres->mutex);

	/* Item payloads start right after the BLOCK_SIZE cluster header. */
	bytenr = le64_to_cpu(header->bytenr) + BLOCK_SIZE;
	nritems = le32_to_cpu(header->nritems);
	for (i = 0; i < nritems; i++) {
		item = &cluster->items[i];
		async = calloc(1, sizeof(*async));
		if (!async) {
			error("not enough memory for async data");
			return -ENOMEM;
		}
		async->start = le64_to_cpu(item->bytenr);
		async->bufsize = le32_to_cpu(item->size);
		async->buffer = malloc(async->bufsize);
		if (!async->buffer) {
			error("not enough memory for async buffer");
			free(async);
			return -ENOMEM;
		}
		/*
		 * NOTE(review): fread() does not set errno on a short read at
		 * EOF, so strerror(errno) may report a stale error here.
		 */
		ret = fread(async->buffer, async->bufsize, 1, mdres->in);
		if (ret != 1) {
			error("unable to read buffer: %s", strerror(errno));
			free(async->buffer);
			free(async);
			return -EIO;
		}
		bytenr += async->bufsize;

		pthread_mutex_lock(&mdres->mutex);
		/*
		 * The superblock item carries fsid/devid/nodesize needed for
		 * the rest of the restore; capture it before handing the
		 * buffer to a worker.
		 */
		if (async->start == BTRFS_SUPER_INFO_OFFSET) {
			ret = fill_mdres_info(mdres, async);
			if (ret) {
				error("unable to set up restore state");
				pthread_mutex_unlock(&mdres->mutex);
				free(async->buffer);
				free(async);
				return ret;
			}
		}
		/* Hand the item over and wake one worker. */
		list_add_tail(&async->list, &mdres->list);
		mdres->num_items++;
		pthread_cond_signal(&mdres->cond);
		pthread_mutex_unlock(&mdres->mutex);
	}
	/* Clusters are padded to BLOCK_SIZE; consume the padding bytes. */
	if (bytenr & BLOCK_MASK) {
		char buffer[BLOCK_MASK];
		size_t size = BLOCK_SIZE - (bytenr & BLOCK_MASK);

		bytenr += size;
		ret = fread(buffer, size, 1, mdres->in);
		if (ret != 1) {
			error("failed to read buffer: %s", strerror(errno));
			return -EIO;
		}
	}
	*next = bytenr;
	return 0;
}
1680
1681 static int wait_for_worker(struct mdrestore_struct *mdres)
1682 {
1683         int ret = 0;
1684
1685         pthread_mutex_lock(&mdres->mutex);
1686         ret = mdres->error;
1687         while (!ret && mdres->num_items > 0) {
1688                 struct timespec ts = {
1689                         .tv_sec = 0,
1690                         .tv_nsec = 10000000,
1691                 };
1692                 pthread_mutex_unlock(&mdres->mutex);
1693                 nanosleep(&ts, NULL);
1694                 pthread_mutex_lock(&mdres->mutex);
1695                 ret = mdres->error;
1696         }
1697         pthread_mutex_unlock(&mdres->mutex);
1698         return ret;
1699 }
1700
1701 static int read_chunk_block(struct mdrestore_struct *mdres, u8 *buffer,
1702                             u64 bytenr, u64 item_bytenr, u32 bufsize,
1703                             u64 cluster_bytenr)
1704 {
1705         struct extent_buffer *eb;
1706         int ret = 0;
1707         int i;
1708
1709         eb = alloc_dummy_eb(bytenr, mdres->nodesize);
1710         if (!eb) {
1711                 ret = -ENOMEM;
1712                 goto out;
1713         }
1714
1715         while (item_bytenr != bytenr) {
1716                 buffer += mdres->nodesize;
1717                 item_bytenr += mdres->nodesize;
1718         }
1719
1720         memcpy(eb->data, buffer, mdres->nodesize);
1721         if (btrfs_header_bytenr(eb) != bytenr) {
1722                 error("eb bytenr does not match found bytenr: %llu != %llu",
1723                                 (unsigned long long)btrfs_header_bytenr(eb),
1724                                 (unsigned long long)bytenr);
1725                 ret = -EIO;
1726                 goto out;
1727         }
1728
1729         if (memcmp(mdres->fsid, eb->data + offsetof(struct btrfs_header, fsid),
1730                    BTRFS_FSID_SIZE)) {
1731                 error("filesystem UUID of eb %llu does not match",
1732                                 (unsigned long long)bytenr);
1733                 ret = -EIO;
1734                 goto out;
1735         }
1736
1737         if (btrfs_header_owner(eb) != BTRFS_CHUNK_TREE_OBJECTID) {
1738                 error("wrong eb %llu owner %llu",
1739                                 (unsigned long long)bytenr,
1740                                 (unsigned long long)btrfs_header_owner(eb));
1741                 ret = -EIO;
1742                 goto out;
1743         }
1744
1745         for (i = 0; i < btrfs_header_nritems(eb); i++) {
1746                 struct btrfs_chunk *chunk;
1747                 struct fs_chunk *fs_chunk;
1748                 struct btrfs_key key;
1749                 u64 type;
1750
1751                 if (btrfs_header_level(eb)) {
1752                         u64 blockptr = btrfs_node_blockptr(eb, i);
1753
1754                         ret = search_for_chunk_blocks(mdres, blockptr,
1755                                                       cluster_bytenr);
1756                         if (ret)
1757                                 break;
1758                         continue;
1759                 }
1760
1761                 /* Yay a leaf!  We loves leafs! */
1762                 btrfs_item_key_to_cpu(eb, &key, i);
1763                 if (key.type != BTRFS_CHUNK_ITEM_KEY)
1764                         continue;
1765
1766                 fs_chunk = malloc(sizeof(struct fs_chunk));
1767                 if (!fs_chunk) {
1768                         error("not enough memory to allocate chunk");
1769                         ret = -ENOMEM;
1770                         break;
1771                 }
1772                 memset(fs_chunk, 0, sizeof(*fs_chunk));
1773                 chunk = btrfs_item_ptr(eb, i, struct btrfs_chunk);
1774
1775                 fs_chunk->logical = key.offset;
1776                 fs_chunk->physical = btrfs_stripe_offset_nr(eb, chunk, 0);
1777                 fs_chunk->bytes = btrfs_chunk_length(eb, chunk);
1778                 INIT_LIST_HEAD(&fs_chunk->list);
1779                 if (tree_search(&mdres->physical_tree, &fs_chunk->p,
1780                                 physical_cmp, 1) != NULL)
1781                         list_add(&fs_chunk->list, &mdres->overlapping_chunks);
1782                 else
1783                         tree_insert(&mdres->physical_tree, &fs_chunk->p,
1784                                     physical_cmp);
1785
1786                 type = btrfs_chunk_type(eb, chunk);
1787                 if (type & BTRFS_BLOCK_GROUP_DUP) {
1788                         fs_chunk->physical_dup =
1789                                         btrfs_stripe_offset_nr(eb, chunk, 1);
1790                 }
1791
1792                 if (fs_chunk->physical_dup + fs_chunk->bytes >
1793                     mdres->last_physical_offset)
1794                         mdres->last_physical_offset = fs_chunk->physical_dup +
1795                                 fs_chunk->bytes;
1796                 else if (fs_chunk->physical + fs_chunk->bytes >
1797                     mdres->last_physical_offset)
1798                         mdres->last_physical_offset = fs_chunk->physical +
1799                                 fs_chunk->bytes;
1800                 mdres->alloced_chunks += fs_chunk->bytes;
1801                 /* in dup case, fs_chunk->bytes should add twice */
1802                 if (fs_chunk->physical_dup)
1803                         mdres->alloced_chunks += fs_chunk->bytes;
1804                 tree_insert(&mdres->chunk_tree, &fs_chunk->l, chunk_cmp);
1805         }
1806 out:
1807         free(eb);
1808         return ret;
1809 }
1810
/*
 * Scan the image for the cluster item that contains tree block @search and
 * process that block via read_chunk_block().
 *
 * The scan starts at @cluster_bytenr.  If the end of the image is reached
 * without finding the block, the scan restarts once from offset 0 (clusters
 * are not sorted by the bytenr of the blocks they contain).
 *
 * This is indirectly recursive: read_chunk_block() calls back into this
 * function for every child pointer of an interior chunk tree node.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int search_for_chunk_blocks(struct mdrestore_struct *mdres,
                                   u64 search, u64 cluster_bytenr)
{
	struct meta_cluster *cluster;
	struct meta_cluster_header *header;
	struct meta_cluster_item *item;
	u64 current_cluster = cluster_bytenr, bytenr;
	u64 item_bytenr;
	u32 bufsize, nritems, i;
	/* Upper bound for a single (decompressed) item payload. */
	u32 max_size = MAX_PENDING_SIZE * 2;
	u8 *buffer, *tmp = NULL;
	int ret = 0;

	cluster = malloc(BLOCK_SIZE);
	if (!cluster) {
		error("not enough memory for cluster");
		return -ENOMEM;
	}

	buffer = malloc(max_size);
	if (!buffer) {
		error("not enough memory for buffer");
		free(cluster);
		return -ENOMEM;
	}

	/* Compressed items need a second scratch buffer to read into. */
	if (mdres->compress_method == COMPRESS_ZLIB) {
		tmp = malloc(max_size);
		if (!tmp) {
			error("not enough memory for buffer");
			free(cluster);
			free(buffer);
			return -ENOMEM;
		}
	}

	bytenr = current_cluster;
	while (1) {
		if (fseek(mdres->in, current_cluster, SEEK_SET)) {
			error("seek failed: %s", strerror(errno));
			ret = -EIO;
			break;
		}

		ret = fread(cluster, BLOCK_SIZE, 1, mdres->in);
		if (ret == 0) {
			/*
			 * Ran off the end of the image: retry once from the
			 * start if we began mid-image, otherwise give up.
			 */
			if (cluster_bytenr != 0) {
				cluster_bytenr = 0;
				current_cluster = 0;
				bytenr = 0;
				continue;
			}
			error(
	"unknown state after reading cluster at %llu, probably corrupted data",
					cluster_bytenr);
			ret = -EIO;
			break;
		} else if (ret < 0) {
			/*
			 * NOTE(review): fread() returns size_t and never a
			 * negative value, so this branch looks unreachable;
			 * kept as defensive code.
			 */
			error("unable to read image at %llu: %s",
					(unsigned long long)cluster_bytenr,
					strerror(errno));
			break;
		}
		ret = 0;

		header = &cluster->header;
		if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
		    le64_to_cpu(header->bytenr) != current_cluster) {
			error("bad header in metadump image");
			ret = -EIO;
			break;
		}

		/* Walk every item of this cluster, reading payloads in order. */
		bytenr += BLOCK_SIZE;
		nritems = le32_to_cpu(header->nritems);
		for (i = 0; i < nritems; i++) {
			size_t size;

			item = &cluster->items[i];
			bufsize = le32_to_cpu(item->size);
			item_bytenr = le64_to_cpu(item->bytenr);

			if (bufsize > max_size) {
				error("item %u too big: %u > %u", i, bufsize,
						max_size);
				ret = -EIO;
				break;
			}

			if (mdres->compress_method == COMPRESS_ZLIB) {
				ret = fread(tmp, bufsize, 1, mdres->in);
				if (ret != 1) {
					error("read error: %s", strerror(errno));
					ret = -EIO;
					break;
				}

				/* uncompress() updates size to the real length. */
				size = max_size;
				ret = uncompress(buffer,
						 (unsigned long *)&size, tmp,
						 bufsize);
				if (ret != Z_OK) {
					error("decompression failed with %d",
							ret);
					ret = -EIO;
					break;
				}
			} else {
				ret = fread(buffer, bufsize, 1, mdres->in);
				if (ret != 1) {
					error("read error: %s",
							strerror(errno));
					ret = -EIO;
					break;
				}
				size = bufsize;
			}
			ret = 0;

			/* Found the item covering @search: process and stop. */
			if (item_bytenr <= search &&
			    item_bytenr + size > search) {
				ret = read_chunk_block(mdres, buffer, search,
						       item_bytenr, size,
						       current_cluster);
				/* ret == 1 flags success to the outer loop. */
				if (!ret)
					ret = 1;
				break;
			}
			bytenr += bufsize;
		}
		if (ret) {
			if (ret > 0)
				ret = 0;
			break;
		}
		/* Advance past the padding to the next cluster boundary. */
		if (bytenr & BLOCK_MASK)
			bytenr += BLOCK_SIZE - (bytenr & BLOCK_MASK);
		current_cluster = bytenr;
	}

	free(tmp);
	free(buffer);
	free(cluster);
	return ret;
}
1957
1958 static int build_chunk_tree(struct mdrestore_struct *mdres,
1959                             struct meta_cluster *cluster)
1960 {
1961         struct btrfs_super_block *super;
1962         struct meta_cluster_header *header;
1963         struct meta_cluster_item *item = NULL;
1964         u64 chunk_root_bytenr = 0;
1965         u32 i, nritems;
1966         u64 bytenr = 0;
1967         u8 *buffer;
1968         int ret;
1969
1970         /* We can't seek with stdin so don't bother doing this */
1971         if (mdres->in == stdin)
1972                 return 0;
1973
1974         ret = fread(cluster, BLOCK_SIZE, 1, mdres->in);
1975         if (ret <= 0) {
1976                 error("unable to read cluster: %s", strerror(errno));
1977                 return -EIO;
1978         }
1979         ret = 0;
1980
1981         header = &cluster->header;
1982         if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
1983             le64_to_cpu(header->bytenr) != 0) {
1984                 error("bad header in metadump image");
1985                 return -EIO;
1986         }
1987
1988         bytenr += BLOCK_SIZE;
1989         mdres->compress_method = header->compress;
1990         nritems = le32_to_cpu(header->nritems);
1991         for (i = 0; i < nritems; i++) {
1992                 item = &cluster->items[i];
1993
1994                 if (le64_to_cpu(item->bytenr) == BTRFS_SUPER_INFO_OFFSET)
1995                         break;
1996                 bytenr += le32_to_cpu(item->size);
1997                 if (fseek(mdres->in, le32_to_cpu(item->size), SEEK_CUR)) {
1998                         error("seek failed: %s", strerror(errno));
1999                         return -EIO;
2000                 }
2001         }
2002
2003         if (!item || le64_to_cpu(item->bytenr) != BTRFS_SUPER_INFO_OFFSET) {
2004                 error("did not find superblock at %llu",
2005                                 le64_to_cpu(item->bytenr));
2006                 return -EINVAL;
2007         }
2008
2009         buffer = malloc(le32_to_cpu(item->size));
2010         if (!buffer) {
2011                 error("not enough memory to allocate buffer");
2012                 return -ENOMEM;
2013         }
2014
2015         ret = fread(buffer, le32_to_cpu(item->size), 1, mdres->in);
2016         if (ret != 1) {
2017                 error("unable to read buffer: %s", strerror(errno));
2018                 free(buffer);
2019                 return -EIO;
2020         }
2021
2022         if (mdres->compress_method == COMPRESS_ZLIB) {
2023                 size_t size = MAX_PENDING_SIZE * 2;
2024                 u8 *tmp;
2025
2026                 tmp = malloc(MAX_PENDING_SIZE * 2);
2027                 if (!tmp) {
2028                         free(buffer);
2029                         return -ENOMEM;
2030                 }
2031                 ret = uncompress(tmp, (unsigned long *)&size,
2032                                  buffer, le32_to_cpu(item->size));
2033                 if (ret != Z_OK) {
2034                         error("decompression failed with %d", ret);
2035                         free(buffer);
2036                         free(tmp);
2037                         return -EIO;
2038                 }
2039                 free(buffer);
2040                 buffer = tmp;
2041         }
2042
2043         pthread_mutex_lock(&mdres->mutex);
2044         super = (struct btrfs_super_block *)buffer;
2045         chunk_root_bytenr = btrfs_super_chunk_root(super);
2046         mdres->nodesize = btrfs_super_nodesize(super);
2047         memcpy(mdres->fsid, super->fsid, BTRFS_FSID_SIZE);
2048         memcpy(mdres->uuid, super->dev_item.uuid,
2049                        BTRFS_UUID_SIZE);
2050         mdres->devid = le64_to_cpu(super->dev_item.devid);
2051         free(buffer);
2052         pthread_mutex_unlock(&mdres->mutex);
2053
2054         return search_for_chunk_blocks(mdres, chunk_root_bytenr, 0);
2055 }
2056
2057 static int range_contains_super(u64 physical, u64 bytes)
2058 {
2059         u64 super_bytenr;
2060         int i;
2061
2062         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2063                 super_bytenr = btrfs_sb_offset(i);
2064                 if (super_bytenr >= physical &&
2065                     super_bytenr < physical + bytes)
2066                         return 1;
2067         }
2068
2069         return 0;
2070 }
2071
2072 static void remap_overlapping_chunks(struct mdrestore_struct *mdres)
2073 {
2074         struct fs_chunk *fs_chunk;
2075
2076         while (!list_empty(&mdres->overlapping_chunks)) {
2077                 fs_chunk = list_first_entry(&mdres->overlapping_chunks,
2078                                             struct fs_chunk, list);
2079                 list_del_init(&fs_chunk->list);
2080                 if (range_contains_super(fs_chunk->physical,
2081                                          fs_chunk->bytes)) {
2082                         warning(
2083 "remapping a chunk that had a super mirror inside of it, clearing space cache so we don't end up with corruption");
2084                         mdres->clear_space_cache = 1;
2085                 }
2086                 fs_chunk->physical = mdres->last_physical_offset;
2087                 tree_insert(&mdres->physical_tree, &fs_chunk->p, physical_cmp);
2088                 mdres->last_physical_offset += fs_chunk->bytes;
2089         }
2090 }
2091
2092 static int fixup_devices(struct btrfs_fs_info *fs_info,
2093                          struct mdrestore_struct *mdres, off_t dev_size)
2094 {
2095         struct btrfs_trans_handle *trans;
2096         struct btrfs_dev_item *dev_item;
2097         struct btrfs_path path;
2098         struct extent_buffer *leaf;
2099         struct btrfs_root *root = fs_info->chunk_root;
2100         struct btrfs_key key;
2101         u64 devid, cur_devid;
2102         int ret;
2103
2104         trans = btrfs_start_transaction(fs_info->tree_root, 1);
2105         if (IS_ERR(trans)) {
2106                 error("cannot starting transaction %ld", PTR_ERR(trans));
2107                 return PTR_ERR(trans);
2108         }
2109
2110         dev_item = &fs_info->super_copy->dev_item;
2111
2112         devid = btrfs_stack_device_id(dev_item);
2113
2114         btrfs_set_stack_device_total_bytes(dev_item, dev_size);
2115         btrfs_set_stack_device_bytes_used(dev_item, mdres->alloced_chunks);
2116
2117         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2118         key.type = BTRFS_DEV_ITEM_KEY;
2119         key.offset = 0;
2120
2121         btrfs_init_path(&path);
2122
2123 again:
2124         ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
2125         if (ret < 0) {
2126                 error("search failed: %d", ret);
2127                 exit(1);
2128         }
2129
2130         while (1) {
2131                 leaf = path.nodes[0];
2132                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
2133                         ret = btrfs_next_leaf(root, &path);
2134                         if (ret < 0) {
2135                                 error("cannot go to next leaf %d", ret);
2136                                 exit(1);
2137                         }
2138                         if (ret > 0) {
2139                                 ret = 0;
2140                                 break;
2141                         }
2142                         leaf = path.nodes[0];
2143                 }
2144
2145                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
2146                 if (key.type > BTRFS_DEV_ITEM_KEY)
2147                         break;
2148                 if (key.type != BTRFS_DEV_ITEM_KEY) {
2149                         path.slots[0]++;
2150                         continue;
2151                 }
2152
2153                 dev_item = btrfs_item_ptr(leaf, path.slots[0],
2154                                           struct btrfs_dev_item);
2155                 cur_devid = btrfs_device_id(leaf, dev_item);
2156                 if (devid != cur_devid) {
2157                         ret = btrfs_del_item(trans, root, &path);
2158                         if (ret) {
2159                                 error("cannot delete item: %d", ret);
2160                                 exit(1);
2161                         }
2162                         btrfs_release_path(&path);
2163                         goto again;
2164                 }
2165
2166                 btrfs_set_device_total_bytes(leaf, dev_item, dev_size);
2167                 btrfs_set_device_bytes_used(leaf, dev_item,
2168                                             mdres->alloced_chunks);
2169                 btrfs_mark_buffer_dirty(leaf);
2170                 path.slots[0]++;
2171         }
2172
2173         btrfs_release_path(&path);
2174         ret = btrfs_commit_transaction(trans, fs_info->tree_root);
2175         if (ret) {
2176                 error("unable to commit transaction: %d", ret);
2177                 return ret;
2178         }
2179         return 0;
2180 }
2181
2182 static int restore_metadump(const char *input, FILE *out, int old_restore,
2183                             int num_threads, int fixup_offset,
2184                             const char *target, int multi_devices)
2185 {
2186         struct meta_cluster *cluster = NULL;
2187         struct meta_cluster_header *header;
2188         struct mdrestore_struct mdrestore;
2189         struct btrfs_fs_info *info = NULL;
2190         u64 bytenr = 0;
2191         FILE *in = NULL;
2192         int ret = 0;
2193
2194         if (!strcmp(input, "-")) {
2195                 in = stdin;
2196         } else {
2197                 in = fopen(input, "r");
2198                 if (!in) {
2199                         error("unable to open metadump image: %s",
2200                                         strerror(errno));
2201                         return 1;
2202                 }
2203         }
2204
2205         /* NOTE: open with write mode */
2206         if (fixup_offset) {
2207                 info = open_ctree_fs_info(target, 0, 0, 0,
2208                                           OPEN_CTREE_WRITES |
2209                                           OPEN_CTREE_RESTORE |
2210                                           OPEN_CTREE_PARTIAL);
2211                 if (!info) {
2212                         error("open ctree failed");
2213                         ret = -EIO;
2214                         goto failed_open;
2215                 }
2216         }
2217
2218         cluster = malloc(BLOCK_SIZE);
2219         if (!cluster) {
2220                 error("not enough memory for cluster");
2221                 ret = -ENOMEM;
2222                 goto failed_info;
2223         }
2224
2225         ret = mdrestore_init(&mdrestore, in, out, old_restore, num_threads,
2226                              fixup_offset, info, multi_devices);
2227         if (ret) {
2228                 error("failed to initialize metadata restore state: %d", ret);
2229                 goto failed_cluster;
2230         }
2231
2232         if (!multi_devices && !old_restore) {
2233                 ret = build_chunk_tree(&mdrestore, cluster);
2234                 if (ret)
2235                         goto out;
2236                 if (!list_empty(&mdrestore.overlapping_chunks))
2237                         remap_overlapping_chunks(&mdrestore);
2238         }
2239
2240         if (in != stdin && fseek(in, 0, SEEK_SET)) {
2241                 error("seek failed: %s", strerror(errno));
2242                 goto out;
2243         }
2244
2245         while (!mdrestore.error) {
2246                 ret = fread(cluster, BLOCK_SIZE, 1, in);
2247                 if (!ret)
2248                         break;
2249
2250                 header = &cluster->header;
2251                 if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
2252                     le64_to_cpu(header->bytenr) != bytenr) {
2253                         error("bad header in metadump image");
2254                         ret = -EIO;
2255                         break;
2256                 }
2257                 ret = add_cluster(cluster, &mdrestore, &bytenr);
2258                 if (ret) {
2259                         error("failed to add cluster: %d", ret);
2260                         break;
2261                 }
2262         }
2263         ret = wait_for_worker(&mdrestore);
2264
2265         if (!ret && !multi_devices && !old_restore) {
2266                 struct btrfs_root *root;
2267                 struct stat st;
2268
2269                 root = open_ctree_fd(fileno(out), target, 0,
2270                                           OPEN_CTREE_PARTIAL |
2271                                           OPEN_CTREE_WRITES |
2272                                           OPEN_CTREE_NO_DEVICES);
2273                 if (!root) {
2274                         error("open ctree failed in %s", target);
2275                         ret = -EIO;
2276                         goto out;
2277                 }
2278                 info = root->fs_info;
2279
2280                 if (stat(target, &st)) {
2281                         error("stat %s failed: %s", target, strerror(errno));
2282                         close_ctree(info->chunk_root);
2283                         free(cluster);
2284                         return 1;
2285                 }
2286
2287                 ret = fixup_devices(info, &mdrestore, st.st_size);
2288                 close_ctree(info->chunk_root);
2289                 if (ret)
2290                         goto out;
2291         }
2292 out:
2293         mdrestore_destroy(&mdrestore, num_threads);
2294 failed_cluster:
2295         free(cluster);
2296 failed_info:
2297         if (fixup_offset && info)
2298                 close_ctree(info->chunk_root);
2299 failed_open:
2300         if (in != stdin)
2301                 fclose(in);
2302         return ret;
2303 }
2304
2305 static int update_disk_super_on_device(struct btrfs_fs_info *info,
2306                                        const char *other_dev, u64 cur_devid)
2307 {
2308         struct btrfs_key key;
2309         struct extent_buffer *leaf;
2310         struct btrfs_path path;
2311         struct btrfs_dev_item *dev_item;
2312         struct btrfs_super_block *disk_super;
2313         char dev_uuid[BTRFS_UUID_SIZE];
2314         char fs_uuid[BTRFS_UUID_SIZE];
2315         u64 devid, type, io_align, io_width;
2316         u64 sector_size, total_bytes, bytes_used;
2317         char buf[BTRFS_SUPER_INFO_SIZE];
2318         int fp = -1;
2319         int ret;
2320
2321         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2322         key.type = BTRFS_DEV_ITEM_KEY;
2323         key.offset = cur_devid;
2324
2325         btrfs_init_path(&path);
2326         ret = btrfs_search_slot(NULL, info->chunk_root, &key, &path, 0, 0); 
2327         if (ret) {
2328                 error("search key failed: %d", ret);
2329                 ret = -EIO;
2330                 goto out;
2331         }
2332
2333         leaf = path.nodes[0];
2334         dev_item = btrfs_item_ptr(leaf, path.slots[0],
2335                                   struct btrfs_dev_item);
2336
2337         devid = btrfs_device_id(leaf, dev_item);
2338         if (devid != cur_devid) {
2339                 error("devid mismatch: %llu != %llu",
2340                                 (unsigned long long)devid,
2341                                 (unsigned long long)cur_devid);
2342                 ret = -EIO;
2343                 goto out;
2344         }
2345
2346         type = btrfs_device_type(leaf, dev_item);
2347         io_align = btrfs_device_io_align(leaf, dev_item);
2348         io_width = btrfs_device_io_width(leaf, dev_item);
2349         sector_size = btrfs_device_sector_size(leaf, dev_item);
2350         total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2351         bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2352         read_extent_buffer(leaf, dev_uuid, (unsigned long)btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE);
2353         read_extent_buffer(leaf, fs_uuid, (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE);
2354
2355         btrfs_release_path(&path);
2356
2357         printf("update disk super on %s devid=%llu\n", other_dev, devid);
2358
2359         /* update other devices' super block */
2360         fp = open(other_dev, O_CREAT | O_RDWR, 0600);
2361         if (fp < 0) {
2362                 error("could not open %s: %s", other_dev, strerror(errno));
2363                 ret = -EIO;
2364                 goto out;
2365         }
2366
2367         memcpy(buf, info->super_copy, BTRFS_SUPER_INFO_SIZE);
2368
2369         disk_super = (struct btrfs_super_block *)buf;
2370         dev_item = &disk_super->dev_item;
2371
2372         btrfs_set_stack_device_type(dev_item, type);
2373         btrfs_set_stack_device_id(dev_item, devid);
2374         btrfs_set_stack_device_total_bytes(dev_item, total_bytes);
2375         btrfs_set_stack_device_bytes_used(dev_item, bytes_used);
2376         btrfs_set_stack_device_io_align(dev_item, io_align);
2377         btrfs_set_stack_device_io_width(dev_item, io_width);
2378         btrfs_set_stack_device_sector_size(dev_item, sector_size);
2379         memcpy(dev_item->uuid, dev_uuid, BTRFS_UUID_SIZE);
2380         memcpy(dev_item->fsid, fs_uuid, BTRFS_UUID_SIZE);
2381         csum_block((u8 *)buf, BTRFS_SUPER_INFO_SIZE);
2382
2383         ret = pwrite64(fp, buf, BTRFS_SUPER_INFO_SIZE, BTRFS_SUPER_INFO_OFFSET);
2384         if (ret != BTRFS_SUPER_INFO_SIZE) {
2385                 if (ret < 0)
2386                         error("cannot write superblock: %s", strerror(ret));
2387                 else
2388                         error("cannot write superblock");
2389                 ret = -EIO;
2390                 goto out;
2391         }
2392
2393         write_backup_supers(fp, (u8 *)buf);
2394
2395 out:
2396         if (fp != -1)
2397                 close(fp);
2398         return ret;
2399 }
2400
/*
 * Print the btrfs-image usage text and terminate the process.
 *
 * @ret: process exit code (0 when help was explicitly requested via
 *       --help, non-zero on a usage error)
 */
static void print_usage(int ret)
{
	printf("usage: btrfs-image [options] source target\n");
	printf("\t-r      \trestore metadump image\n");
	printf("\t-c value\tcompression level (0 ~ 9)\n");
	printf("\t-t value\tnumber of threads (1 ~ 32)\n");
	printf("\t-o      \tdon't mess with the chunk tree when restoring\n");
	printf("\t-s      \tsanitize file names, use once to just use garbage, use twice if you want crc collisions\n");
	printf("\t-w      \twalk all trees instead of using extent tree, do this if your extent tree is broken\n");
	/* one space fewer than before so the -m description lines up with the rest */
	printf("\t-m      \trestore for multiple devices\n");
	printf("\n");
	printf("\tIn the dump mode, source is the btrfs device and target is the output file (use '-' for stdout).\n");
	printf("\tIn the restore mode, source is the dumped image and target is the btrfs device/file.\n");
	exit(ret);
}
2416
2417 int main(int argc, char *argv[])
2418 {
2419         char *source;
2420         char *target;
2421         u64 num_threads = 0;
2422         u64 compress_level = 0;
2423         int create = 1;
2424         int old_restore = 0;
2425         int walk_trees = 0;
2426         int multi_devices = 0;
2427         int ret;
2428         enum sanitize_mode sanitize = SANITIZE_NONE;
2429         int dev_cnt = 0;
2430         int usage_error = 0;
2431         FILE *out;
2432
2433         while (1) {
2434                 static const struct option long_options[] = {
2435                         { "help", no_argument, NULL, GETOPT_VAL_HELP},
2436                         { NULL, 0, NULL, 0 }
2437                 };
2438                 int c = getopt_long(argc, argv, "rc:t:oswm", long_options, NULL);
2439                 if (c < 0)
2440                         break;
2441                 switch (c) {
2442                 case 'r':
2443                         create = 0;
2444                         break;
2445                 case 't':
2446                         num_threads = arg_strtou64(optarg);
2447                         if (num_threads > MAX_WORKER_THREADS) {
2448                                 error("number of threads out of range: %llu > %d",
2449                                         (unsigned long long)num_threads,
2450                                         MAX_WORKER_THREADS);
2451                                 return 1;
2452                         }
2453                         break;
2454                 case 'c':
2455                         compress_level = arg_strtou64(optarg);
2456                         if (compress_level > 9) {
2457                                 error("compression level out of range: %llu",
2458                                         (unsigned long long)compress_level);
2459                                 return 1;
2460                         }
2461                         break;
2462                 case 'o':
2463                         old_restore = 1;
2464                         break;
2465                 case 's':
2466                         if (sanitize == SANITIZE_NONE)
2467                                 sanitize = SANITIZE_NAMES;
2468                         else if (sanitize == SANITIZE_NAMES)
2469                                 sanitize = SANITIZE_COLLISIONS;
2470                         break;
2471                 case 'w':
2472                         walk_trees = 1;
2473                         break;
2474                 case 'm':
2475                         create = 0;
2476                         multi_devices = 1;
2477                         break;
2478                         case GETOPT_VAL_HELP:
2479                 default:
2480                         print_usage(c != GETOPT_VAL_HELP);
2481                 }
2482         }
2483
2484         set_argv0(argv);
2485         if (check_argc_min(argc - optind, 2))
2486                 print_usage(1);
2487
2488         dev_cnt = argc - optind - 1;
2489
2490         if (create) {
2491                 if (old_restore) {
2492                         error(
2493                         "create and restore cannot be used at the same time");
2494                         usage_error++;
2495                 }
2496         } else {
2497                 if (walk_trees || sanitize != SANITIZE_NONE || compress_level) {
2498                         error(
2499                         "useing -w, -s, -c options for restore makes no sense");
2500                         usage_error++;
2501                 }
2502                 if (multi_devices && dev_cnt < 2) {
2503                         error("not enough devices specified for -m option");
2504                         usage_error++;
2505                 }
2506                 if (!multi_devices && dev_cnt != 1) {
2507                         error("accepts only 1 device without -m option");
2508                         usage_error++;
2509                 }
2510         }
2511
2512         if (usage_error)
2513                 print_usage(1);
2514
2515         source = argv[optind];
2516         target = argv[optind + 1];
2517
2518         if (create && !strcmp(target, "-")) {
2519                 out = stdout;
2520         } else {
2521                 out = fopen(target, "w+");
2522                 if (!out) {
2523                         error("unable to create target file %s", target);
2524                         exit(1);
2525                 }
2526         }
2527
2528         if (compress_level > 0 || create == 0) {
2529                 if (num_threads == 0) {
2530                         long tmp = sysconf(_SC_NPROCESSORS_ONLN);
2531
2532                         if (tmp <= 0)
2533                                 tmp = 1;
2534                         num_threads = tmp;
2535                 }
2536         } else {
2537                 num_threads = 0;
2538         }
2539
2540         if (create) {
2541                 ret = check_mounted(source);
2542                 if (ret < 0) {
2543                         warning("unable to check mount status of: %s",
2544                                         strerror(-ret));
2545                 } else if (ret) {
2546                         warning("%s already mounted, results may be inaccurate",
2547                                         source);
2548                 }
2549
2550                 ret = create_metadump(source, out, num_threads,
2551                                       compress_level, sanitize, walk_trees);
2552         } else {
2553                 ret = restore_metadump(source, out, old_restore, num_threads,
2554                                        0, target, multi_devices);
2555         }
2556         if (ret) {
2557                 error("%s failed: %s", (create) ? "create" : "restore",
2558                        strerror(errno));
2559                 goto out;
2560         }
2561
2562          /* extended support for multiple devices */
2563         if (!create && multi_devices) {
2564                 struct btrfs_fs_info *info;
2565                 u64 total_devs;
2566                 int i;
2567
2568                 info = open_ctree_fs_info(target, 0, 0, 0,
2569                                           OPEN_CTREE_PARTIAL |
2570                                           OPEN_CTREE_RESTORE);
2571                 if (!info) {
2572                         error("open ctree failed at %s", target);
2573                         return 1;
2574                 }
2575
2576                 total_devs = btrfs_super_num_devices(info->super_copy);
2577                 if (total_devs != dev_cnt) {
2578                         error("it needs %llu devices but has only %d",
2579                                 total_devs, dev_cnt);
2580                         close_ctree(info->chunk_root);
2581                         goto out;
2582                 }
2583
2584                 /* update super block on other disks */
2585                 for (i = 2; i <= dev_cnt; i++) {
2586                         ret = update_disk_super_on_device(info,
2587                                         argv[optind + i], (u64)i);
2588                         if (ret) {
2589                                 error("update disk superblock failed devid %d: %d",
2590                                         i, ret);
2591                                 close_ctree(info->chunk_root);
2592                                 exit(1);
2593                         }
2594                 }
2595
2596                 close_ctree(info->chunk_root);
2597
2598                 /* fix metadata block to map correct chunk */
2599                 ret = restore_metadump(source, out, 0, num_threads, 1,
2600                                        target, 1);
2601                 if (ret) {
2602                         error("unable to fixup metadump: %d", ret);
2603                         exit(1);
2604                 }
2605         }
2606 out:
2607         if (out == stdout) {
2608                 fflush(out);
2609         } else {
2610                 fclose(out);
2611                 if (ret && create) {
2612                         int unlink_ret;
2613
2614                         unlink_ret = unlink(target);
2615                         if (unlink_ret)
2616                                 error("unlink output file %s failed: %s",
2617                                                 target, strerror(errno));
2618                 }
2619         }
2620
2621         btrfs_close_all_devices();
2622
2623         return !!ret;
2624 }