btrfs-progs: image: use CRC32C reversing instead of brute force to find collisions
[platform/upstream/btrfs-progs.git] / image / main.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <pthread.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <sys/types.h>
23 #include <sys/stat.h>
24 #include <fcntl.h>
25 #include <unistd.h>
26 #include <dirent.h>
27 #include <zlib.h>
28 #include <getopt.h>
29
30 #include "kerncompat.h"
31 #include "crc32c.h"
32 #include "ctree.h"
33 #include "disk-io.h"
34 #include "transaction.h"
35 #include "utils.h"
36 #include "volumes.h"
37 #include "extent_io.h"
38 #include "help.h"
39
40 #define HEADER_MAGIC            0xbd5c25e27295668bULL
41 #define MAX_PENDING_SIZE        (256 * 1024)
42 #define BLOCK_SIZE              1024
43 #define BLOCK_MASK              (BLOCK_SIZE - 1)
44
45 #define COMPRESS_NONE           0
46 #define COMPRESS_ZLIB           1
47
48 #define MAX_WORKER_THREADS      (32)
49
50 struct meta_cluster_item {
51         __le64 bytenr;
52         __le32 size;
53 } __attribute__ ((__packed__));
54
55 struct meta_cluster_header {
56         __le64 magic;
57         __le64 bytenr;
58         __le32 nritems;
59         u8 compress;
60 } __attribute__ ((__packed__));
61
62 /* cluster header + index items + buffers */
63 struct meta_cluster {
64         struct meta_cluster_header header;
65         struct meta_cluster_item items[];
66 } __attribute__ ((__packed__));
67
68 #define ITEMS_PER_CLUSTER ((BLOCK_SIZE - sizeof(struct meta_cluster)) / \
69                            sizeof(struct meta_cluster_item))
70
71 struct fs_chunk {
72         u64 logical;
73         u64 physical;
74         /*
75          * physical_dup only store additonal physical for BTRFS_BLOCK_GROUP_DUP
76          * currently restore only support single and DUP
77          * TODO: modify this structure and the function related to this
78          * structure for support RAID*
79          */
80         u64 physical_dup;
81         u64 bytes;
82         struct rb_node l;
83         struct rb_node p;
84         struct list_head list;
85 };
86
87 struct async_work {
88         struct list_head list;
89         struct list_head ordered;
90         u64 start;
91         u64 size;
92         u8 *buffer;
93         size_t bufsize;
94         int error;
95 };
96
97 struct metadump_struct {
98         struct btrfs_root *root;
99         FILE *out;
100
101         union {
102                 struct meta_cluster cluster;
103                 char meta_cluster_bytes[BLOCK_SIZE];
104         };
105
106         pthread_t threads[MAX_WORKER_THREADS];
107         size_t num_threads;
108         pthread_mutex_t mutex;
109         pthread_cond_t cond;
110         struct rb_root name_tree;
111
112         struct list_head list;
113         struct list_head ordered;
114         size_t num_items;
115         size_t num_ready;
116
117         u64 pending_start;
118         u64 pending_size;
119
120         int compress_level;
121         int done;
122         int data;
123         int sanitize_names;
124
125         int error;
126 };
127
128 struct name {
129         struct rb_node n;
130         char *val;
131         char *sub;
132         u32 len;
133 };
134
135 struct mdrestore_struct {
136         FILE *in;
137         FILE *out;
138
139         pthread_t threads[MAX_WORKER_THREADS];
140         size_t num_threads;
141         pthread_mutex_t mutex;
142         pthread_cond_t cond;
143
144         struct rb_root chunk_tree;
145         struct rb_root physical_tree;
146         struct list_head list;
147         struct list_head overlapping_chunks;
148         size_t num_items;
149         u32 nodesize;
150         u64 devid;
151         u64 alloced_chunks;
152         u64 last_physical_offset;
153         u8 uuid[BTRFS_UUID_SIZE];
154         u8 fsid[BTRFS_FSID_SIZE];
155
156         int compress_method;
157         int done;
158         int error;
159         int old_restore;
160         int fixup_offset;
161         int multi_devices;
162         int clear_space_cache;
163         struct btrfs_fs_info *info;
164 };
165
166 static int search_for_chunk_blocks(struct mdrestore_struct *mdres,
167                                    u64 search, u64 cluster_bytenr);
168 static struct extent_buffer *alloc_dummy_eb(u64 bytenr, u32 size);
169
170 static void csum_block(u8 *buf, size_t len)
171 {
172         u8 result[BTRFS_CRC32_SIZE];
173         u32 crc = ~(u32)0;
174         crc = crc32c(crc, buf + BTRFS_CSUM_SIZE, len - BTRFS_CSUM_SIZE);
175         btrfs_csum_final(crc, result);
176         memcpy(buf, result, BTRFS_CRC32_SIZE);
177 }
178
179 static int has_name(struct btrfs_key *key)
180 {
181         switch (key->type) {
182         case BTRFS_DIR_ITEM_KEY:
183         case BTRFS_DIR_INDEX_KEY:
184         case BTRFS_INODE_REF_KEY:
185         case BTRFS_INODE_EXTREF_KEY:
186         case BTRFS_XATTR_ITEM_KEY:
187                 return 1;
188         default:
189                 break;
190         }
191
192         return 0;
193 }
194
195 static char *generate_garbage(u32 name_len)
196 {
197         char *buf = malloc(name_len);
198         int i;
199
200         if (!buf)
201                 return NULL;
202
203         for (i = 0; i < name_len; i++) {
204                 char c = rand_range(94) + 33;
205
206                 if (c == '/')
207                         c++;
208                 buf[i] = c;
209         }
210
211         return buf;
212 }
213
214 static int name_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
215 {
216         struct name *entry = rb_entry(a, struct name, n);
217         struct name *ins = rb_entry(b, struct name, n);
218         u32 len;
219
220         len = min(ins->len, entry->len);
221         return memcmp(ins->val, entry->val, len);
222 }
223
224 static int chunk_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
225 {
226         struct fs_chunk *entry = rb_entry(a, struct fs_chunk, l);
227         struct fs_chunk *ins = rb_entry(b, struct fs_chunk, l);
228
229         if (fuzz && ins->logical >= entry->logical &&
230             ins->logical < entry->logical + entry->bytes)
231                 return 0;
232
233         if (ins->logical < entry->logical)
234                 return -1;
235         else if (ins->logical > entry->logical)
236                 return 1;
237         return 0;
238 }
239
240 static int physical_cmp(struct rb_node *a, struct rb_node *b, int fuzz)
241 {
242         struct fs_chunk *entry = rb_entry(a, struct fs_chunk, p);
243         struct fs_chunk *ins = rb_entry(b, struct fs_chunk, p);
244
245         if (fuzz && ins->physical >= entry->physical &&
246             ins->physical < entry->physical + entry->bytes)
247                 return 0;
248
249         if (fuzz && entry->physical >= ins->physical &&
250             entry->physical < ins->physical + ins->bytes)
251                 return 0;
252
253         if (ins->physical < entry->physical)
254                 return -1;
255         else if (ins->physical > entry->physical)
256                 return 1;
257         return 0;
258 }
259
260 static void tree_insert(struct rb_root *root, struct rb_node *ins,
261                         int (*cmp)(struct rb_node *a, struct rb_node *b,
262                                    int fuzz))
263 {
264         struct rb_node ** p = &root->rb_node;
265         struct rb_node * parent = NULL;
266         int dir;
267
268         while(*p) {
269                 parent = *p;
270
271                 dir = cmp(*p, ins, 1);
272                 if (dir < 0)
273                         p = &(*p)->rb_left;
274                 else if (dir > 0)
275                         p = &(*p)->rb_right;
276                 else
277                         BUG();
278         }
279
280         rb_link_node(ins, parent, p);
281         rb_insert_color(ins, root);
282 }
283
284 static struct rb_node *tree_search(struct rb_root *root,
285                                    struct rb_node *search,
286                                    int (*cmp)(struct rb_node *a,
287                                               struct rb_node *b, int fuzz),
288                                    int fuzz)
289 {
290         struct rb_node *n = root->rb_node;
291         int dir;
292
293         while (n) {
294                 dir = cmp(n, search, fuzz);
295                 if (dir < 0)
296                         n = n->rb_left;
297                 else if (dir > 0)
298                         n = n->rb_right;
299                 else
300                         return n;
301         }
302
303         return NULL;
304 }
305
306 static u64 logical_to_physical(struct mdrestore_struct *mdres, u64 logical,
307                                u64 *size, u64 *physical_dup)
308 {
309         struct fs_chunk *fs_chunk;
310         struct rb_node *entry;
311         struct fs_chunk search;
312         u64 offset;
313
314         if (logical == BTRFS_SUPER_INFO_OFFSET)
315                 return logical;
316
317         search.logical = logical;
318         entry = tree_search(&mdres->chunk_tree, &search.l, chunk_cmp, 1);
319         if (!entry) {
320                 if (mdres->in != stdin)
321                         warning("cannot find a chunk, using logical");
322                 return logical;
323         }
324         fs_chunk = rb_entry(entry, struct fs_chunk, l);
325         if (fs_chunk->logical > logical || fs_chunk->logical + fs_chunk->bytes < logical)
326                 BUG();
327         offset = search.logical - fs_chunk->logical;
328
329         if (physical_dup) {
330                 /* Only in dup case, physical_dup is not equal to 0 */
331                 if (fs_chunk->physical_dup)
332                         *physical_dup = fs_chunk->physical_dup + offset;
333                 else
334                         *physical_dup = 0;
335         }
336
337         *size = min(*size, fs_chunk->bytes + fs_chunk->logical - logical);
338         return fs_chunk->physical + offset;
339 }
340
341 /*
342  * Reverse CRC-32C table
343  */
344 static const u32 crc32c_rev_table[256] = {
345         0x00000000L,0x05EC76F1L,0x0BD8EDE2L,0x0E349B13L,
346         0x17B1DBC4L,0x125DAD35L,0x1C693626L,0x198540D7L,
347         0x2F63B788L,0x2A8FC179L,0x24BB5A6AL,0x21572C9BL,
348         0x38D26C4CL,0x3D3E1ABDL,0x330A81AEL,0x36E6F75FL,
349         0x5EC76F10L,0x5B2B19E1L,0x551F82F2L,0x50F3F403L,
350         0x4976B4D4L,0x4C9AC225L,0x42AE5936L,0x47422FC7L,
351         0x71A4D898L,0x7448AE69L,0x7A7C357AL,0x7F90438BL,
352         0x6615035CL,0x63F975ADL,0x6DCDEEBEL,0x6821984FL,
353         0xBD8EDE20L,0xB862A8D1L,0xB65633C2L,0xB3BA4533L,
354         0xAA3F05E4L,0xAFD37315L,0xA1E7E806L,0xA40B9EF7L,
355         0x92ED69A8L,0x97011F59L,0x9935844AL,0x9CD9F2BBL,
356         0x855CB26CL,0x80B0C49DL,0x8E845F8EL,0x8B68297FL,
357         0xE349B130L,0xE6A5C7C1L,0xE8915CD2L,0xED7D2A23L,
358         0xF4F86AF4L,0xF1141C05L,0xFF208716L,0xFACCF1E7L,
359         0xCC2A06B8L,0xC9C67049L,0xC7F2EB5AL,0xC21E9DABL,
360         0xDB9BDD7CL,0xDE77AB8DL,0xD043309EL,0xD5AF466FL,
361         0x7EF1CAB1L,0x7B1DBC40L,0x75292753L,0x70C551A2L,
362         0x69401175L,0x6CAC6784L,0x6298FC97L,0x67748A66L,
363         0x51927D39L,0x547E0BC8L,0x5A4A90DBL,0x5FA6E62AL,
364         0x4623A6FDL,0x43CFD00CL,0x4DFB4B1FL,0x48173DEEL,
365         0x2036A5A1L,0x25DAD350L,0x2BEE4843L,0x2E023EB2L,
366         0x37877E65L,0x326B0894L,0x3C5F9387L,0x39B3E576L,
367         0x0F551229L,0x0AB964D8L,0x048DFFCBL,0x0161893AL,
368         0x18E4C9EDL,0x1D08BF1CL,0x133C240FL,0x16D052FEL,
369         0xC37F1491L,0xC6936260L,0xC8A7F973L,0xCD4B8F82L,
370         0xD4CECF55L,0xD122B9A4L,0xDF1622B7L,0xDAFA5446L,
371         0xEC1CA319L,0xE9F0D5E8L,0xE7C44EFBL,0xE228380AL,
372         0xFBAD78DDL,0xFE410E2CL,0xF075953FL,0xF599E3CEL,
373         0x9DB87B81L,0x98540D70L,0x96609663L,0x938CE092L,
374         0x8A09A045L,0x8FE5D6B4L,0x81D14DA7L,0x843D3B56L,
375         0xB2DBCC09L,0xB737BAF8L,0xB90321EBL,0xBCEF571AL,
376         0xA56A17CDL,0xA086613CL,0xAEB2FA2FL,0xAB5E8CDEL,
377         0xFDE39562L,0xF80FE393L,0xF63B7880L,0xF3D70E71L,
378         0xEA524EA6L,0xEFBE3857L,0xE18AA344L,0xE466D5B5L,
379         0xD28022EAL,0xD76C541BL,0xD958CF08L,0xDCB4B9F9L,
380         0xC531F92EL,0xC0DD8FDFL,0xCEE914CCL,0xCB05623DL,
381         0xA324FA72L,0xA6C88C83L,0xA8FC1790L,0xAD106161L,
382         0xB49521B6L,0xB1795747L,0xBF4DCC54L,0xBAA1BAA5L,
383         0x8C474DFAL,0x89AB3B0BL,0x879FA018L,0x8273D6E9L,
384         0x9BF6963EL,0x9E1AE0CFL,0x902E7BDCL,0x95C20D2DL,
385         0x406D4B42L,0x45813DB3L,0x4BB5A6A0L,0x4E59D051L,
386         0x57DC9086L,0x5230E677L,0x5C047D64L,0x59E80B95L,
387         0x6F0EFCCAL,0x6AE28A3BL,0x64D61128L,0x613A67D9L,
388         0x78BF270EL,0x7D5351FFL,0x7367CAECL,0x768BBC1DL,
389         0x1EAA2452L,0x1B4652A3L,0x1572C9B0L,0x109EBF41L,
390         0x091BFF96L,0x0CF78967L,0x02C31274L,0x072F6485L,
391         0x31C993DAL,0x3425E52BL,0x3A117E38L,0x3FFD08C9L,
392         0x2678481EL,0x23943EEFL,0x2DA0A5FCL,0x284CD30DL,
393         0x83125FD3L,0x86FE2922L,0x88CAB231L,0x8D26C4C0L,
394         0x94A38417L,0x914FF2E6L,0x9F7B69F5L,0x9A971F04L,
395         0xAC71E85BL,0xA99D9EAAL,0xA7A905B9L,0xA2457348L,
396         0xBBC0339FL,0xBE2C456EL,0xB018DE7DL,0xB5F4A88CL,
397         0xDDD530C3L,0xD8394632L,0xD60DDD21L,0xD3E1ABD0L,
398         0xCA64EB07L,0xCF889DF6L,0xC1BC06E5L,0xC4507014L,
399         0xF2B6874BL,0xF75AF1BAL,0xF96E6AA9L,0xFC821C58L,
400         0xE5075C8FL,0xE0EB2A7EL,0xEEDFB16DL,0xEB33C79CL,
401         0x3E9C81F3L,0x3B70F702L,0x35446C11L,0x30A81AE0L,
402         0x292D5A37L,0x2CC12CC6L,0x22F5B7D5L,0x2719C124L,
403         0x11FF367BL,0x1413408AL,0x1A27DB99L,0x1FCBAD68L,
404         0x064EEDBFL,0x03A29B4EL,0x0D96005DL,0x087A76ACL,
405         0x605BEEE3L,0x65B79812L,0x6B830301L,0x6E6F75F0L,
406         0x77EA3527L,0x720643D6L,0x7C32D8C5L,0x79DEAE34L,
407         0x4F38596BL,0x4AD42F9AL,0x44E0B489L,0x410CC278L,
408         0x588982AFL,0x5D65F45EL,0x53516F4DL,0x56BD19BCL
409 };
410
411 /*
412  * Calculate a 4-byte suffix to match desired CRC32C
413  *
414  * @current_crc: CRC32C checksum of all bytes before the suffix
415  * @desired_crc: the checksum that we want to get after adding the suffix
416  *
417  * Outputs: @suffix: pointer to where the suffix will be written (4-bytes)
418  */
419 static void find_collision_calc_suffix(unsigned long current_crc,
420                                        unsigned long desired_crc,
421                                        char *suffix)
422 {
423         int i;
424
425         for(i = 3; i >= 0; i--) {
426                 desired_crc = (desired_crc << 8)
427                             ^ crc32c_rev_table[desired_crc >> 24 & 0xFF]
428                             ^ ((current_crc >> i * 8) & 0xFF);
429         }
430         for (i = 0; i < 4; i++)
431                 suffix[i] = (desired_crc >> i * 8) & 0xFF;
432 }
433
434 /*
435  * Check if suffix is valid according to our file name conventions
436  */
437 static int find_collision_is_suffix_valid(const char *suffix)
438 {
439         int i;
440         char c;
441
442         for (i = 0; i < 4; i++) {
443                 c = suffix[i];
444                 if (c < ' ' || c > 126 || c == '/')
445                         return 0;
446         }
447         return 1;
448 }
449
450 static int find_collision_reverse_crc32c(struct name *val, u32 name_len)
451 {
452         unsigned long checksum;
453         unsigned long current_checksum;
454         int found = 0;
455         int i;
456
457         /* There are no same length collisions of 4 or less bytes */
458         if (name_len <= 4)
459                 return 0;
460         checksum = crc32c(~1, val->val, name_len);
461         name_len -= 4;
462         memset(val->sub, ' ', name_len);
463         i = 0;
464         while (1) {
465                 current_checksum = crc32c(~1, val->sub, name_len);
466                 find_collision_calc_suffix(current_checksum,
467                                            checksum,
468                                            val->sub + name_len);
469                 if (find_collision_is_suffix_valid(val->sub + name_len) &&
470                     memcmp(val->sub, val->val, val->len)) {
471                         found = 1;
472                         break;
473                 }
474
475                 if (val->sub[i] == 126) {
476                         do {
477                                 i++;
478                                 if (i >= name_len)
479                                         break;
480                         } while (val->sub[i] == 126);
481
482                         if (i >= name_len)
483                                 break;
484                         val->sub[i]++;
485                         if (val->sub[i] == '/')
486                                 val->sub[i]++;
487                         memset(val->sub, ' ', i);
488                         i = 0;
489                         continue;
490                 } else {
491                         val->sub[i]++;
492                         if (val->sub[i] == '/')
493                                 val->sub[i]++;
494                 }
495         }
496         return found;
497 }
498
499 static char *find_collision(struct metadump_struct *md, char *name,
500                             u32 name_len)
501 {
502         struct name *val;
503         struct rb_node *entry;
504         struct name tmp;
505         int found;
506         int i;
507
508         tmp.val = name;
509         tmp.len = name_len;
510         entry = tree_search(&md->name_tree, &tmp.n, name_cmp, 0);
511         if (entry) {
512                 val = rb_entry(entry, struct name, n);
513                 free(name);
514                 return val->sub;
515         }
516
517         val = malloc(sizeof(struct name));
518         if (!val) {
519                 error("cannot sanitize name, not enough memory");
520                 free(name);
521                 return NULL;
522         }
523
524         memset(val, 0, sizeof(*val));
525
526         val->val = name;
527         val->len = name_len;
528         val->sub = malloc(name_len);
529         if (!val->sub) {
530                 error("cannot sanitize name, not enough memory");
531                 free(val);
532                 free(name);
533                 return NULL;
534         }
535
536         found = find_collision_reverse_crc32c(val, name_len);
537
538         if (!found) {
539                 warning(
540 "cannot find a hash collision for '%.*s', generating garbage, it won't match indexes",
541                         val->len, val->val);
542                 for (i = 0; i < name_len; i++) {
543                         char c = rand_range(94) + 33;
544
545                         if (c == '/')
546                                 c++;
547                         val->sub[i] = c;
548                 }
549         }
550
551         tree_insert(&md->name_tree, &val->n, name_cmp);
552         return val->sub;
553 }
554
555 static void sanitize_dir_item(struct metadump_struct *md, struct extent_buffer *eb,
556                               int slot)
557 {
558         struct btrfs_dir_item *dir_item;
559         char *buf;
560         char *garbage;
561         unsigned long name_ptr;
562         u32 total_len;
563         u32 cur = 0;
564         u32 this_len;
565         u32 name_len;
566         int free_garbage = (md->sanitize_names == 1);
567
568         dir_item = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
569         total_len = btrfs_item_size_nr(eb, slot);
570         while (cur < total_len) {
571                 this_len = sizeof(*dir_item) +
572                         btrfs_dir_name_len(eb, dir_item) +
573                         btrfs_dir_data_len(eb, dir_item);
574                 name_ptr = (unsigned long)(dir_item + 1);
575                 name_len = btrfs_dir_name_len(eb, dir_item);
576
577                 if (md->sanitize_names > 1) {
578                         buf = malloc(name_len);
579                         if (!buf) {
580                                 error("cannot sanitize name, not enough memory");
581                                 return;
582                         }
583                         read_extent_buffer(eb, buf, name_ptr, name_len);
584                         garbage = find_collision(md, buf, name_len);
585                 } else {
586                         garbage = generate_garbage(name_len);
587                 }
588                 if (!garbage) {
589                         error("cannot sanitize name, not enough memory");
590                         return;
591                 }
592                 write_extent_buffer(eb, garbage, name_ptr, name_len);
593                 cur += this_len;
594                 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
595                                                      this_len);
596                 if (free_garbage)
597                         free(garbage);
598         }
599 }
600
601 static void sanitize_inode_ref(struct metadump_struct *md,
602                                struct extent_buffer *eb, int slot, int ext)
603 {
604         struct btrfs_inode_extref *extref;
605         struct btrfs_inode_ref *ref;
606         char *garbage, *buf;
607         unsigned long ptr;
608         unsigned long name_ptr;
609         u32 item_size;
610         u32 cur_offset = 0;
611         int len;
612         int free_garbage = (md->sanitize_names == 1);
613
614         item_size = btrfs_item_size_nr(eb, slot);
615         ptr = btrfs_item_ptr_offset(eb, slot);
616         while (cur_offset < item_size) {
617                 if (ext) {
618                         extref = (struct btrfs_inode_extref *)(ptr +
619                                                                cur_offset);
620                         name_ptr = (unsigned long)(&extref->name);
621                         len = btrfs_inode_extref_name_len(eb, extref);
622                         cur_offset += sizeof(*extref);
623                 } else {
624                         ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
625                         len = btrfs_inode_ref_name_len(eb, ref);
626                         name_ptr = (unsigned long)(ref + 1);
627                         cur_offset += sizeof(*ref);
628                 }
629                 cur_offset += len;
630
631                 if (md->sanitize_names > 1) {
632                         buf = malloc(len);
633                         if (!buf) {
634                                 error("cannot sanitize name, not enough memory");
635                                 return;
636                         }
637                         read_extent_buffer(eb, buf, name_ptr, len);
638                         garbage = find_collision(md, buf, len);
639                 } else {
640                         garbage = generate_garbage(len);
641                 }
642
643                 if (!garbage) {
644                         error("cannot sanitize name, not enough memory");
645                         return;
646                 }
647                 write_extent_buffer(eb, garbage, name_ptr, len);
648                 if (free_garbage)
649                         free(garbage);
650         }
651 }
652
653 static void sanitize_xattr(struct metadump_struct *md,
654                            struct extent_buffer *eb, int slot)
655 {
656         struct btrfs_dir_item *dir_item;
657         unsigned long data_ptr;
658         u32 data_len;
659
660         dir_item = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
661         data_len = btrfs_dir_data_len(eb, dir_item);
662
663         data_ptr = (unsigned long)((char *)(dir_item + 1) +
664                                    btrfs_dir_name_len(eb, dir_item));
665         memset_extent_buffer(eb, 0, data_ptr, data_len);
666 }
667
668 static void sanitize_name(struct metadump_struct *md, u8 *dst,
669                           struct extent_buffer *src, struct btrfs_key *key,
670                           int slot)
671 {
672         struct extent_buffer *eb;
673
674         eb = alloc_dummy_eb(src->start, src->len);
675         if (!eb) {
676                 error("cannot sanitize name, not enough memory");
677                 return;
678         }
679
680         memcpy(eb->data, src->data, src->len);
681
682         switch (key->type) {
683         case BTRFS_DIR_ITEM_KEY:
684         case BTRFS_DIR_INDEX_KEY:
685                 sanitize_dir_item(md, eb, slot);
686                 break;
687         case BTRFS_INODE_REF_KEY:
688                 sanitize_inode_ref(md, eb, slot, 0);
689                 break;
690         case BTRFS_INODE_EXTREF_KEY:
691                 sanitize_inode_ref(md, eb, slot, 1);
692                 break;
693         case BTRFS_XATTR_ITEM_KEY:
694                 sanitize_xattr(md, eb, slot);
695                 break;
696         default:
697                 break;
698         }
699
700         memcpy(dst, eb->data, eb->len);
701         free(eb);
702 }
703
704 /*
705  * zero inline extents and csum items
706  */
707 static void zero_items(struct metadump_struct *md, u8 *dst,
708                        struct extent_buffer *src)
709 {
710         struct btrfs_file_extent_item *fi;
711         struct btrfs_item *item;
712         struct btrfs_key key;
713         u32 nritems = btrfs_header_nritems(src);
714         size_t size;
715         unsigned long ptr;
716         int i, extent_type;
717
718         for (i = 0; i < nritems; i++) {
719                 item = btrfs_item_nr(i);
720                 btrfs_item_key_to_cpu(src, &key, i);
721                 if (key.type == BTRFS_CSUM_ITEM_KEY) {
722                         size = btrfs_item_size_nr(src, i);
723                         memset(dst + btrfs_leaf_data(src) +
724                                btrfs_item_offset_nr(src, i), 0, size);
725                         continue;
726                 }
727
728                 if (md->sanitize_names && has_name(&key)) {
729                         sanitize_name(md, dst, src, &key, i);
730                         continue;
731                 }
732
733                 if (key.type != BTRFS_EXTENT_DATA_KEY)
734                         continue;
735
736                 fi = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
737                 extent_type = btrfs_file_extent_type(src, fi);
738                 if (extent_type != BTRFS_FILE_EXTENT_INLINE)
739                         continue;
740
741                 ptr = btrfs_file_extent_inline_start(fi);
742                 size = btrfs_file_extent_inline_item_len(src, item);
743                 memset(dst + ptr, 0, size);
744         }
745 }
746
747 /*
748  * copy buffer and zero useless data in the buffer
749  */
750 static void copy_buffer(struct metadump_struct *md, u8 *dst,
751                         struct extent_buffer *src)
752 {
753         int level;
754         size_t size;
755         u32 nritems;
756
757         memcpy(dst, src->data, src->len);
758         if (src->start == BTRFS_SUPER_INFO_OFFSET)
759                 return;
760
761         level = btrfs_header_level(src);
762         nritems = btrfs_header_nritems(src);
763
764         if (nritems == 0) {
765                 size = sizeof(struct btrfs_header);
766                 memset(dst + size, 0, src->len - size);
767         } else if (level == 0) {
768                 size = btrfs_leaf_data(src) +
769                         btrfs_item_offset_nr(src, nritems - 1) -
770                         btrfs_item_nr_offset(nritems);
771                 memset(dst + btrfs_item_nr_offset(nritems), 0, size);
772                 zero_items(md, dst, src);
773         } else {
774                 size = offsetof(struct btrfs_node, ptrs) +
775                         sizeof(struct btrfs_key_ptr) * nritems;
776                 memset(dst + size, 0, src->len - size);
777         }
778         csum_block(dst, src->len);
779 }
780
781 static void *dump_worker(void *data)
782 {
783         struct metadump_struct *md = (struct metadump_struct *)data;
784         struct async_work *async;
785         int ret;
786
787         while (1) {
788                 pthread_mutex_lock(&md->mutex);
789                 while (list_empty(&md->list)) {
790                         if (md->done) {
791                                 pthread_mutex_unlock(&md->mutex);
792                                 goto out;
793                         }
794                         pthread_cond_wait(&md->cond, &md->mutex);
795                 }
796                 async = list_entry(md->list.next, struct async_work, list);
797                 list_del_init(&async->list);
798                 pthread_mutex_unlock(&md->mutex);
799
800                 if (md->compress_level > 0) {
801                         u8 *orig = async->buffer;
802
803                         async->bufsize = compressBound(async->size);
804                         async->buffer = malloc(async->bufsize);
805                         if (!async->buffer) {
806                                 error("not enough memory for async buffer");
807                                 pthread_mutex_lock(&md->mutex);
808                                 if (!md->error)
809                                         md->error = -ENOMEM;
810                                 pthread_mutex_unlock(&md->mutex);
811                                 pthread_exit(NULL);
812                         }
813
814                         ret = compress2(async->buffer,
815                                          (unsigned long *)&async->bufsize,
816                                          orig, async->size, md->compress_level);
817
818                         if (ret != Z_OK)
819                                 async->error = 1;
820
821                         free(orig);
822                 }
823
824                 pthread_mutex_lock(&md->mutex);
825                 md->num_ready++;
826                 pthread_mutex_unlock(&md->mutex);
827         }
828 out:
829         pthread_exit(NULL);
830 }
831
832 static void meta_cluster_init(struct metadump_struct *md, u64 start)
833 {
834         struct meta_cluster_header *header;
835
836         md->num_items = 0;
837         md->num_ready = 0;
838         header = &md->cluster.header;
839         header->magic = cpu_to_le64(HEADER_MAGIC);
840         header->bytenr = cpu_to_le64(start);
841         header->nritems = cpu_to_le32(0);
842         header->compress = md->compress_level > 0 ?
843                            COMPRESS_ZLIB : COMPRESS_NONE;
844 }
845
846 static void metadump_destroy(struct metadump_struct *md, int num_threads)
847 {
848         int i;
849         struct rb_node *n;
850
851         pthread_mutex_lock(&md->mutex);
852         md->done = 1;
853         pthread_cond_broadcast(&md->cond);
854         pthread_mutex_unlock(&md->mutex);
855
856         for (i = 0; i < num_threads; i++)
857                 pthread_join(md->threads[i], NULL);
858
859         pthread_cond_destroy(&md->cond);
860         pthread_mutex_destroy(&md->mutex);
861
862         while ((n = rb_first(&md->name_tree))) {
863                 struct name *name;
864
865                 name = rb_entry(n, struct name, n);
866                 rb_erase(n, &md->name_tree);
867                 free(name->val);
868                 free(name->sub);
869                 free(name);
870         }
871 }
872
873 static int metadump_init(struct metadump_struct *md, struct btrfs_root *root,
874                          FILE *out, int num_threads, int compress_level,
875                          int sanitize_names)
876 {
877         int i, ret = 0;
878
879         memset(md, 0, sizeof(*md));
880         INIT_LIST_HEAD(&md->list);
881         INIT_LIST_HEAD(&md->ordered);
882         md->root = root;
883         md->out = out;
884         md->pending_start = (u64)-1;
885         md->compress_level = compress_level;
886         md->sanitize_names = sanitize_names;
887         if (sanitize_names > 1)
888                 crc32c_optimization_init();
889
890         md->name_tree.rb_node = NULL;
891         md->num_threads = num_threads;
892         pthread_cond_init(&md->cond, NULL);
893         pthread_mutex_init(&md->mutex, NULL);
894         meta_cluster_init(md, 0);
895
896         if (!num_threads)
897                 return 0;
898
899         for (i = 0; i < num_threads; i++) {
900                 ret = pthread_create(md->threads + i, NULL, dump_worker, md);
901                 if (ret)
902                         break;
903         }
904
905         if (ret)
906                 metadump_destroy(md, i + 1);
907
908         return ret;
909 }
910
911 static int write_zero(FILE *out, size_t size)
912 {
913         static char zero[BLOCK_SIZE];
914         return fwrite(zero, size, 1, out);
915 }
916
917 static int write_buffers(struct metadump_struct *md, u64 *next)
918 {
919         struct meta_cluster_header *header = &md->cluster.header;
920         struct meta_cluster_item *item;
921         struct async_work *async;
922         u64 bytenr = 0;
923         u32 nritems = 0;
924         int ret;
925         int err = 0;
926
927         if (list_empty(&md->ordered))
928                 goto out;
929
930         /* wait until all buffers are compressed */
931         while (!err && md->num_items > md->num_ready) {
932                 struct timespec ts = {
933                         .tv_sec = 0,
934                         .tv_nsec = 10000000,
935                 };
936                 pthread_mutex_unlock(&md->mutex);
937                 nanosleep(&ts, NULL);
938                 pthread_mutex_lock(&md->mutex);
939                 err = md->error;
940         }
941
942         if (err) {
943                 error("one of the threads failed: %s", strerror(-err));
944                 goto out;
945         }
946
947         /* setup and write index block */
948         list_for_each_entry(async, &md->ordered, ordered) {
949                 item = &md->cluster.items[nritems];
950                 item->bytenr = cpu_to_le64(async->start);
951                 item->size = cpu_to_le32(async->bufsize);
952                 nritems++;
953         }
954         header->nritems = cpu_to_le32(nritems);
955
956         ret = fwrite(&md->cluster, BLOCK_SIZE, 1, md->out);
957         if (ret != 1) {
958                 error("unable to write out cluster: %s", strerror(errno));
959                 return -errno;
960         }
961
962         /* write buffers */
963         bytenr += le64_to_cpu(header->bytenr) + BLOCK_SIZE;
964         while (!list_empty(&md->ordered)) {
965                 async = list_entry(md->ordered.next, struct async_work,
966                                    ordered);
967                 list_del_init(&async->ordered);
968
969                 bytenr += async->bufsize;
970                 if (!err)
971                         ret = fwrite(async->buffer, async->bufsize, 1,
972                                      md->out);
973                 if (ret != 1) {
974                         error("unable to write out cluster: %s",
975                                 strerror(errno));
976                         err = -errno;
977                         ret = 0;
978                 }
979
980                 free(async->buffer);
981                 free(async);
982         }
983
984         /* zero unused space in the last block */
985         if (!err && bytenr & BLOCK_MASK) {
986                 size_t size = BLOCK_SIZE - (bytenr & BLOCK_MASK);
987
988                 bytenr += size;
989                 ret = write_zero(md->out, size);
990                 if (ret != 1) {
991                         error("unable to zero out buffer: %s",
992                                 strerror(errno));
993                         err = -errno;
994                 }
995         }
996 out:
997         *next = bytenr;
998         return err;
999 }
1000
1001 static int read_data_extent(struct metadump_struct *md,
1002                             struct async_work *async)
1003 {
1004         struct btrfs_root *root = md->root;
1005         struct btrfs_fs_info *fs_info = root->fs_info;
1006         u64 bytes_left = async->size;
1007         u64 logical = async->start;
1008         u64 offset = 0;
1009         u64 read_len;
1010         int num_copies;
1011         int cur_mirror;
1012         int ret;
1013
1014         num_copies = btrfs_num_copies(root->fs_info, logical, bytes_left);
1015
1016         /* Try our best to read data, just like read_tree_block() */
1017         for (cur_mirror = 0; cur_mirror < num_copies; cur_mirror++) {
1018                 while (bytes_left) {
1019                         read_len = bytes_left;
1020                         ret = read_extent_data(fs_info,
1021                                         (char *)(async->buffer + offset),
1022                                         logical, &read_len, cur_mirror);
1023                         if (ret < 0)
1024                                 break;
1025                         offset += read_len;
1026                         logical += read_len;
1027                         bytes_left -= read_len;
1028                 }
1029         }
1030         if (bytes_left)
1031                 return -EIO;
1032         return 0;
1033 }
1034
1035 static int get_dev_fd(struct btrfs_root *root)
1036 {
1037         struct btrfs_device *dev;
1038
1039         dev = list_first_entry(&root->fs_info->fs_devices->devices,
1040                                struct btrfs_device, dev_list);
1041         return dev->fd;
1042 }
1043
1044 static int flush_pending(struct metadump_struct *md, int done)
1045 {
1046         struct async_work *async = NULL;
1047         struct extent_buffer *eb;
1048         u64 start = 0;
1049         u64 size;
1050         size_t offset;
1051         int ret = 0;
1052
1053         if (md->pending_size) {
1054                 async = calloc(1, sizeof(*async));
1055                 if (!async)
1056                         return -ENOMEM;
1057
1058                 async->start = md->pending_start;
1059                 async->size = md->pending_size;
1060                 async->bufsize = async->size;
1061                 async->buffer = malloc(async->bufsize);
1062                 if (!async->buffer) {
1063                         free(async);
1064                         return -ENOMEM;
1065                 }
1066                 offset = 0;
1067                 start = async->start;
1068                 size = async->size;
1069
1070                 if (md->data) {
1071                         ret = read_data_extent(md, async);
1072                         if (ret) {
1073                                 free(async->buffer);
1074                                 free(async);
1075                                 return ret;
1076                         }
1077                 }
1078
1079                 /*
1080                  * Balance can make the mapping not cover the super block, so
1081                  * just copy directly from one of the devices.
1082                  */
1083                 if (start == BTRFS_SUPER_INFO_OFFSET) {
1084                         int fd = get_dev_fd(md->root);
1085
1086                         ret = pread64(fd, async->buffer, size, start);
1087                         if (ret < size) {
1088                                 free(async->buffer);
1089                                 free(async);
1090                                 error("unable to read superblock at %llu: %s",
1091                                                 (unsigned long long)start,
1092                                                 strerror(errno));
1093                                 return -errno;
1094                         }
1095                         size = 0;
1096                         ret = 0;
1097                 }
1098
1099                 while (!md->data && size > 0) {
1100                         u64 this_read = min((u64)md->root->fs_info->nodesize,
1101                                         size);
1102
1103                         eb = read_tree_block(md->root->fs_info, start, 0);
1104                         if (!extent_buffer_uptodate(eb)) {
1105                                 free(async->buffer);
1106                                 free(async);
1107                                 error("unable to read metadata block %llu",
1108                                         (unsigned long long)start);
1109                                 return -EIO;
1110                         }
1111                         copy_buffer(md, async->buffer + offset, eb);
1112                         free_extent_buffer(eb);
1113                         start += this_read;
1114                         offset += this_read;
1115                         size -= this_read;
1116                 }
1117
1118                 md->pending_start = (u64)-1;
1119                 md->pending_size = 0;
1120         } else if (!done) {
1121                 return 0;
1122         }
1123
1124         pthread_mutex_lock(&md->mutex);
1125         if (async) {
1126                 list_add_tail(&async->ordered, &md->ordered);
1127                 md->num_items++;
1128                 if (md->compress_level > 0) {
1129                         list_add_tail(&async->list, &md->list);
1130                         pthread_cond_signal(&md->cond);
1131                 } else {
1132                         md->num_ready++;
1133                 }
1134         }
1135         if (md->num_items >= ITEMS_PER_CLUSTER || done) {
1136                 ret = write_buffers(md, &start);
1137                 if (ret)
1138                         error("unable to write buffers: %s", strerror(-ret));
1139                 else
1140                         meta_cluster_init(md, start);
1141         }
1142         pthread_mutex_unlock(&md->mutex);
1143         return ret;
1144 }
1145
1146 static int add_extent(u64 start, u64 size, struct metadump_struct *md,
1147                       int data)
1148 {
1149         int ret;
1150         if (md->data != data ||
1151             md->pending_size + size > MAX_PENDING_SIZE ||
1152             md->pending_start + md->pending_size != start) {
1153                 ret = flush_pending(md, 0);
1154                 if (ret)
1155                         return ret;
1156                 md->pending_start = start;
1157         }
1158         readahead_tree_block(md->root->fs_info, start, 0);
1159         md->pending_size += size;
1160         md->data = data;
1161         return 0;
1162 }
1163
1164 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1165 static int is_tree_block(struct btrfs_root *extent_root,
1166                          struct btrfs_path *path, u64 bytenr)
1167 {
1168         struct extent_buffer *leaf;
1169         struct btrfs_key key;
1170         u64 ref_objectid;
1171         int ret;
1172
1173         leaf = path->nodes[0];
1174         while (1) {
1175                 struct btrfs_extent_ref_v0 *ref_item;
1176                 path->slots[0]++;
1177                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1178                         ret = btrfs_next_leaf(extent_root, path);
1179                         if (ret < 0)
1180                                 return ret;
1181                         if (ret > 0)
1182                                 break;
1183                         leaf = path->nodes[0];
1184                 }
1185                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1186                 if (key.objectid != bytenr)
1187                         break;
1188                 if (key.type != BTRFS_EXTENT_REF_V0_KEY)
1189                         continue;
1190                 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1191                                           struct btrfs_extent_ref_v0);
1192                 ref_objectid = btrfs_ref_objectid_v0(leaf, ref_item);
1193                 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID)
1194                         return 1;
1195                 break;
1196         }
1197         return 0;
1198 }
1199 #endif
1200
1201 static int copy_tree_blocks(struct btrfs_root *root, struct extent_buffer *eb,
1202                             struct metadump_struct *metadump, int root_tree)
1203 {
1204         struct extent_buffer *tmp;
1205         struct btrfs_root_item *ri;
1206         struct btrfs_key key;
1207         struct btrfs_fs_info *fs_info = root->fs_info;
1208         u64 bytenr;
1209         int level;
1210         int nritems = 0;
1211         int i = 0;
1212         int ret;
1213
1214         ret = add_extent(btrfs_header_bytenr(eb), fs_info->nodesize,
1215                          metadump, 0);
1216         if (ret) {
1217                 error("unable to add metadata block %llu: %d",
1218                                 btrfs_header_bytenr(eb), ret);
1219                 return ret;
1220         }
1221
1222         if (btrfs_header_level(eb) == 0 && !root_tree)
1223                 return 0;
1224
1225         level = btrfs_header_level(eb);
1226         nritems = btrfs_header_nritems(eb);
1227         for (i = 0; i < nritems; i++) {
1228                 if (level == 0) {
1229                         btrfs_item_key_to_cpu(eb, &key, i);
1230                         if (key.type != BTRFS_ROOT_ITEM_KEY)
1231                                 continue;
1232                         ri = btrfs_item_ptr(eb, i, struct btrfs_root_item);
1233                         bytenr = btrfs_disk_root_bytenr(eb, ri);
1234                         tmp = read_tree_block(fs_info, bytenr, 0);
1235                         if (!extent_buffer_uptodate(tmp)) {
1236                                 error("unable to read log root block");
1237                                 return -EIO;
1238                         }
1239                         ret = copy_tree_blocks(root, tmp, metadump, 0);
1240                         free_extent_buffer(tmp);
1241                         if (ret)
1242                                 return ret;
1243                 } else {
1244                         bytenr = btrfs_node_blockptr(eb, i);
1245                         tmp = read_tree_block(fs_info, bytenr, 0);
1246                         if (!extent_buffer_uptodate(tmp)) {
1247                                 error("unable to read log root block");
1248                                 return -EIO;
1249                         }
1250                         ret = copy_tree_blocks(root, tmp, metadump, root_tree);
1251                         free_extent_buffer(tmp);
1252                         if (ret)
1253                                 return ret;
1254                 }
1255         }
1256
1257         return 0;
1258 }
1259
1260 static int copy_log_trees(struct btrfs_root *root,
1261                           struct metadump_struct *metadump)
1262 {
1263         u64 blocknr = btrfs_super_log_root(root->fs_info->super_copy);
1264
1265         if (blocknr == 0)
1266                 return 0;
1267
1268         if (!root->fs_info->log_root_tree ||
1269             !root->fs_info->log_root_tree->node) {
1270                 error("unable to copy tree log, it has not been setup");
1271                 return -EIO;
1272         }
1273
1274         return copy_tree_blocks(root, root->fs_info->log_root_tree->node,
1275                                 metadump, 1);
1276 }
1277
1278 static int copy_space_cache(struct btrfs_root *root,
1279                             struct metadump_struct *metadump,
1280                             struct btrfs_path *path)
1281 {
1282         struct extent_buffer *leaf;
1283         struct btrfs_file_extent_item *fi;
1284         struct btrfs_key key;
1285         u64 bytenr, num_bytes;
1286         int ret;
1287
1288         root = root->fs_info->tree_root;
1289
1290         key.objectid = 0;
1291         key.type = BTRFS_EXTENT_DATA_KEY;
1292         key.offset = 0;
1293
1294         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1295         if (ret < 0) {
1296                 error("free space inode not found: %d", ret);
1297                 return ret;
1298         }
1299
1300         leaf = path->nodes[0];
1301
1302         while (1) {
1303                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1304                         ret = btrfs_next_leaf(root, path);
1305                         if (ret < 0) {
1306                                 error("cannot go to next leaf %d", ret);
1307                                 return ret;
1308                         }
1309                         if (ret > 0)
1310                                 break;
1311                         leaf = path->nodes[0];
1312                 }
1313
1314                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1315                 if (key.type != BTRFS_EXTENT_DATA_KEY) {
1316                         path->slots[0]++;
1317                         continue;
1318                 }
1319
1320                 fi = btrfs_item_ptr(leaf, path->slots[0],
1321                                     struct btrfs_file_extent_item);
1322                 if (btrfs_file_extent_type(leaf, fi) !=
1323                     BTRFS_FILE_EXTENT_REG) {
1324                         path->slots[0]++;
1325                         continue;
1326                 }
1327
1328                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1329                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1330                 ret = add_extent(bytenr, num_bytes, metadump, 1);
1331                 if (ret) {
1332                         error("unable to add space cache blocks %d", ret);
1333                         btrfs_release_path(path);
1334                         return ret;
1335                 }
1336                 path->slots[0]++;
1337         }
1338
1339         return 0;
1340 }
1341
1342 static int copy_from_extent_tree(struct metadump_struct *metadump,
1343                                  struct btrfs_path *path)
1344 {
1345         struct btrfs_root *extent_root;
1346         struct extent_buffer *leaf;
1347         struct btrfs_extent_item *ei;
1348         struct btrfs_key key;
1349         u64 bytenr;
1350         u64 num_bytes;
1351         int ret;
1352
1353         extent_root = metadump->root->fs_info->extent_root;
1354         bytenr = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
1355         key.objectid = bytenr;
1356         key.type = BTRFS_EXTENT_ITEM_KEY;
1357         key.offset = 0;
1358
1359         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1360         if (ret < 0) {
1361                 error("extent root not found: %d", ret);
1362                 return ret;
1363         }
1364         ret = 0;
1365
1366         leaf = path->nodes[0];
1367
1368         while (1) {
1369                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1370                         ret = btrfs_next_leaf(extent_root, path);
1371                         if (ret < 0) {
1372                                 error("cannot go to next leaf %d", ret);
1373                                 break;
1374                         }
1375                         if (ret > 0) {
1376                                 ret = 0;
1377                                 break;
1378                         }
1379                         leaf = path->nodes[0];
1380                 }
1381
1382                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1383                 if (key.objectid < bytenr ||
1384                     (key.type != BTRFS_EXTENT_ITEM_KEY &&
1385                      key.type != BTRFS_METADATA_ITEM_KEY)) {
1386                         path->slots[0]++;
1387                         continue;
1388                 }
1389
1390                 bytenr = key.objectid;
1391                 if (key.type == BTRFS_METADATA_ITEM_KEY) {
1392                         num_bytes = extent_root->fs_info->nodesize;
1393                 } else {
1394                         num_bytes = key.offset;
1395                 }
1396
1397                 if (num_bytes == 0) {
1398                         error("extent length 0 at bytenr %llu key type %d",
1399                                         (unsigned long long)bytenr, key.type);
1400                         ret = -EIO;
1401                         break;
1402                 }
1403
1404                 if (btrfs_item_size_nr(leaf, path->slots[0]) > sizeof(*ei)) {
1405                         ei = btrfs_item_ptr(leaf, path->slots[0],
1406                                             struct btrfs_extent_item);
1407                         if (btrfs_extent_flags(leaf, ei) &
1408                             BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1409                                 ret = add_extent(bytenr, num_bytes, metadump,
1410                                                  0);
1411                                 if (ret) {
1412                                         error("unable to add block %llu: %d",
1413                                                 (unsigned long long)bytenr, ret);
1414                                         break;
1415                                 }
1416                         }
1417                 } else {
1418 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1419                         ret = is_tree_block(extent_root, path, bytenr);
1420                         if (ret < 0) {
1421                                 error("failed to check tree block %llu: %d",
1422                                         (unsigned long long)bytenr, ret);
1423                                 break;
1424                         }
1425
1426                         if (ret) {
1427                                 ret = add_extent(bytenr, num_bytes, metadump,
1428                                                  0);
1429                                 if (ret) {
1430                                         error("unable to add block %llu: %d",
1431                                                 (unsigned long long)bytenr, ret);
1432                                         break;
1433                                 }
1434                         }
1435                         ret = 0;
1436 #else
1437                         error(
1438         "either extent tree is corrupted or you haven't built with V0 support");
1439                         ret = -EIO;
1440                         break;
1441 #endif
1442                 }
1443                 bytenr += num_bytes;
1444         }
1445
1446         btrfs_release_path(path);
1447
1448         return ret;
1449 }
1450
1451 static int create_metadump(const char *input, FILE *out, int num_threads,
1452                            int compress_level, int sanitize, int walk_trees)
1453 {
1454         struct btrfs_root *root;
1455         struct btrfs_path path;
1456         struct metadump_struct metadump;
1457         int ret;
1458         int err = 0;
1459
1460         root = open_ctree(input, 0, 0);
1461         if (!root) {
1462                 error("open ctree failed");
1463                 return -EIO;
1464         }
1465
1466         ret = metadump_init(&metadump, root, out, num_threads,
1467                             compress_level, sanitize);
1468         if (ret) {
1469                 error("failed to initialize metadump: %d", ret);
1470                 close_ctree(root);
1471                 return ret;
1472         }
1473
1474         ret = add_extent(BTRFS_SUPER_INFO_OFFSET, BTRFS_SUPER_INFO_SIZE,
1475                         &metadump, 0);
1476         if (ret) {
1477                 error("unable to add metadata: %d", ret);
1478                 err = ret;
1479                 goto out;
1480         }
1481
1482         btrfs_init_path(&path);
1483
1484         if (walk_trees) {
1485                 ret = copy_tree_blocks(root, root->fs_info->chunk_root->node,
1486                                        &metadump, 1);
1487                 if (ret) {
1488                         err = ret;
1489                         goto out;
1490                 }
1491
1492                 ret = copy_tree_blocks(root, root->fs_info->tree_root->node,
1493                                        &metadump, 1);
1494                 if (ret) {
1495                         err = ret;
1496                         goto out;
1497                 }
1498         } else {
1499                 ret = copy_from_extent_tree(&metadump, &path);
1500                 if (ret) {
1501                         err = ret;
1502                         goto out;
1503                 }
1504         }
1505
1506         ret = copy_log_trees(root, &metadump);
1507         if (ret) {
1508                 err = ret;
1509                 goto out;
1510         }
1511
1512         ret = copy_space_cache(root, &metadump, &path);
1513 out:
1514         ret = flush_pending(&metadump, 1);
1515         if (ret) {
1516                 if (!err)
1517                         err = ret;
1518                 error("failed to flush pending data: %d", ret);
1519         }
1520
1521         metadump_destroy(&metadump, num_threads);
1522
1523         btrfs_release_path(&path);
1524         ret = close_ctree(root);
1525         return err ? err : ret;
1526 }
1527
1528 static void update_super_old(u8 *buffer)
1529 {
1530         struct btrfs_super_block *super = (struct btrfs_super_block *)buffer;
1531         struct btrfs_chunk *chunk;
1532         struct btrfs_disk_key *key;
1533         u32 sectorsize = btrfs_super_sectorsize(super);
1534         u64 flags = btrfs_super_flags(super);
1535
1536         flags |= BTRFS_SUPER_FLAG_METADUMP;
1537         btrfs_set_super_flags(super, flags);
1538
1539         key = (struct btrfs_disk_key *)(super->sys_chunk_array);
1540         chunk = (struct btrfs_chunk *)(super->sys_chunk_array +
1541                                        sizeof(struct btrfs_disk_key));
1542
1543         btrfs_set_disk_key_objectid(key, BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1544         btrfs_set_disk_key_type(key, BTRFS_CHUNK_ITEM_KEY);
1545         btrfs_set_disk_key_offset(key, 0);
1546
1547         btrfs_set_stack_chunk_length(chunk, (u64)-1);
1548         btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
1549         btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
1550         btrfs_set_stack_chunk_type(chunk, BTRFS_BLOCK_GROUP_SYSTEM);
1551         btrfs_set_stack_chunk_io_align(chunk, sectorsize);
1552         btrfs_set_stack_chunk_io_width(chunk, sectorsize);
1553         btrfs_set_stack_chunk_sector_size(chunk, sectorsize);
1554         btrfs_set_stack_chunk_num_stripes(chunk, 1);
1555         btrfs_set_stack_chunk_sub_stripes(chunk, 0);
1556         chunk->stripe.devid = super->dev_item.devid;
1557         btrfs_set_stack_stripe_offset(&chunk->stripe, 0);
1558         memcpy(chunk->stripe.dev_uuid, super->dev_item.uuid, BTRFS_UUID_SIZE);
1559         btrfs_set_super_sys_array_size(super, sizeof(*key) + sizeof(*chunk));
1560         csum_block(buffer, BTRFS_SUPER_INFO_SIZE);
1561 }
1562
1563 static int update_super(struct mdrestore_struct *mdres, u8 *buffer)
1564 {
1565         struct btrfs_super_block *super = (struct btrfs_super_block *)buffer;
1566         struct btrfs_chunk *chunk;
1567         struct btrfs_disk_key *disk_key;
1568         struct btrfs_key key;
1569         u64 flags = btrfs_super_flags(super);
1570         u32 new_array_size = 0;
1571         u32 array_size;
1572         u32 cur = 0;
1573         u8 *ptr, *write_ptr;
1574         int old_num_stripes;
1575
1576         write_ptr = ptr = super->sys_chunk_array;
1577         array_size = btrfs_super_sys_array_size(super);
1578
1579         while (cur < array_size) {
1580                 disk_key = (struct btrfs_disk_key *)ptr;
1581                 btrfs_disk_key_to_cpu(&key, disk_key);
1582
1583                 new_array_size += sizeof(*disk_key);
1584                 memmove(write_ptr, ptr, sizeof(*disk_key));
1585
1586                 write_ptr += sizeof(*disk_key);
1587                 ptr += sizeof(*disk_key);
1588                 cur += sizeof(*disk_key);
1589
1590                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1591                         u64 type, physical, physical_dup, size = 0;
1592
1593                         chunk = (struct btrfs_chunk *)ptr;
1594                         old_num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1595                         chunk = (struct btrfs_chunk *)write_ptr;
1596
1597                         memmove(write_ptr, ptr, sizeof(*chunk));
1598                         btrfs_set_stack_chunk_sub_stripes(chunk, 0);
1599                         type = btrfs_stack_chunk_type(chunk);
1600                         if (type & BTRFS_BLOCK_GROUP_DUP) {
1601                                 new_array_size += sizeof(struct btrfs_stripe);
1602                                 write_ptr += sizeof(struct btrfs_stripe);
1603                         } else {
1604                                 btrfs_set_stack_chunk_num_stripes(chunk, 1);
1605                                 btrfs_set_stack_chunk_type(chunk,
1606                                                 BTRFS_BLOCK_GROUP_SYSTEM);
1607                         }
1608                         chunk->stripe.devid = super->dev_item.devid;
1609                         physical = logical_to_physical(mdres, key.offset,
1610                                                        &size, &physical_dup);
1611                         if (size != (u64)-1)
1612                                 btrfs_set_stack_stripe_offset(&chunk->stripe,
1613                                                               physical);
1614                         memcpy(chunk->stripe.dev_uuid, super->dev_item.uuid,
1615                                BTRFS_UUID_SIZE);
1616                         new_array_size += sizeof(*chunk);
1617                 } else {
1618                         error("bogus key in the sys array %d", key.type);
1619                         return -EIO;
1620                 }
1621                 write_ptr += sizeof(*chunk);
1622                 ptr += btrfs_chunk_item_size(old_num_stripes);
1623                 cur += btrfs_chunk_item_size(old_num_stripes);
1624         }
1625
1626         if (mdres->clear_space_cache)
1627                 btrfs_set_super_cache_generation(super, 0);
1628
1629         flags |= BTRFS_SUPER_FLAG_METADUMP_V2;
1630         btrfs_set_super_flags(super, flags);
1631         btrfs_set_super_sys_array_size(super, new_array_size);
1632         btrfs_set_super_num_devices(super, 1);
1633         csum_block(buffer, BTRFS_SUPER_INFO_SIZE);
1634
1635         return 0;
1636 }
1637
1638 static struct extent_buffer *alloc_dummy_eb(u64 bytenr, u32 size)
1639 {
1640         struct extent_buffer *eb;
1641
1642         eb = calloc(1, sizeof(struct extent_buffer) + size);
1643         if (!eb)
1644                 return NULL;
1645
1646         eb->start = bytenr;
1647         eb->len = size;
1648         return eb;
1649 }
1650
1651 static void truncate_item(struct extent_buffer *eb, int slot, u32 new_size)
1652 {
1653         struct btrfs_item *item;
1654         u32 nritems;
1655         u32 old_size;
1656         u32 old_data_start;
1657         u32 size_diff;
1658         u32 data_end;
1659         int i;
1660
1661         old_size = btrfs_item_size_nr(eb, slot);
1662         if (old_size == new_size)
1663                 return;
1664
1665         nritems = btrfs_header_nritems(eb);
1666         data_end = btrfs_item_offset_nr(eb, nritems - 1);
1667
1668         old_data_start = btrfs_item_offset_nr(eb, slot);
1669         size_diff = old_size - new_size;
1670
1671         for (i = slot; i < nritems; i++) {
1672                 u32 ioff;
1673                 item = btrfs_item_nr(i);
1674                 ioff = btrfs_item_offset(eb, item);
1675                 btrfs_set_item_offset(eb, item, ioff + size_diff);
1676         }
1677
1678         memmove_extent_buffer(eb, btrfs_leaf_data(eb) + data_end + size_diff,
1679                               btrfs_leaf_data(eb) + data_end,
1680                               old_data_start + new_size - data_end);
1681         item = btrfs_item_nr(slot);
1682         btrfs_set_item_size(eb, item, new_size);
1683 }
1684
1685 static int fixup_chunk_tree_block(struct mdrestore_struct *mdres,
1686                                   struct async_work *async, u8 *buffer,
1687                                   size_t size)
1688 {
1689         struct extent_buffer *eb;
1690         size_t size_left = size;
1691         u64 bytenr = async->start;
1692         int i;
1693
1694         if (size_left % mdres->nodesize)
1695                 return 0;
1696
1697         eb = alloc_dummy_eb(bytenr, mdres->nodesize);
1698         if (!eb)
1699                 return -ENOMEM;
1700
1701         while (size_left) {
1702                 eb->start = bytenr;
1703                 memcpy(eb->data, buffer, mdres->nodesize);
1704
1705                 if (btrfs_header_bytenr(eb) != bytenr)
1706                         break;
1707                 if (memcmp(mdres->fsid,
1708                            eb->data + offsetof(struct btrfs_header, fsid),
1709                            BTRFS_FSID_SIZE))
1710                         break;
1711
1712                 if (btrfs_header_owner(eb) != BTRFS_CHUNK_TREE_OBJECTID)
1713                         goto next;
1714
1715                 if (btrfs_header_level(eb) != 0)
1716                         goto next;
1717
1718                 for (i = 0; i < btrfs_header_nritems(eb); i++) {
1719                         struct btrfs_chunk *chunk;
1720                         struct btrfs_key key;
1721                         u64 type, physical, physical_dup, size = (u64)-1;
1722
1723                         btrfs_item_key_to_cpu(eb, &key, i);
1724                         if (key.type != BTRFS_CHUNK_ITEM_KEY)
1725                                 continue;
1726
1727                         size = 0;
1728                         physical = logical_to_physical(mdres, key.offset,
1729                                                        &size, &physical_dup);
1730
1731                         if (!physical_dup)
1732                                 truncate_item(eb, i, sizeof(*chunk));
1733                         chunk = btrfs_item_ptr(eb, i, struct btrfs_chunk);
1734
1735
1736                         /* Zero out the RAID profile */
1737                         type = btrfs_chunk_type(eb, chunk);
1738                         type &= (BTRFS_BLOCK_GROUP_DATA |
1739                                  BTRFS_BLOCK_GROUP_SYSTEM |
1740                                  BTRFS_BLOCK_GROUP_METADATA |
1741                                  BTRFS_BLOCK_GROUP_DUP);
1742                         btrfs_set_chunk_type(eb, chunk, type);
1743
1744                         if (!physical_dup)
1745                                 btrfs_set_chunk_num_stripes(eb, chunk, 1);
1746                         btrfs_set_chunk_sub_stripes(eb, chunk, 0);
1747                         btrfs_set_stripe_devid_nr(eb, chunk, 0, mdres->devid);
1748                         if (size != (u64)-1)
1749                                 btrfs_set_stripe_offset_nr(eb, chunk, 0,
1750                                                            physical);
1751                         /* update stripe 2 offset */
1752                         if (physical_dup)
1753                                 btrfs_set_stripe_offset_nr(eb, chunk, 1,
1754                                                            physical_dup);
1755
1756                         write_extent_buffer(eb, mdres->uuid,
1757                                         (unsigned long)btrfs_stripe_dev_uuid_nr(
1758                                                 chunk, 0),
1759                                         BTRFS_UUID_SIZE);
1760                 }
1761                 memcpy(buffer, eb->data, eb->len);
1762                 csum_block(buffer, eb->len);
1763 next:
1764                 size_left -= mdres->nodesize;
1765                 buffer += mdres->nodesize;
1766                 bytenr += mdres->nodesize;
1767         }
1768
1769         free(eb);
1770         return 0;
1771 }
1772
1773 static void write_backup_supers(int fd, u8 *buf)
1774 {
1775         struct btrfs_super_block *super = (struct btrfs_super_block *)buf;
1776         struct stat st;
1777         u64 size;
1778         u64 bytenr;
1779         int i;
1780         int ret;
1781
1782         if (fstat(fd, &st)) {
1783                 error(
1784         "cannot stat restore point, won't be able to write backup supers: %s",
1785                         strerror(errno));
1786                 return;
1787         }
1788
1789         size = btrfs_device_size(fd, &st);
1790
1791         for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1792                 bytenr = btrfs_sb_offset(i);
1793                 if (bytenr + BTRFS_SUPER_INFO_SIZE > size)
1794                         break;
1795                 btrfs_set_super_bytenr(super, bytenr);
1796                 csum_block(buf, BTRFS_SUPER_INFO_SIZE);
1797                 ret = pwrite64(fd, buf, BTRFS_SUPER_INFO_SIZE, bytenr);
1798                 if (ret < BTRFS_SUPER_INFO_SIZE) {
1799                         if (ret < 0)
1800                                 error(
1801                                 "problem writing out backup super block %d: %s",
1802                                                 i, strerror(errno));
1803                         else
1804                                 error("short write writing out backup super block");
1805                         break;
1806                 }
1807         }
1808 }
1809
1810 static void *restore_worker(void *data)
1811 {
1812         struct mdrestore_struct *mdres = (struct mdrestore_struct *)data;
1813         struct async_work *async;
1814         size_t size;
1815         u8 *buffer;
1816         u8 *outbuf;
1817         int outfd;
1818         int ret;
1819         int compress_size = MAX_PENDING_SIZE * 4;
1820
1821         outfd = fileno(mdres->out);
1822         buffer = malloc(compress_size);
1823         if (!buffer) {
1824                 error("not enough memory for restore worker buffer");
1825                 pthread_mutex_lock(&mdres->mutex);
1826                 if (!mdres->error)
1827                         mdres->error = -ENOMEM;
1828                 pthread_mutex_unlock(&mdres->mutex);
1829                 pthread_exit(NULL);
1830         }
1831
1832         while (1) {
1833                 u64 bytenr, physical_dup;
1834                 off_t offset = 0;
1835                 int err = 0;
1836
1837                 pthread_mutex_lock(&mdres->mutex);
1838                 while (!mdres->nodesize || list_empty(&mdres->list)) {
1839                         if (mdres->done) {
1840                                 pthread_mutex_unlock(&mdres->mutex);
1841                                 goto out;
1842                         }
1843                         pthread_cond_wait(&mdres->cond, &mdres->mutex);
1844                 }
1845                 async = list_entry(mdres->list.next, struct async_work, list);
1846                 list_del_init(&async->list);
1847
1848                 if (mdres->compress_method == COMPRESS_ZLIB) {
1849                         size = compress_size; 
1850                         pthread_mutex_unlock(&mdres->mutex);
1851                         ret = uncompress(buffer, (unsigned long *)&size,
1852                                          async->buffer, async->bufsize);
1853                         pthread_mutex_lock(&mdres->mutex);
1854                         if (ret != Z_OK) {
1855                                 error("decompression failed with %d", ret);
1856                                 err = -EIO;
1857                         }
1858                         outbuf = buffer;
1859                 } else {
1860                         outbuf = async->buffer;
1861                         size = async->bufsize;
1862                 }
1863
1864                 if (!mdres->multi_devices) {
1865                         if (async->start == BTRFS_SUPER_INFO_OFFSET) {
1866                                 if (mdres->old_restore) {
1867                                         update_super_old(outbuf);
1868                                 } else {
1869                                         ret = update_super(mdres, outbuf);
1870                                         if (ret)
1871                                                 err = ret;
1872                                 }
1873                         } else if (!mdres->old_restore) {
1874                                 ret = fixup_chunk_tree_block(mdres, async, outbuf, size);
1875                                 if (ret)
1876                                         err = ret;
1877                         }
1878                 }
1879
1880                 if (!mdres->fixup_offset) {
1881                         while (size) {
1882                                 u64 chunk_size = size;
1883                                 physical_dup = 0;
1884                                 if (!mdres->multi_devices && !mdres->old_restore)
1885                                         bytenr = logical_to_physical(mdres,
1886                                                      async->start + offset,
1887                                                      &chunk_size,
1888                                                      &physical_dup);
1889                                 else
1890                                         bytenr = async->start + offset;
1891
1892                                 ret = pwrite64(outfd, outbuf+offset, chunk_size,
1893                                                bytenr);
1894                                 if (ret != chunk_size)
1895                                         goto error;
1896
1897                                 if (physical_dup)
1898                                         ret = pwrite64(outfd, outbuf+offset,
1899                                                        chunk_size,
1900                                                        physical_dup);
1901                                 if (ret != chunk_size)
1902                                         goto error;
1903
1904                                 size -= chunk_size;
1905                                 offset += chunk_size;
1906                                 continue;
1907
1908 error:
1909                                 if (ret < 0) {
1910                                         error("unable to write to device: %s",
1911                                                         strerror(errno));
1912                                         err = errno;
1913                                 } else {
1914                                         error("short write");
1915                                         err = -EIO;
1916                                 }
1917                         }
1918                 } else if (async->start != BTRFS_SUPER_INFO_OFFSET) {
1919                         ret = write_data_to_disk(mdres->info, outbuf, async->start, size, 0);
1920                         if (ret) {
1921                                 error("failed to write data");
1922                                 exit(1);
1923                         }
1924                 }
1925
1926
1927                 /* backup super blocks are already there at fixup_offset stage */
1928                 if (!mdres->multi_devices && async->start == BTRFS_SUPER_INFO_OFFSET)
1929                         write_backup_supers(outfd, outbuf);
1930
1931                 if (err && !mdres->error)
1932                         mdres->error = err;
1933                 mdres->num_items--;
1934                 pthread_mutex_unlock(&mdres->mutex);
1935
1936                 free(async->buffer);
1937                 free(async);
1938         }
1939 out:
1940         free(buffer);
1941         pthread_exit(NULL);
1942 }
1943
1944 static void mdrestore_destroy(struct mdrestore_struct *mdres, int num_threads)
1945 {
1946         struct rb_node *n;
1947         int i;
1948
1949         while ((n = rb_first(&mdres->chunk_tree))) {
1950                 struct fs_chunk *entry;
1951
1952                 entry = rb_entry(n, struct fs_chunk, l);
1953                 rb_erase(n, &mdres->chunk_tree);
1954                 rb_erase(&entry->p, &mdres->physical_tree);
1955                 free(entry);
1956         }
1957         pthread_mutex_lock(&mdres->mutex);
1958         mdres->done = 1;
1959         pthread_cond_broadcast(&mdres->cond);
1960         pthread_mutex_unlock(&mdres->mutex);
1961
1962         for (i = 0; i < num_threads; i++)
1963                 pthread_join(mdres->threads[i], NULL);
1964
1965         pthread_cond_destroy(&mdres->cond);
1966         pthread_mutex_destroy(&mdres->mutex);
1967 }
1968
1969 static int mdrestore_init(struct mdrestore_struct *mdres,
1970                           FILE *in, FILE *out, int old_restore,
1971                           int num_threads, int fixup_offset,
1972                           struct btrfs_fs_info *info, int multi_devices)
1973 {
1974         int i, ret = 0;
1975
1976         memset(mdres, 0, sizeof(*mdres));
1977         pthread_cond_init(&mdres->cond, NULL);
1978         pthread_mutex_init(&mdres->mutex, NULL);
1979         INIT_LIST_HEAD(&mdres->list);
1980         INIT_LIST_HEAD(&mdres->overlapping_chunks);
1981         mdres->in = in;
1982         mdres->out = out;
1983         mdres->old_restore = old_restore;
1984         mdres->chunk_tree.rb_node = NULL;
1985         mdres->fixup_offset = fixup_offset;
1986         mdres->info = info;
1987         mdres->multi_devices = multi_devices;
1988         mdres->clear_space_cache = 0;
1989         mdres->last_physical_offset = 0;
1990         mdres->alloced_chunks = 0;
1991
1992         if (!num_threads)
1993                 return 0;
1994
1995         mdres->num_threads = num_threads;
1996         for (i = 0; i < num_threads; i++) {
1997                 ret = pthread_create(&mdres->threads[i], NULL, restore_worker,
1998                                      mdres);
1999                 if (ret) {
2000                         /* pthread_create returns errno directly */
2001                         ret = -ret;
2002                         break;
2003                 }
2004         }
2005         if (ret)
2006                 mdrestore_destroy(mdres, i + 1);
2007         return ret;
2008 }
2009
2010 static int fill_mdres_info(struct mdrestore_struct *mdres,
2011                            struct async_work *async)
2012 {
2013         struct btrfs_super_block *super;
2014         u8 *buffer = NULL;
2015         u8 *outbuf;
2016         int ret;
2017
2018         /* We've already been initialized */
2019         if (mdres->nodesize)
2020                 return 0;
2021
2022         if (mdres->compress_method == COMPRESS_ZLIB) {
2023                 size_t size = MAX_PENDING_SIZE * 2;
2024
2025                 buffer = malloc(MAX_PENDING_SIZE * 2);
2026                 if (!buffer)
2027                         return -ENOMEM;
2028                 ret = uncompress(buffer, (unsigned long *)&size,
2029                                  async->buffer, async->bufsize);
2030                 if (ret != Z_OK) {
2031                         error("decompression failed with %d", ret);
2032                         free(buffer);
2033                         return -EIO;
2034                 }
2035                 outbuf = buffer;
2036         } else {
2037                 outbuf = async->buffer;
2038         }
2039
2040         super = (struct btrfs_super_block *)outbuf;
2041         mdres->nodesize = btrfs_super_nodesize(super);
2042         memcpy(mdres->fsid, super->fsid, BTRFS_FSID_SIZE);
2043         memcpy(mdres->uuid, super->dev_item.uuid,
2044                        BTRFS_UUID_SIZE);
2045         mdres->devid = le64_to_cpu(super->dev_item.devid);
2046         free(buffer);
2047         return 0;
2048 }
2049
2050 static int add_cluster(struct meta_cluster *cluster,
2051                        struct mdrestore_struct *mdres, u64 *next)
2052 {
2053         struct meta_cluster_item *item;
2054         struct meta_cluster_header *header = &cluster->header;
2055         struct async_work *async;
2056         u64 bytenr;
2057         u32 i, nritems;
2058         int ret;
2059
2060         pthread_mutex_lock(&mdres->mutex);
2061         mdres->compress_method = header->compress;
2062         pthread_mutex_unlock(&mdres->mutex);
2063
2064         bytenr = le64_to_cpu(header->bytenr) + BLOCK_SIZE;
2065         nritems = le32_to_cpu(header->nritems);
2066         for (i = 0; i < nritems; i++) {
2067                 item = &cluster->items[i];
2068                 async = calloc(1, sizeof(*async));
2069                 if (!async) {
2070                         error("not enough memory for async data");
2071                         return -ENOMEM;
2072                 }
2073                 async->start = le64_to_cpu(item->bytenr);
2074                 async->bufsize = le32_to_cpu(item->size);
2075                 async->buffer = malloc(async->bufsize);
2076                 if (!async->buffer) {
2077                         error("not enough memory for async buffer");
2078                         free(async);
2079                         return -ENOMEM;
2080                 }
2081                 ret = fread(async->buffer, async->bufsize, 1, mdres->in);
2082                 if (ret != 1) {
2083                         error("unable to read buffer: %s", strerror(errno));
2084                         free(async->buffer);
2085                         free(async);
2086                         return -EIO;
2087                 }
2088                 bytenr += async->bufsize;
2089
2090                 pthread_mutex_lock(&mdres->mutex);
2091                 if (async->start == BTRFS_SUPER_INFO_OFFSET) {
2092                         ret = fill_mdres_info(mdres, async);
2093                         if (ret) {
2094                                 error("unable to set up restore state");
2095                                 pthread_mutex_unlock(&mdres->mutex);
2096                                 free(async->buffer);
2097                                 free(async);
2098                                 return ret;
2099                         }
2100                 }
2101                 list_add_tail(&async->list, &mdres->list);
2102                 mdres->num_items++;
2103                 pthread_cond_signal(&mdres->cond);
2104                 pthread_mutex_unlock(&mdres->mutex);
2105         }
2106         if (bytenr & BLOCK_MASK) {
2107                 char buffer[BLOCK_MASK];
2108                 size_t size = BLOCK_SIZE - (bytenr & BLOCK_MASK);
2109
2110                 bytenr += size;
2111                 ret = fread(buffer, size, 1, mdres->in);
2112                 if (ret != 1) {
2113                         error("failed to read buffer: %s", strerror(errno));
2114                         return -EIO;
2115                 }
2116         }
2117         *next = bytenr;
2118         return 0;
2119 }
2120
2121 static int wait_for_worker(struct mdrestore_struct *mdres)
2122 {
2123         int ret = 0;
2124
2125         pthread_mutex_lock(&mdres->mutex);
2126         ret = mdres->error;
2127         while (!ret && mdres->num_items > 0) {
2128                 struct timespec ts = {
2129                         .tv_sec = 0,
2130                         .tv_nsec = 10000000,
2131                 };
2132                 pthread_mutex_unlock(&mdres->mutex);
2133                 nanosleep(&ts, NULL);
2134                 pthread_mutex_lock(&mdres->mutex);
2135                 ret = mdres->error;
2136         }
2137         pthread_mutex_unlock(&mdres->mutex);
2138         return ret;
2139 }
2140
2141 static int read_chunk_block(struct mdrestore_struct *mdres, u8 *buffer,
2142                             u64 bytenr, u64 item_bytenr, u32 bufsize,
2143                             u64 cluster_bytenr)
2144 {
2145         struct extent_buffer *eb;
2146         int ret = 0;
2147         int i;
2148
2149         eb = alloc_dummy_eb(bytenr, mdres->nodesize);
2150         if (!eb) {
2151                 ret = -ENOMEM;
2152                 goto out;
2153         }
2154
2155         while (item_bytenr != bytenr) {
2156                 buffer += mdres->nodesize;
2157                 item_bytenr += mdres->nodesize;
2158         }
2159
2160         memcpy(eb->data, buffer, mdres->nodesize);
2161         if (btrfs_header_bytenr(eb) != bytenr) {
2162                 error("eb bytenr does not match found bytenr: %llu != %llu",
2163                                 (unsigned long long)btrfs_header_bytenr(eb),
2164                                 (unsigned long long)bytenr);
2165                 ret = -EIO;
2166                 goto out;
2167         }
2168
2169         if (memcmp(mdres->fsid, eb->data + offsetof(struct btrfs_header, fsid),
2170                    BTRFS_FSID_SIZE)) {
2171                 error("filesystem UUID of eb %llu does not match",
2172                                 (unsigned long long)bytenr);
2173                 ret = -EIO;
2174                 goto out;
2175         }
2176
2177         if (btrfs_header_owner(eb) != BTRFS_CHUNK_TREE_OBJECTID) {
2178                 error("wrong eb %llu owner %llu",
2179                                 (unsigned long long)bytenr,
2180                                 (unsigned long long)btrfs_header_owner(eb));
2181                 ret = -EIO;
2182                 goto out;
2183         }
2184
2185         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2186                 struct btrfs_chunk *chunk;
2187                 struct fs_chunk *fs_chunk;
2188                 struct btrfs_key key;
2189                 u64 type;
2190
2191                 if (btrfs_header_level(eb)) {
2192                         u64 blockptr = btrfs_node_blockptr(eb, i);
2193
2194                         ret = search_for_chunk_blocks(mdres, blockptr,
2195                                                       cluster_bytenr);
2196                         if (ret)
2197                                 break;
2198                         continue;
2199                 }
2200
2201                 /* Yay a leaf!  We loves leafs! */
2202                 btrfs_item_key_to_cpu(eb, &key, i);
2203                 if (key.type != BTRFS_CHUNK_ITEM_KEY)
2204                         continue;
2205
2206                 fs_chunk = malloc(sizeof(struct fs_chunk));
2207                 if (!fs_chunk) {
2208                         error("not enough memory to allocate chunk");
2209                         ret = -ENOMEM;
2210                         break;
2211                 }
2212                 memset(fs_chunk, 0, sizeof(*fs_chunk));
2213                 chunk = btrfs_item_ptr(eb, i, struct btrfs_chunk);
2214
2215                 fs_chunk->logical = key.offset;
2216                 fs_chunk->physical = btrfs_stripe_offset_nr(eb, chunk, 0);
2217                 fs_chunk->bytes = btrfs_chunk_length(eb, chunk);
2218                 INIT_LIST_HEAD(&fs_chunk->list);
2219                 if (tree_search(&mdres->physical_tree, &fs_chunk->p,
2220                                 physical_cmp, 1) != NULL)
2221                         list_add(&fs_chunk->list, &mdres->overlapping_chunks);
2222                 else
2223                         tree_insert(&mdres->physical_tree, &fs_chunk->p,
2224                                     physical_cmp);
2225
2226                 type = btrfs_chunk_type(eb, chunk);
2227                 if (type & BTRFS_BLOCK_GROUP_DUP) {
2228                         fs_chunk->physical_dup =
2229                                         btrfs_stripe_offset_nr(eb, chunk, 1);
2230                 }
2231
2232                 if (fs_chunk->physical_dup + fs_chunk->bytes >
2233                     mdres->last_physical_offset)
2234                         mdres->last_physical_offset = fs_chunk->physical_dup +
2235                                 fs_chunk->bytes;
2236                 else if (fs_chunk->physical + fs_chunk->bytes >
2237                     mdres->last_physical_offset)
2238                         mdres->last_physical_offset = fs_chunk->physical +
2239                                 fs_chunk->bytes;
2240                 mdres->alloced_chunks += fs_chunk->bytes;
2241                 /* in dup case, fs_chunk->bytes should add twice */
2242                 if (fs_chunk->physical_dup)
2243                         mdres->alloced_chunks += fs_chunk->bytes;
2244                 tree_insert(&mdres->chunk_tree, &fs_chunk->l, chunk_cmp);
2245         }
2246 out:
2247         free(eb);
2248         return ret;
2249 }
2250
2251 /* If you have to ask you aren't worthy */
2252 static int search_for_chunk_blocks(struct mdrestore_struct *mdres,
2253                                    u64 search, u64 cluster_bytenr)
2254 {
2255         struct meta_cluster *cluster;
2256         struct meta_cluster_header *header;
2257         struct meta_cluster_item *item;
2258         u64 current_cluster = cluster_bytenr, bytenr;
2259         u64 item_bytenr;
2260         u32 bufsize, nritems, i;
2261         u32 max_size = MAX_PENDING_SIZE * 2;
2262         u8 *buffer, *tmp = NULL;
2263         int ret = 0;
2264
2265         cluster = malloc(BLOCK_SIZE);
2266         if (!cluster) {
2267                 error("not enough memory for cluster");
2268                 return -ENOMEM;
2269         }
2270
2271         buffer = malloc(max_size);
2272         if (!buffer) {
2273                 error("not enough memory for buffer");
2274                 free(cluster);
2275                 return -ENOMEM;
2276         }
2277
2278         if (mdres->compress_method == COMPRESS_ZLIB) {
2279                 tmp = malloc(max_size);
2280                 if (!tmp) {
2281                         error("not enough memory for buffer");
2282                         free(cluster);
2283                         free(buffer);
2284                         return -ENOMEM;
2285                 }
2286         }
2287
2288         bytenr = current_cluster;
2289         while (1) {
2290                 if (fseek(mdres->in, current_cluster, SEEK_SET)) {
2291                         error("seek failed: %s", strerror(errno));
2292                         ret = -EIO;
2293                         break;
2294                 }
2295
2296                 ret = fread(cluster, BLOCK_SIZE, 1, mdres->in);
2297                 if (ret == 0) {
2298                         if (cluster_bytenr != 0) {
2299                                 cluster_bytenr = 0;
2300                                 current_cluster = 0;
2301                                 bytenr = 0;
2302                                 continue;
2303                         }
2304                         error(
2305         "unknown state after reading cluster at %llu, probably corrupted data",
2306                                         cluster_bytenr);
2307                         ret = -EIO;
2308                         break;
2309                 } else if (ret < 0) {
2310                         error("unable to read image at %llu: %s",
2311                                         (unsigned long long)cluster_bytenr,
2312                                         strerror(errno));
2313                         break;
2314                 }
2315                 ret = 0;
2316
2317                 header = &cluster->header;
2318                 if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
2319                     le64_to_cpu(header->bytenr) != current_cluster) {
2320                         error("bad header in metadump image");
2321                         ret = -EIO;
2322                         break;
2323                 }
2324
2325                 bytenr += BLOCK_SIZE;
2326                 nritems = le32_to_cpu(header->nritems);
2327                 for (i = 0; i < nritems; i++) {
2328                         size_t size;
2329
2330                         item = &cluster->items[i];
2331                         bufsize = le32_to_cpu(item->size);
2332                         item_bytenr = le64_to_cpu(item->bytenr);
2333
2334                         if (bufsize > max_size) {
2335                                 error("item %u too big: %u > %u", i, bufsize,
2336                                                 max_size);
2337                                 ret = -EIO;
2338                                 break;
2339                         }
2340
2341                         if (mdres->compress_method == COMPRESS_ZLIB) {
2342                                 ret = fread(tmp, bufsize, 1, mdres->in);
2343                                 if (ret != 1) {
2344                                         error("read error: %s", strerror(errno));
2345                                         ret = -EIO;
2346                                         break;
2347                                 }
2348
2349                                 size = max_size;
2350                                 ret = uncompress(buffer,
2351                                                  (unsigned long *)&size, tmp,
2352                                                  bufsize);
2353                                 if (ret != Z_OK) {
2354                                         error("decompression failed with %d",
2355                                                         ret);
2356                                         ret = -EIO;
2357                                         break;
2358                                 }
2359                         } else {
2360                                 ret = fread(buffer, bufsize, 1, mdres->in);
2361                                 if (ret != 1) {
2362                                         error("read error: %s",
2363                                                         strerror(errno));
2364                                         ret = -EIO;
2365                                         break;
2366                                 }
2367                                 size = bufsize;
2368                         }
2369                         ret = 0;
2370
2371                         if (item_bytenr <= search &&
2372                             item_bytenr + size > search) {
2373                                 ret = read_chunk_block(mdres, buffer, search,
2374                                                        item_bytenr, size,
2375                                                        current_cluster);
2376                                 if (!ret)
2377                                         ret = 1;
2378                                 break;
2379                         }
2380                         bytenr += bufsize;
2381                 }
2382                 if (ret) {
2383                         if (ret > 0)
2384                                 ret = 0;
2385                         break;
2386                 }
2387                 if (bytenr & BLOCK_MASK)
2388                         bytenr += BLOCK_SIZE - (bytenr & BLOCK_MASK);
2389                 current_cluster = bytenr;
2390         }
2391
2392         free(tmp);
2393         free(buffer);
2394         free(cluster);
2395         return ret;
2396 }
2397
2398 static int build_chunk_tree(struct mdrestore_struct *mdres,
2399                             struct meta_cluster *cluster)
2400 {
2401         struct btrfs_super_block *super;
2402         struct meta_cluster_header *header;
2403         struct meta_cluster_item *item = NULL;
2404         u64 chunk_root_bytenr = 0;
2405         u32 i, nritems;
2406         u64 bytenr = 0;
2407         u8 *buffer;
2408         int ret;
2409
2410         /* We can't seek with stdin so don't bother doing this */
2411         if (mdres->in == stdin)
2412                 return 0;
2413
2414         ret = fread(cluster, BLOCK_SIZE, 1, mdres->in);
2415         if (ret <= 0) {
2416                 error("unable to read cluster: %s", strerror(errno));
2417                 return -EIO;
2418         }
2419         ret = 0;
2420
2421         header = &cluster->header;
2422         if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
2423             le64_to_cpu(header->bytenr) != 0) {
2424                 error("bad header in metadump image");
2425                 return -EIO;
2426         }
2427
2428         bytenr += BLOCK_SIZE;
2429         mdres->compress_method = header->compress;
2430         nritems = le32_to_cpu(header->nritems);
2431         for (i = 0; i < nritems; i++) {
2432                 item = &cluster->items[i];
2433
2434                 if (le64_to_cpu(item->bytenr) == BTRFS_SUPER_INFO_OFFSET)
2435                         break;
2436                 bytenr += le32_to_cpu(item->size);
2437                 if (fseek(mdres->in, le32_to_cpu(item->size), SEEK_CUR)) {
2438                         error("seek failed: %s", strerror(errno));
2439                         return -EIO;
2440                 }
2441         }
2442
2443         if (!item || le64_to_cpu(item->bytenr) != BTRFS_SUPER_INFO_OFFSET) {
2444                 error("did not find superblock at %llu",
2445                                 le64_to_cpu(item->bytenr));
2446                 return -EINVAL;
2447         }
2448
2449         buffer = malloc(le32_to_cpu(item->size));
2450         if (!buffer) {
2451                 error("not enough memory to allocate buffer");
2452                 return -ENOMEM;
2453         }
2454
2455         ret = fread(buffer, le32_to_cpu(item->size), 1, mdres->in);
2456         if (ret != 1) {
2457                 error("unable to read buffer: %s", strerror(errno));
2458                 free(buffer);
2459                 return -EIO;
2460         }
2461
2462         if (mdres->compress_method == COMPRESS_ZLIB) {
2463                 size_t size = MAX_PENDING_SIZE * 2;
2464                 u8 *tmp;
2465
2466                 tmp = malloc(MAX_PENDING_SIZE * 2);
2467                 if (!tmp) {
2468                         free(buffer);
2469                         return -ENOMEM;
2470                 }
2471                 ret = uncompress(tmp, (unsigned long *)&size,
2472                                  buffer, le32_to_cpu(item->size));
2473                 if (ret != Z_OK) {
2474                         error("decompression failed with %d", ret);
2475                         free(buffer);
2476                         free(tmp);
2477                         return -EIO;
2478                 }
2479                 free(buffer);
2480                 buffer = tmp;
2481         }
2482
2483         pthread_mutex_lock(&mdres->mutex);
2484         super = (struct btrfs_super_block *)buffer;
2485         chunk_root_bytenr = btrfs_super_chunk_root(super);
2486         mdres->nodesize = btrfs_super_nodesize(super);
2487         memcpy(mdres->fsid, super->fsid, BTRFS_FSID_SIZE);
2488         memcpy(mdres->uuid, super->dev_item.uuid,
2489                        BTRFS_UUID_SIZE);
2490         mdres->devid = le64_to_cpu(super->dev_item.devid);
2491         free(buffer);
2492         pthread_mutex_unlock(&mdres->mutex);
2493
2494         return search_for_chunk_blocks(mdres, chunk_root_bytenr, 0);
2495 }
2496
2497 static int range_contains_super(u64 physical, u64 bytes)
2498 {
2499         u64 super_bytenr;
2500         int i;
2501
2502         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2503                 super_bytenr = btrfs_sb_offset(i);
2504                 if (super_bytenr >= physical &&
2505                     super_bytenr < physical + bytes)
2506                         return 1;
2507         }
2508
2509         return 0;
2510 }
2511
2512 static void remap_overlapping_chunks(struct mdrestore_struct *mdres)
2513 {
2514         struct fs_chunk *fs_chunk;
2515
2516         while (!list_empty(&mdres->overlapping_chunks)) {
2517                 fs_chunk = list_first_entry(&mdres->overlapping_chunks,
2518                                             struct fs_chunk, list);
2519                 list_del_init(&fs_chunk->list);
2520                 if (range_contains_super(fs_chunk->physical,
2521                                          fs_chunk->bytes)) {
2522                         warning(
2523 "remapping a chunk that had a super mirror inside of it, clearing space cache so we don't end up with corruption");
2524                         mdres->clear_space_cache = 1;
2525                 }
2526                 fs_chunk->physical = mdres->last_physical_offset;
2527                 tree_insert(&mdres->physical_tree, &fs_chunk->p, physical_cmp);
2528                 mdres->last_physical_offset += fs_chunk->bytes;
2529         }
2530 }
2531
2532 static int fixup_devices(struct btrfs_fs_info *fs_info,
2533                          struct mdrestore_struct *mdres, off_t dev_size)
2534 {
2535         struct btrfs_trans_handle *trans;
2536         struct btrfs_dev_item *dev_item;
2537         struct btrfs_path path;
2538         struct extent_buffer *leaf;
2539         struct btrfs_root *root = fs_info->chunk_root;
2540         struct btrfs_key key;
2541         u64 devid, cur_devid;
2542         int ret;
2543
2544         trans = btrfs_start_transaction(fs_info->tree_root, 1);
2545         if (IS_ERR(trans)) {
2546                 error("cannot starting transaction %ld", PTR_ERR(trans));
2547                 return PTR_ERR(trans);
2548         }
2549
2550         dev_item = &fs_info->super_copy->dev_item;
2551
2552         devid = btrfs_stack_device_id(dev_item);
2553
2554         btrfs_set_stack_device_total_bytes(dev_item, dev_size);
2555         btrfs_set_stack_device_bytes_used(dev_item, mdres->alloced_chunks);
2556
2557         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2558         key.type = BTRFS_DEV_ITEM_KEY;
2559         key.offset = 0;
2560
2561         btrfs_init_path(&path);
2562
2563 again:
2564         ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
2565         if (ret < 0) {
2566                 error("search failed: %d", ret);
2567                 exit(1);
2568         }
2569
2570         while (1) {
2571                 leaf = path.nodes[0];
2572                 if (path.slots[0] >= btrfs_header_nritems(leaf)) {
2573                         ret = btrfs_next_leaf(root, &path);
2574                         if (ret < 0) {
2575                                 error("cannot go to next leaf %d", ret);
2576                                 exit(1);
2577                         }
2578                         if (ret > 0) {
2579                                 ret = 0;
2580                                 break;
2581                         }
2582                         leaf = path.nodes[0];
2583                 }
2584
2585                 btrfs_item_key_to_cpu(leaf, &key, path.slots[0]);
2586                 if (key.type > BTRFS_DEV_ITEM_KEY)
2587                         break;
2588                 if (key.type != BTRFS_DEV_ITEM_KEY) {
2589                         path.slots[0]++;
2590                         continue;
2591                 }
2592
2593                 dev_item = btrfs_item_ptr(leaf, path.slots[0],
2594                                           struct btrfs_dev_item);
2595                 cur_devid = btrfs_device_id(leaf, dev_item);
2596                 if (devid != cur_devid) {
2597                         ret = btrfs_del_item(trans, root, &path);
2598                         if (ret) {
2599                                 error("cannot delete item: %d", ret);
2600                                 exit(1);
2601                         }
2602                         btrfs_release_path(&path);
2603                         goto again;
2604                 }
2605
2606                 btrfs_set_device_total_bytes(leaf, dev_item, dev_size);
2607                 btrfs_set_device_bytes_used(leaf, dev_item,
2608                                             mdres->alloced_chunks);
2609                 btrfs_mark_buffer_dirty(leaf);
2610                 path.slots[0]++;
2611         }
2612
2613         btrfs_release_path(&path);
2614         ret = btrfs_commit_transaction(trans, fs_info->tree_root);
2615         if (ret) {
2616                 error("unable to commit transaction: %d", ret);
2617                 return ret;
2618         }
2619         return 0;
2620 }
2621
2622 static int restore_metadump(const char *input, FILE *out, int old_restore,
2623                             int num_threads, int fixup_offset,
2624                             const char *target, int multi_devices)
2625 {
2626         struct meta_cluster *cluster = NULL;
2627         struct meta_cluster_header *header;
2628         struct mdrestore_struct mdrestore;
2629         struct btrfs_fs_info *info = NULL;
2630         u64 bytenr = 0;
2631         FILE *in = NULL;
2632         int ret = 0;
2633
2634         if (!strcmp(input, "-")) {
2635                 in = stdin;
2636         } else {
2637                 in = fopen(input, "r");
2638                 if (!in) {
2639                         error("unable to open metadump image: %s",
2640                                         strerror(errno));
2641                         return 1;
2642                 }
2643         }
2644
2645         /* NOTE: open with write mode */
2646         if (fixup_offset) {
2647                 info = open_ctree_fs_info(target, 0, 0, 0,
2648                                           OPEN_CTREE_WRITES |
2649                                           OPEN_CTREE_RESTORE |
2650                                           OPEN_CTREE_PARTIAL);
2651                 if (!info) {
2652                         error("open ctree failed");
2653                         ret = -EIO;
2654                         goto failed_open;
2655                 }
2656         }
2657
2658         cluster = malloc(BLOCK_SIZE);
2659         if (!cluster) {
2660                 error("not enough memory for cluster");
2661                 ret = -ENOMEM;
2662                 goto failed_info;
2663         }
2664
2665         ret = mdrestore_init(&mdrestore, in, out, old_restore, num_threads,
2666                              fixup_offset, info, multi_devices);
2667         if (ret) {
2668                 error("failed to initialize metadata restore state: %d", ret);
2669                 goto failed_cluster;
2670         }
2671
2672         if (!multi_devices && !old_restore) {
2673                 ret = build_chunk_tree(&mdrestore, cluster);
2674                 if (ret)
2675                         goto out;
2676                 if (!list_empty(&mdrestore.overlapping_chunks))
2677                         remap_overlapping_chunks(&mdrestore);
2678         }
2679
2680         if (in != stdin && fseek(in, 0, SEEK_SET)) {
2681                 error("seek failed: %s", strerror(errno));
2682                 goto out;
2683         }
2684
2685         while (!mdrestore.error) {
2686                 ret = fread(cluster, BLOCK_SIZE, 1, in);
2687                 if (!ret)
2688                         break;
2689
2690                 header = &cluster->header;
2691                 if (le64_to_cpu(header->magic) != HEADER_MAGIC ||
2692                     le64_to_cpu(header->bytenr) != bytenr) {
2693                         error("bad header in metadump image");
2694                         ret = -EIO;
2695                         break;
2696                 }
2697                 ret = add_cluster(cluster, &mdrestore, &bytenr);
2698                 if (ret) {
2699                         error("failed to add cluster: %d", ret);
2700                         break;
2701                 }
2702         }
2703         ret = wait_for_worker(&mdrestore);
2704
2705         if (!ret && !multi_devices && !old_restore) {
2706                 struct btrfs_root *root;
2707                 struct stat st;
2708
2709                 root = open_ctree_fd(fileno(out), target, 0,
2710                                           OPEN_CTREE_PARTIAL |
2711                                           OPEN_CTREE_WRITES |
2712                                           OPEN_CTREE_NO_DEVICES);
2713                 if (!root) {
2714                         error("open ctree failed in %s", target);
2715                         ret = -EIO;
2716                         goto out;
2717                 }
2718                 info = root->fs_info;
2719
2720                 if (stat(target, &st)) {
2721                         error("stat %s failed: %s", target, strerror(errno));
2722                         close_ctree(info->chunk_root);
2723                         free(cluster);
2724                         return 1;
2725                 }
2726
2727                 ret = fixup_devices(info, &mdrestore, st.st_size);
2728                 close_ctree(info->chunk_root);
2729                 if (ret)
2730                         goto out;
2731         }
2732 out:
2733         mdrestore_destroy(&mdrestore, num_threads);
2734 failed_cluster:
2735         free(cluster);
2736 failed_info:
2737         if (fixup_offset && info)
2738                 close_ctree(info->chunk_root);
2739 failed_open:
2740         if (in != stdin)
2741                 fclose(in);
2742         return ret;
2743 }
2744
2745 static int update_disk_super_on_device(struct btrfs_fs_info *info,
2746                                        const char *other_dev, u64 cur_devid)
2747 {
2748         struct btrfs_key key;
2749         struct extent_buffer *leaf;
2750         struct btrfs_path path;
2751         struct btrfs_dev_item *dev_item;
2752         struct btrfs_super_block *disk_super;
2753         char dev_uuid[BTRFS_UUID_SIZE];
2754         char fs_uuid[BTRFS_UUID_SIZE];
2755         u64 devid, type, io_align, io_width;
2756         u64 sector_size, total_bytes, bytes_used;
2757         char buf[BTRFS_SUPER_INFO_SIZE];
2758         int fp = -1;
2759         int ret;
2760
2761         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2762         key.type = BTRFS_DEV_ITEM_KEY;
2763         key.offset = cur_devid;
2764
2765         btrfs_init_path(&path);
2766         ret = btrfs_search_slot(NULL, info->chunk_root, &key, &path, 0, 0); 
2767         if (ret) {
2768                 error("search key failed: %d", ret);
2769                 ret = -EIO;
2770                 goto out;
2771         }
2772
2773         leaf = path.nodes[0];
2774         dev_item = btrfs_item_ptr(leaf, path.slots[0],
2775                                   struct btrfs_dev_item);
2776
2777         devid = btrfs_device_id(leaf, dev_item);
2778         if (devid != cur_devid) {
2779                 error("devid mismatch: %llu != %llu",
2780                                 (unsigned long long)devid,
2781                                 (unsigned long long)cur_devid);
2782                 ret = -EIO;
2783                 goto out;
2784         }
2785
2786         type = btrfs_device_type(leaf, dev_item);
2787         io_align = btrfs_device_io_align(leaf, dev_item);
2788         io_width = btrfs_device_io_width(leaf, dev_item);
2789         sector_size = btrfs_device_sector_size(leaf, dev_item);
2790         total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2791         bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2792         read_extent_buffer(leaf, dev_uuid, (unsigned long)btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE);
2793         read_extent_buffer(leaf, fs_uuid, (unsigned long)btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE);
2794
2795         btrfs_release_path(&path);
2796
2797         printf("update disk super on %s devid=%llu\n", other_dev, devid);
2798
2799         /* update other devices' super block */
2800         fp = open(other_dev, O_CREAT | O_RDWR, 0600);
2801         if (fp < 0) {
2802                 error("could not open %s: %s", other_dev, strerror(errno));
2803                 ret = -EIO;
2804                 goto out;
2805         }
2806
2807         memcpy(buf, info->super_copy, BTRFS_SUPER_INFO_SIZE);
2808
2809         disk_super = (struct btrfs_super_block *)buf;
2810         dev_item = &disk_super->dev_item;
2811
2812         btrfs_set_stack_device_type(dev_item, type);
2813         btrfs_set_stack_device_id(dev_item, devid);
2814         btrfs_set_stack_device_total_bytes(dev_item, total_bytes);
2815         btrfs_set_stack_device_bytes_used(dev_item, bytes_used);
2816         btrfs_set_stack_device_io_align(dev_item, io_align);
2817         btrfs_set_stack_device_io_width(dev_item, io_width);
2818         btrfs_set_stack_device_sector_size(dev_item, sector_size);
2819         memcpy(dev_item->uuid, dev_uuid, BTRFS_UUID_SIZE);
2820         memcpy(dev_item->fsid, fs_uuid, BTRFS_UUID_SIZE);
2821         csum_block((u8 *)buf, BTRFS_SUPER_INFO_SIZE);
2822
2823         ret = pwrite64(fp, buf, BTRFS_SUPER_INFO_SIZE, BTRFS_SUPER_INFO_OFFSET);
2824         if (ret != BTRFS_SUPER_INFO_SIZE) {
2825                 if (ret < 0)
2826                         error("cannot write superblock: %s", strerror(ret));
2827                 else
2828                         error("cannot write superblock");
2829                 ret = -EIO;
2830                 goto out;
2831         }
2832
2833         write_backup_supers(fp, (u8 *)buf);
2834
2835 out:
2836         if (fp != -1)
2837                 close(fp);
2838         return ret;
2839 }
2840
2841 static void print_usage(int ret)
2842 {
2843         printf("usage: btrfs-image [options] source target\n");
2844         printf("\t-r      \trestore metadump image\n");
2845         printf("\t-c value\tcompression level (0 ~ 9)\n");
2846         printf("\t-t value\tnumber of threads (1 ~ 32)\n");
2847         printf("\t-o      \tdon't mess with the chunk tree when restoring\n");
2848         printf("\t-s      \tsanitize file names, use once to just use garbage, use twice if you want crc collisions\n");
2849         printf("\t-w      \twalk all trees instead of using extent tree, do this if your extent tree is broken\n");
2850         printf("\t-m       \trestore for multiple devices\n");
2851         printf("\n");
2852         printf("\tIn the dump mode, source is the btrfs device and target is the output file (use '-' for stdout).\n");
2853         printf("\tIn the restore mode, source is the dumped image and target is the btrfs device/file.\n");
2854         exit(ret);
2855 }
2856
2857 int main(int argc, char *argv[])
2858 {
2859         char *source;
2860         char *target;
2861         u64 num_threads = 0;
2862         u64 compress_level = 0;
2863         int create = 1;
2864         int old_restore = 0;
2865         int walk_trees = 0;
2866         int multi_devices = 0;
2867         int ret;
2868         int sanitize = 0;
2869         int dev_cnt = 0;
2870         int usage_error = 0;
2871         FILE *out;
2872
2873         while (1) {
2874                 static const struct option long_options[] = {
2875                         { "help", no_argument, NULL, GETOPT_VAL_HELP},
2876                         { NULL, 0, NULL, 0 }
2877                 };
2878                 int c = getopt_long(argc, argv, "rc:t:oswm", long_options, NULL);
2879                 if (c < 0)
2880                         break;
2881                 switch (c) {
2882                 case 'r':
2883                         create = 0;
2884                         break;
2885                 case 't':
2886                         num_threads = arg_strtou64(optarg);
2887                         if (num_threads > MAX_WORKER_THREADS) {
2888                                 error("number of threads out of range: %llu > %d",
2889                                         (unsigned long long)num_threads,
2890                                         MAX_WORKER_THREADS);
2891                                 return 1;
2892                         }
2893                         break;
2894                 case 'c':
2895                         compress_level = arg_strtou64(optarg);
2896                         if (compress_level > 9) {
2897                                 error("compression level out of range: %llu",
2898                                         (unsigned long long)compress_level);
2899                                 return 1;
2900                         }
2901                         break;
2902                 case 'o':
2903                         old_restore = 1;
2904                         break;
2905                 case 's':
2906                         sanitize++;
2907                         break;
2908                 case 'w':
2909                         walk_trees = 1;
2910                         break;
2911                 case 'm':
2912                         create = 0;
2913                         multi_devices = 1;
2914                         break;
2915                         case GETOPT_VAL_HELP:
2916                 default:
2917                         print_usage(c != GETOPT_VAL_HELP);
2918                 }
2919         }
2920
2921         set_argv0(argv);
2922         if (check_argc_min(argc - optind, 2))
2923                 print_usage(1);
2924
2925         dev_cnt = argc - optind - 1;
2926
2927         if (create) {
2928                 if (old_restore) {
2929                         error(
2930                         "create and restore cannot be used at the same time");
2931                         usage_error++;
2932                 }
2933         } else {
2934                 if (walk_trees || sanitize || compress_level) {
2935                         error(
2936                         "useing -w, -s, -c options for restore makes no sense");
2937                         usage_error++;
2938                 }
2939                 if (multi_devices && dev_cnt < 2) {
2940                         error("not enough devices specified for -m option");
2941                         usage_error++;
2942                 }
2943                 if (!multi_devices && dev_cnt != 1) {
2944                         error("accepts only 1 device without -m option");
2945                         usage_error++;
2946                 }
2947         }
2948
2949         if (usage_error)
2950                 print_usage(1);
2951
2952         source = argv[optind];
2953         target = argv[optind + 1];
2954
2955         if (create && !strcmp(target, "-")) {
2956                 out = stdout;
2957         } else {
2958                 out = fopen(target, "w+");
2959                 if (!out) {
2960                         error("unable to create target file %s", target);
2961                         exit(1);
2962                 }
2963         }
2964
2965         if (compress_level > 0 || create == 0) {
2966                 if (num_threads == 0) {
2967                         long tmp = sysconf(_SC_NPROCESSORS_ONLN);
2968
2969                         if (tmp <= 0)
2970                                 tmp = 1;
2971                         num_threads = tmp;
2972                 }
2973         } else {
2974                 num_threads = 0;
2975         }
2976
2977         if (create) {
2978                 ret = check_mounted(source);
2979                 if (ret < 0) {
2980                         warning("unable to check mount status of: %s",
2981                                         strerror(-ret));
2982                 } else if (ret) {
2983                         warning("%s already mounted, results may be inaccurate",
2984                                         source);
2985                 }
2986
2987                 ret = create_metadump(source, out, num_threads,
2988                                       compress_level, sanitize, walk_trees);
2989         } else {
2990                 ret = restore_metadump(source, out, old_restore, num_threads,
2991                                        0, target, multi_devices);
2992         }
2993         if (ret) {
2994                 error("%s failed: %s", (create) ? "create" : "restore",
2995                        strerror(errno));
2996                 goto out;
2997         }
2998
2999          /* extended support for multiple devices */
3000         if (!create && multi_devices) {
3001                 struct btrfs_fs_info *info;
3002                 u64 total_devs;
3003                 int i;
3004
3005                 info = open_ctree_fs_info(target, 0, 0, 0,
3006                                           OPEN_CTREE_PARTIAL |
3007                                           OPEN_CTREE_RESTORE);
3008                 if (!info) {
3009                         error("open ctree failed at %s", target);
3010                         return 1;
3011                 }
3012
3013                 total_devs = btrfs_super_num_devices(info->super_copy);
3014                 if (total_devs != dev_cnt) {
3015                         error("it needs %llu devices but has only %d",
3016                                 total_devs, dev_cnt);
3017                         close_ctree(info->chunk_root);
3018                         goto out;
3019                 }
3020
3021                 /* update super block on other disks */
3022                 for (i = 2; i <= dev_cnt; i++) {
3023                         ret = update_disk_super_on_device(info,
3024                                         argv[optind + i], (u64)i);
3025                         if (ret) {
3026                                 error("update disk superblock failed devid %d: %d",
3027                                         i, ret);
3028                                 close_ctree(info->chunk_root);
3029                                 exit(1);
3030                         }
3031                 }
3032
3033                 close_ctree(info->chunk_root);
3034
3035                 /* fix metadata block to map correct chunk */
3036                 ret = restore_metadump(source, out, 0, num_threads, 1,
3037                                        target, 1);
3038                 if (ret) {
3039                         error("unable to fixup metadump: %d", ret);
3040                         exit(1);
3041                 }
3042         }
3043 out:
3044         if (out == stdout) {
3045                 fflush(out);
3046         } else {
3047                 fclose(out);
3048                 if (ret && create) {
3049                         int unlink_ret;
3050
3051                         unlink_ret = unlink(target);
3052                         if (unlink_ret)
3053                                 error("unlink output file %s failed: %s",
3054                                                 target, strerror(errno));
3055                 }
3056         }
3057
3058         btrfs_close_all_devices();
3059
3060         return !!ret;
3061 }