x86: Move call64 to the i386 directory
[platform/kernel/u-boot.git] / fs / zfs / zfs.c
1 /*
2  *
3  * ZFS filesystem ported to u-boot by
4  * Jorgen Lundman <lundman at lundman.net>
5  *
6  *      GRUB  --  GRand Unified Bootloader
7  *      Copyright (C) 1999,2000,2001,2002,2003,2004
8  *      Free Software Foundation, Inc.
9  *      Copyright 2004  Sun Microsystems, Inc.
10  *
11  * SPDX-License-Identifier:     GPL-2.0+
12  */
13
14 #include <common.h>
15 #include <malloc.h>
16 #include <linux/stat.h>
17 #include <linux/time.h>
18 #include <linux/ctype.h>
19 #include <asm/byteorder.h>
20 #include "zfs_common.h"
21 #include "div64.h"
22
/*
 * Block device descriptor shared with the rest of the ZFS code.
 * NOTE(review): not referenced in this part of the file; presumably it
 * is set by the caller before any device read - confirm.
 */
struct blk_desc *zfs_dev_desc;
24
25 /*
26  * The zfs plug-in routines for GRUB are:
27  *
28  * zfs_mount() - locates a valid uberblock of the root pool and reads
29  *              in its MOS at the memory address MOS.
30  *
31  * zfs_open() - locates a plain file object by following the MOS
32  *              and places its dnode at the memory address DNODE.
33  *
34  * zfs_read() - read in the data blocks pointed by the DNODE.
35  *
36  */
37
38 #include <zfs/zfs.h>
39 #include <zfs/zio.h>
40 #include <zfs/dnode.h>
41 #include <zfs/uberblock_impl.h>
42 #include <zfs/vdev_impl.h>
43 #include <zfs/zio_checksum.h>
44 #include <zfs/zap_impl.h>
45 #include <zfs/zap_leaf.h>
46 #include <zfs/zfs_znode.h>
47 #include <zfs/dmu.h>
48 #include <zfs/dmu_objset.h>
49 #include <zfs/sa_impl.h>
50 #include <zfs/dsl_dir.h>
51 #include <zfs/dsl_dataset.h>
52
53
/* Name of the "bootfs" pool property. */
#define ZPOOL_PROP_BOOTFS		"bootfs"


/*
 * For nvlist manipulation. (from nvpair.h)
 *
 * NV_ENCODE_* select the nvlist encoding, NV_*_ENDIAN the byte order and
 * DATA_TYPE_* the type of an nvpair value (meanings inferred from the
 * names; confirm against nvpair.h before relying on them).
 */
#define NV_ENCODE_NATIVE	0
#define NV_ENCODE_XDR		1
#define NV_BIG_ENDIAN			0
#define NV_LITTLE_ENDIAN	1
#define DATA_TYPE_UINT64	8
#define DATA_TYPE_STRING	9
#define DATA_TYPE_NVLIST	19
#define DATA_TYPE_NVLIST_ARRAY	20
68
69
/*
 * Macros to get fields in a bp or DVA.
 */

/* Offset of x inside its align-sized block; align must be a power of two. */
#define P2PHASE(x, align)		((x) & ((align) - 1))

/*
 * Convert a DVA byte offset into a sector number: add
 * VDEV_LABEL_START_SIZE, then divide by the (1 << SPA_MINBLOCKSHIFT)-byte
 * sector size.  The argument is parenthesized so that expressions such as
 * "a & b" or "c ? x : y" are evaluated before the addition.
 */
#define DVA_OFFSET_TO_PHYS_SECTOR(offset)					\
	(((offset) + VDEV_LABEL_START_SIZE) >> SPA_MINBLOCKSHIFT)

/*
 * return x rounded down to an align boundary
 * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
 * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
 * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
 * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
 */
#define P2ALIGN(x, align)		((x) & -(align))
85
/*
 * FAT ZAP data structures
 */
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
/* Top n bits of hash; n == 0 is special-cased to avoid a 64-bit shift. */
#define ZAP_HASH_IDX(hash, n)	(((n) == 0) ? 0 : ((hash) >> (64 - (n))))
#define CHAIN_END	0xffff	/* end of the chunk chain */

/*
 * The amount of space within the chunk available for the array is:
 * chunk size - space for type (1) - space for next pointer (2)
 */
#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)

/* bs is the leaf block size expressed as a shift (log2 of the byte size). */
#define ZAP_LEAF_HASH_SHIFT(bs) ((bs) - 5)
#define ZAP_LEAF_HASH_NUMENTRIES(bs) (1 << ZAP_LEAF_HASH_SHIFT(bs))
/*
 * Index into l_hash for hash value h.  NOTE: relies on a variable named
 * "l" (the zap_leaf_phys_t pointer) being in scope at the point of use.
 */
#define LEAF_HASH(bs, h)							\
	((ZAP_LEAF_HASH_NUMENTRIES(bs)-1) &					\
	 ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(bs)-l->l_hdr.lh_prefix_len)))

/*
 * The amount of space available for chunks is:
 * block size (1 << bs) - hash entry size (2) * number of hash
 * entries - header space (2*chunksize)
 */
#define ZAP_LEAF_NUMCHUNKS(bs)						\
	(((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES(bs)) /			\
	 ZAP_LEAF_CHUNKSIZE - 2)

/*
 * The chunks start immediately after the hash table.  The end of the
 * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
 * chunk_t.
 */
#define ZAP_LEAF_CHUNK(l, bs, idx)						\
	(((zap_leaf_chunk_t *)((l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(bs)))[idx])
#define ZAP_LEAF_ENTRY(l, bs, idx) (&ZAP_LEAF_CHUNK(l, bs, idx).l_entry)
122
123
/*
 * Decompression Entry - lzjb
 */
/* Number of bits per byte; only defined here if nothing else provided it. */
#ifndef	NBBY
#define	NBBY	8
#endif
130
131
132
/*
 * Signature shared by all block decompressors: decode the s_len bytes at
 * s_start into the d_len-byte buffer at d_start.  Callers in this file
 * treat any non-zero return value as an error.
 */
typedef int zfs_decomp_func_t(void *s_start, void *d_start,
							  uint32_t s_len, uint32_t d_len);
/* One row of the compression table: printable name plus decoder. */
typedef struct decomp_entry {
	char *name;
	zfs_decomp_func_t *decomp_func;	/* NULL when there is no decoder */
} decomp_entry_t;
139
/* A dnode bundled with the byte order used to decode its fields. */
typedef struct dnode_end {
	dnode_phys_t dn;
	zfs_endian_t endian;
} dnode_end_t;
144
/*
 * State handed to the read routines in this file as "data".
 */
struct zfs_data {
	/* cache for a file block of the currently zfs_open()-ed file */
	char *file_buf;
	uint64_t file_start;
	uint64_t file_end;

	/* XXX: ashift is per vdev, not per pool.  We currently only ever touch
	 * a single vdev, but when/if raid-z or stripes are supported, this
	 * may need revision.
	 */
	uint64_t vdev_ashift;	/* sizes uberblocks via UBERBLOCK_SIZE/COUNT */
	uint64_t label_txg;	/* uberblocks with a smaller ub_txg are rejected */
	uint64_t pool_guid;

	/* cache for a dnode block */
	dnode_phys_t *dnode_buf;
	dnode_phys_t *dnode_mdn;
	uint64_t dnode_start;
	uint64_t dnode_end;
	zfs_endian_t dnode_endian;

	uberblock_t current_uberblock;

	dnode_end_t mos;
	dnode_end_t mdn;
	dnode_end_t dnode;

	/* base sector used when computing uberblock byte offsets */
	uint64_t vdev_phys_sector;

	/*
	 * NOTE(review): presumably the directory-listing callback and its
	 * per-entry info; neither is used in this part of the file - confirm.
	 */
	int (*userhook)(const char *, const struct zfs_dirhook_info *);
	struct zfs_dirhook_info *dirinfo;

};
178
179
180
181
182 static int
183 zlib_decompress(void *s, void *d,
184                                 uint32_t slen, uint32_t dlen)
185 {
186         if (zlib_decompress(s, d, slen, dlen) < 0)
187                 return ZFS_ERR_BAD_FS;
188         return ZFS_ERR_NONE;
189 }
190
/*
 * Compression table, indexed by the compression id stored in a block
 * pointer (see zio_read()).  A NULL decoder means the data cannot be
 * decompressed; zio_read() only accepts that for ZIO_COMPRESS_OFF.
 */
static decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] = {
	{"inherit", NULL},		/* ZIO_COMPRESS_INHERIT */
	{"on", lzjb_decompress},	/* ZIO_COMPRESS_ON */
	{"off", NULL},		/* ZIO_COMPRESS_OFF */
	{"lzjb", lzjb_decompress},	/* ZIO_COMPRESS_LZJB */
	{"empty", NULL},		/* ZIO_COMPRESS_EMPTY */
	{"gzip-1", zlib_decompress},  /* ZIO_COMPRESS_GZIP1 */
	{"gzip-2", zlib_decompress},  /* ZIO_COMPRESS_GZIP2 */
	{"gzip-3", zlib_decompress},  /* ZIO_COMPRESS_GZIP3 */
	{"gzip-4", zlib_decompress},  /* ZIO_COMPRESS_GZIP4 */
	{"gzip-5", zlib_decompress},  /* ZIO_COMPRESS_GZIP5 */
	{"gzip-6", zlib_decompress},  /* ZIO_COMPRESS_GZIP6 */
	{"gzip-7", zlib_decompress},  /* ZIO_COMPRESS_GZIP7 */
	{"gzip-8", zlib_decompress},  /* ZIO_COMPRESS_GZIP8 */
	{"gzip-9", zlib_decompress},  /* ZIO_COMPRESS_GZIP9 */
};
207
208
209
210 static int zio_read_data(blkptr_t *bp, zfs_endian_t endian,
211                                                  void *buf, struct zfs_data *data);
212
213 static int
214 zio_read(blkptr_t *bp, zfs_endian_t endian, void **buf,
215                  size_t *size, struct zfs_data *data);
216
/*
 * Integer base-2 logarithm, rounded down (equivalent to highbit() - 1).
 * Both 0 and 1 yield 0.
 */
static int
zfs_log2(uint64_t num)
{
	int shift;

	for (shift = 0; num > 1; num >>= 1)
		shift++;

	return shift;
}
232
233
234 /* Checksum Functions */
/*
 * Checksum routine for the "off" entry of zio_checksum_table: the data,
 * its size and the byte order are ignored and *zcp is set from four
 * zero words.
 */
static void
zio_checksum_off(const void *buf __attribute__ ((unused)),
				 uint64_t size __attribute__ ((unused)),
				 zfs_endian_t endian __attribute__ ((unused)),
				 zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
243
/* Checksum Table and Values */
/*
 * Indexed by the checksum id taken from a block pointer.  Each row gives
 * the checksum routine (NULL = unsupported, rejected by
 * zio_checksum_verify()), two flags and a printable name.
 * NOTE(review): one of the flags is ci_eck (checksum embedded in a
 * zio_eck_t trailer); which position it occupies depends on
 * zio_checksum_info_t, declared elsewhere - confirm.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{NULL, 0, 0, "inherit"},
	{NULL, 0, 0, "on"},
	{zio_checksum_off, 0, 0, "off"},
	{zio_checksum_SHA256, 1, 1, "label"},
	{zio_checksum_SHA256, 1, 1, "gang_header"},
	{NULL, 0, 0, "zilog"},
	{fletcher_2_endian, 0, 0, "fletcher2"},
	{fletcher_4_endian, 1, 0, "fletcher4"},
	{zio_checksum_SHA256, 1, 0, "SHA256"},
	{NULL, 0, 0, "zilog2"},
};
257
258 /*
259  * zio_checksum_verify: Provides support for checksum verification.
260  *
261  * Fletcher2, Fletcher4, and SHA256 are supported.
262  *
263  */
264 static int
265 zio_checksum_verify(zio_cksum_t zc, uint32_t checksum,
266                                         zfs_endian_t endian, char *buf, int size)
267 {
268         zio_eck_t *zec = (zio_eck_t *) (buf + size) - 1;
269         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
270         zio_cksum_t actual_cksum, expected_cksum;
271
272         if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func == NULL) {
273                 printf("zfs unknown checksum function %d\n", checksum);
274                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
275         }
276
277         if (ci->ci_eck) {
278                 expected_cksum = zec->zec_cksum;
279                 zec->zec_cksum = zc;
280                 ci->ci_func(buf, size, endian, &actual_cksum);
281                 zec->zec_cksum = expected_cksum;
282                 zc = expected_cksum;
283         } else {
284                 ci->ci_func(buf, size, endian, &actual_cksum);
285         }
286
287         if ((actual_cksum.zc_word[0] != zc.zc_word[0])
288                 || (actual_cksum.zc_word[1] != zc.zc_word[1])
289                 || (actual_cksum.zc_word[2] != zc.zc_word[2])
290                 || (actual_cksum.zc_word[3] != zc.zc_word[3])) {
291                 return ZFS_ERR_BAD_FS;
292         }
293
294         return ZFS_ERR_NONE;
295 }
296
297 /*
298  * vdev_uberblock_compare takes two uberblock structures and returns an integer
299  * indicating the more recent of the two.
300  *      Return Value = 1 if ub2 is more recent
301  *      Return Value = -1 if ub1 is more recent
302  * The most recent uberblock is determined using its transaction number and
303  * timestamp.  The uberblock with the highest transaction number is
304  * considered "newer".  If the transaction numbers of the two blocks match, the
305  * timestamps are compared to determine the "newer" of the two.
306  */
307 static int
308 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
309 {
310         zfs_endian_t ub1_endian, ub2_endian;
311         if (zfs_to_cpu64(ub1->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC)
312                 ub1_endian = LITTLE_ENDIAN;
313         else
314                 ub1_endian = BIG_ENDIAN;
315         if (zfs_to_cpu64(ub2->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC)
316                 ub2_endian = LITTLE_ENDIAN;
317         else
318                 ub2_endian = BIG_ENDIAN;
319
320         if (zfs_to_cpu64(ub1->ub_txg, ub1_endian)
321                 < zfs_to_cpu64(ub2->ub_txg, ub2_endian))
322                 return -1;
323         if (zfs_to_cpu64(ub1->ub_txg, ub1_endian)
324                 > zfs_to_cpu64(ub2->ub_txg, ub2_endian))
325                 return 1;
326
327         if (zfs_to_cpu64(ub1->ub_timestamp, ub1_endian)
328                 < zfs_to_cpu64(ub2->ub_timestamp, ub2_endian))
329                 return -1;
330         if (zfs_to_cpu64(ub1->ub_timestamp, ub1_endian)
331                 > zfs_to_cpu64(ub2->ub_timestamp, ub2_endian))
332                 return 1;
333
334         return 0;
335 }
336
337 /*
338  * Three pieces of information are needed to verify an uberblock: the magic
339  * number, the version number, and the checksum.
340  *
341  * Currently Implemented: version number, magic number, label txg
342  * Need to Implement: checksum
343  *
344  */
345 static int
346 uberblock_verify(uberblock_t *uber, int offset, struct zfs_data *data)
347 {
348         int err;
349         zfs_endian_t endian = UNKNOWN_ENDIAN;
350         zio_cksum_t zc;
351
352         if (uber->ub_txg < data->label_txg) {
353                 debug("ignoring partially written label: uber_txg < label_txg %llu %llu\n",
354                           uber->ub_txg, data->label_txg);
355                 return ZFS_ERR_BAD_FS;
356         }
357
358         if (zfs_to_cpu64(uber->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC
359                 && zfs_to_cpu64(uber->ub_version, LITTLE_ENDIAN) > 0
360                 && zfs_to_cpu64(uber->ub_version, LITTLE_ENDIAN) <= SPA_VERSION)
361                 endian = LITTLE_ENDIAN;
362
363         if (zfs_to_cpu64(uber->ub_magic, BIG_ENDIAN) == UBERBLOCK_MAGIC
364                 && zfs_to_cpu64(uber->ub_version, BIG_ENDIAN) > 0
365                 && zfs_to_cpu64(uber->ub_version, BIG_ENDIAN) <= SPA_VERSION)
366                 endian = BIG_ENDIAN;
367
368         if (endian == UNKNOWN_ENDIAN) {
369                 printf("invalid uberblock magic\n");
370                 return ZFS_ERR_BAD_FS;
371         }
372
373         memset(&zc, 0, sizeof(zc));
374         zc.zc_word[0] = cpu_to_zfs64(offset, endian);
375         err = zio_checksum_verify(zc, ZIO_CHECKSUM_LABEL, endian,
376                                                           (char *) uber, UBERBLOCK_SIZE(data->vdev_ashift));
377
378         if (!err) {
379                 /* Check that the data pointed by the rootbp is usable. */
380                 void *osp = NULL;
381                 size_t ospsize;
382                 err = zio_read(&uber->ub_rootbp, endian, &osp, &ospsize, data);
383                 free(osp);
384
385                 if (!err && ospsize < OBJSET_PHYS_SIZE_V14) {
386                         printf("uberblock rootbp points to invalid data\n");
387                         return ZFS_ERR_BAD_FS;
388                 }
389         }
390
391         return err;
392 }
393
394 /*
395  * Find the best uberblock.
396  * Return:
397  *        Success - Pointer to the best uberblock.
398  *        Failure - NULL
399  */
400 static uberblock_t *find_bestub(char *ub_array, struct zfs_data *data)
401 {
402         const uint64_t sector = data->vdev_phys_sector;
403         uberblock_t *ubbest = NULL;
404         uberblock_t *ubnext;
405         unsigned int i, offset, pickedub = 0;
406         int err = ZFS_ERR_NONE;
407
408         const unsigned int UBCOUNT = UBERBLOCK_COUNT(data->vdev_ashift);
409         const uint64_t UBBYTES = UBERBLOCK_SIZE(data->vdev_ashift);
410
411         for (i = 0; i < UBCOUNT; i++) {
412                 ubnext = (uberblock_t *) (i * UBBYTES + ub_array);
413                 offset = (sector << SPA_MINBLOCKSHIFT) + VDEV_PHYS_SIZE + (i * UBBYTES);
414
415                 err = uberblock_verify(ubnext, offset, data);
416                 if (err)
417                         continue;
418
419                 if (ubbest == NULL || vdev_uberblock_compare(ubnext, ubbest) > 0) {
420                         ubbest = ubnext;
421                         pickedub = i;
422                 }
423         }
424
425         if (ubbest)
426                 debug("zfs Found best uberblock at idx %d, txg %llu\n",
427                           pickedub, (unsigned long long) ubbest->ub_txg);
428
429         return ubbest;
430 }
431
432 static inline size_t
433 get_psize(blkptr_t *bp, zfs_endian_t endian)
434 {
435         return (((zfs_to_cpu64((bp)->blk_prop, endian) >> 16) & 0xffff) + 1)
436                         << SPA_MINBLOCKSHIFT;
437 }
438
439 static uint64_t
440 dva_get_offset(dva_t *dva, zfs_endian_t endian)
441 {
442         return zfs_to_cpu64((dva)->dva_word[1],
443                                                          endian) << SPA_MINBLOCKSHIFT;
444 }
445
446 /*
447  * Read a block of data based on the gang block address dva,
448  * and put its data in buf.
449  *
450  */
451 static int
452 zio_read_gang(blkptr_t *bp, zfs_endian_t endian, dva_t *dva, void *buf,
453                           struct zfs_data *data)
454 {
455         zio_gbh_phys_t *zio_gb;
456         uint64_t offset, sector;
457         unsigned i;
458         int err;
459         zio_cksum_t zc;
460
461         memset(&zc, 0, sizeof(zc));
462
463         zio_gb = malloc(SPA_GANGBLOCKSIZE);
464         if (!zio_gb)
465                 return ZFS_ERR_OUT_OF_MEMORY;
466
467         offset = dva_get_offset(dva, endian);
468         sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
469
470         /* read in the gang block header */
471         err = zfs_devread(sector, 0, SPA_GANGBLOCKSIZE, (char *) zio_gb);
472
473         if (err) {
474                 free(zio_gb);
475                 return err;
476         }
477
478         /* XXX */
479         /* self checksuming the gang block header */
480         ZIO_SET_CHECKSUM(&zc, DVA_GET_VDEV(dva),
481                                          dva_get_offset(dva, endian), bp->blk_birth, 0);
482         err = zio_checksum_verify(zc, ZIO_CHECKSUM_GANG_HEADER, endian,
483                                                           (char *) zio_gb, SPA_GANGBLOCKSIZE);
484         if (err) {
485                 free(zio_gb);
486                 return err;
487         }
488
489         endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
490
491         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
492                 if (zio_gb->zg_blkptr[i].blk_birth == 0)
493                         continue;
494
495                 err = zio_read_data(&zio_gb->zg_blkptr[i], endian, buf, data);
496                 if (err) {
497                         free(zio_gb);
498                         return err;
499                 }
500                 buf = (char *) buf + get_psize(&zio_gb->zg_blkptr[i], endian);
501         }
502         free(zio_gb);
503         return ZFS_ERR_NONE;
504 }
505
506 /*
507  * Read in a block of raw data to buf.
508  */
509 static int
510 zio_read_data(blkptr_t *bp, zfs_endian_t endian, void *buf,
511                           struct zfs_data *data)
512 {
513         int i, psize;
514         int err = ZFS_ERR_NONE;
515
516         psize = get_psize(bp, endian);
517
518         /* pick a good dva from the block pointer */
519         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
520                 uint64_t offset, sector;
521
522                 if (bp->blk_dva[i].dva_word[0] == 0 && bp->blk_dva[i].dva_word[1] == 0)
523                         continue;
524
525                 if ((zfs_to_cpu64(bp->blk_dva[i].dva_word[1], endian)>>63) & 1) {
526                         err = zio_read_gang(bp, endian, &bp->blk_dva[i], buf, data);
527                 } else {
528                         /* read in a data block */
529                         offset = dva_get_offset(&bp->blk_dva[i], endian);
530                         sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
531
532                         err = zfs_devread(sector, 0, psize, buf);
533                 }
534
535                 if (!err) {
536                         /*Check the underlying checksum before we rule this DVA as "good"*/
537                         uint32_t checkalgo = (zfs_to_cpu64((bp)->blk_prop, endian) >> 40) & 0xff;
538
539                         err = zio_checksum_verify(bp->blk_cksum, checkalgo, endian, buf, psize);
540                         if (!err)
541                                 return ZFS_ERR_NONE;
542                 }
543
544                 /* If read failed or checksum bad, reset the error.      Hopefully we've got some more DVA's to try.*/
545         }
546
547         if (!err) {
548                 printf("couldn't find a valid DVA\n");
549                 err = ZFS_ERR_BAD_FS;
550         }
551
552         return err;
553 }
554
555 /*
556  * Read in a block of data, verify its checksum, decompress if needed,
557  * and put the uncompressed data in buf.
558  */
559 static int
560 zio_read(blkptr_t *bp, zfs_endian_t endian, void **buf,
561                  size_t *size, struct zfs_data *data)
562 {
563         size_t lsize, psize;
564         unsigned int comp;
565         char *compbuf = NULL;
566         int err;
567
568         *buf = NULL;
569
570         comp = (zfs_to_cpu64((bp)->blk_prop, endian)>>32) & 0xff;
571         lsize = (BP_IS_HOLE(bp) ? 0 :
572                          (((zfs_to_cpu64((bp)->blk_prop, endian) & 0xffff) + 1)
573                           << SPA_MINBLOCKSHIFT));
574         psize = get_psize(bp, endian);
575
576         if (size)
577                 *size = lsize;
578
579         if (comp >= ZIO_COMPRESS_FUNCTIONS) {
580                 printf("compression algorithm %u not supported\n", (unsigned int) comp);
581                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
582         }
583
584         if (comp != ZIO_COMPRESS_OFF && decomp_table[comp].decomp_func == NULL) {
585                 printf("compression algorithm %s not supported\n", decomp_table[comp].name);
586                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
587         }
588
589         if (comp != ZIO_COMPRESS_OFF) {
590                 compbuf = malloc(psize);
591                 if (!compbuf)
592                         return ZFS_ERR_OUT_OF_MEMORY;
593         } else {
594                 compbuf = *buf = malloc(lsize);
595         }
596
597         err = zio_read_data(bp, endian, compbuf, data);
598         if (err) {
599                 free(compbuf);
600                 *buf = NULL;
601                 return err;
602         }
603
604         if (comp != ZIO_COMPRESS_OFF) {
605                 *buf = malloc(lsize);
606                 if (!*buf) {
607                         free(compbuf);
608                         return ZFS_ERR_OUT_OF_MEMORY;
609                 }
610
611                 err = decomp_table[comp].decomp_func(compbuf, *buf, psize, lsize);
612                 free(compbuf);
613                 if (err) {
614                         free(*buf);
615                         *buf = NULL;
616                         return err;
617                 }
618         }
619
620         return ZFS_ERR_NONE;
621 }
622
623 /*
624  * Get the block from a block id.
625  * push the block onto the stack.
626  *
627  */
628 static int
629 dmu_read(dnode_end_t *dn, uint64_t blkid, void **buf,
630                  zfs_endian_t *endian_out, struct zfs_data *data)
631 {
632         int idx, level;
633         blkptr_t *bp_array = dn->dn.dn_blkptr;
634         int epbs = dn->dn.dn_indblkshift - SPA_BLKPTRSHIFT;
635         blkptr_t *bp;
636         void *tmpbuf = 0;
637         zfs_endian_t endian;
638         int err = ZFS_ERR_NONE;
639
640         bp = malloc(sizeof(blkptr_t));
641         if (!bp)
642                 return ZFS_ERR_OUT_OF_MEMORY;
643
644         endian = dn->endian;
645         for (level = dn->dn.dn_nlevels - 1; level >= 0; level--) {
646                 idx = (blkid >> (epbs * level)) & ((1 << epbs) - 1);
647                 *bp = bp_array[idx];
648                 if (bp_array != dn->dn.dn_blkptr) {
649                         free(bp_array);
650                         bp_array = 0;
651                 }
652
653                 if (BP_IS_HOLE(bp)) {
654                         size_t size = zfs_to_cpu16(dn->dn.dn_datablkszsec,
655                                                                                         dn->endian)
656                                 << SPA_MINBLOCKSHIFT;
657                         *buf = malloc(size);
658                         if (*buf) {
659                                 err = ZFS_ERR_OUT_OF_MEMORY;
660                                 break;
661                         }
662                         memset(*buf, 0, size);
663                         endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
664                         break;
665                 }
666                 if (level == 0) {
667                         err = zio_read(bp, endian, buf, 0, data);
668                         endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
669                         break;
670                 }
671                 err = zio_read(bp, endian, &tmpbuf, 0, data);
672                 endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
673                 if (err)
674                         break;
675                 bp_array = tmpbuf;
676         }
677         if (bp_array != dn->dn.dn_blkptr)
678                 free(bp_array);
679         if (endian_out)
680                 *endian_out = endian;
681
682         free(bp);
683         return err;
684 }
685
686 /*
687  * mzap_lookup: Looks up property described by "name" and returns the value
688  * in "value".
689  */
690 static int
691 mzap_lookup(mzap_phys_t *zapobj, zfs_endian_t endian,
692                         int objsize, char *name, uint64_t * value)
693 {
694         int i, chunks;
695         mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
696
697         chunks = objsize / MZAP_ENT_LEN - 1;
698         for (i = 0; i < chunks; i++) {
699                 if (strcmp(mzap_ent[i].mze_name, name) == 0) {
700                         *value = zfs_to_cpu64(mzap_ent[i].mze_value, endian);
701                         return ZFS_ERR_NONE;
702                 }
703         }
704
705         printf("couldn't find '%s'\n", name);
706         return ZFS_ERR_FILE_NOT_FOUND;
707 }
708
709 static int
710 mzap_iterate(mzap_phys_t *zapobj, zfs_endian_t endian, int objsize,
711                          int (*hook)(const char *name,
712                                                  uint64_t val,
713                                                  struct zfs_data *data),
714                          struct zfs_data *data)
715 {
716         int i, chunks;
717         mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
718
719         chunks = objsize / MZAP_ENT_LEN - 1;
720         for (i = 0; i < chunks; i++) {
721                 if (hook(mzap_ent[i].mze_name,
722                                  zfs_to_cpu64(mzap_ent[i].mze_value, endian),
723                                  data))
724                         return 1;
725         }
726
727         return 0;
728 }
729
730 static uint64_t
731 zap_hash(uint64_t salt, const char *name)
732 {
733         static uint64_t table[256];
734         const uint8_t *cp;
735         uint8_t c;
736         uint64_t crc = salt;
737
738         if (table[128] == 0) {
739                 uint64_t *ct = NULL;
740                 int i, j;
741                 for (i = 0; i < 256; i++) {
742                         for (ct = table + i, *ct = i, j = 8; j > 0; j--)
743                                 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
744                 }
745         }
746
747         for (cp = (const uint8_t *) name; (c = *cp) != '\0'; cp++)
748                 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
749
750         /*
751          * Only use 28 bits, since we need 4 bits in the cookie for the
752          * collision differentiator.  We MUST use the high bits, since
753          * those are the onces that we first pay attention to when
754          * chosing the bucket.
755          */
756         crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
757
758         return crc;
759 }
760
761 /*
762  * Only to be used on 8-bit arrays.
763  * array_len is actual len in bytes (not encoded le_value_length).
764  * buf is null-terminated.
765  */
766 /* XXX */
767 static int
768 zap_leaf_array_equal(zap_leaf_phys_t *l, zfs_endian_t endian,
769                                          int blksft, int chunk, int array_len, const char *buf)
770 {
771         int bseen = 0;
772
773         while (bseen < array_len) {
774                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
775                 int toread = min(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
776
777                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
778                         return 0;
779
780                 if (memcmp(la->la_array, buf + bseen, toread) != 0)
781                         break;
782                 chunk = zfs_to_cpu16(la->la_next, endian);
783                 bseen += toread;
784         }
785         return (bseen == array_len);
786 }
787
788 /* XXX */
789 static int
790 zap_leaf_array_get(zap_leaf_phys_t *l, zfs_endian_t endian, int blksft,
791                                    int chunk, int array_len, char *buf)
792 {
793         int bseen = 0;
794
795         while (bseen < array_len) {
796                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
797                 int toread = min(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
798
799                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
800                         /* Don't use errno because this error is to be ignored.  */
801                         return ZFS_ERR_BAD_FS;
802
803                 memcpy(buf + bseen, la->la_array,  toread);
804                 chunk = zfs_to_cpu16(la->la_next, endian);
805                 bseen += toread;
806         }
807         return ZFS_ERR_NONE;
808 }
809
810
811 /*
812  * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the
813  * value for the property "name".
814  *
815  */
816 /* XXX */
817 static int
818 zap_leaf_lookup(zap_leaf_phys_t *l, zfs_endian_t endian,
819                                 int blksft, uint64_t h,
820                                 const char *name, uint64_t *value)
821 {
822         uint16_t chunk;
823         struct zap_leaf_entry *le;
824
825         /* Verify if this is a valid leaf block */
826         if (zfs_to_cpu64(l->l_hdr.lh_block_type, endian) != ZBT_LEAF) {
827                 printf("invalid leaf type\n");
828                 return ZFS_ERR_BAD_FS;
829         }
830         if (zfs_to_cpu32(l->l_hdr.lh_magic, endian) != ZAP_LEAF_MAGIC) {
831                 printf("invalid leaf magic\n");
832                 return ZFS_ERR_BAD_FS;
833         }
834
835         for (chunk = zfs_to_cpu16(l->l_hash[LEAF_HASH(blksft, h)], endian);
836                  chunk != CHAIN_END; chunk = le->le_next) {
837
838                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft)) {
839                         printf("invalid chunk number\n");
840                         return ZFS_ERR_BAD_FS;
841                 }
842
843                 le = ZAP_LEAF_ENTRY(l, blksft, chunk);
844
845                 /* Verify the chunk entry */
846                 if (le->le_type != ZAP_CHUNK_ENTRY) {
847                         printf("invalid chunk entry\n");
848                         return ZFS_ERR_BAD_FS;
849                 }
850
851                 if (zfs_to_cpu64(le->le_hash, endian) != h)
852                         continue;
853
854                 if (zap_leaf_array_equal(l, endian, blksft,
855                                                                  zfs_to_cpu16(le->le_name_chunk, endian),
856                                                                  zfs_to_cpu16(le->le_name_length, endian),
857                                                                  name)) {
858                         struct zap_leaf_array *la;
859
860                         if (le->le_int_size != 8 || le->le_value_length != 1) {
861                                 printf("invalid leaf chunk entry\n");
862                                 return ZFS_ERR_BAD_FS;
863                         }
864                         /* get the uint64_t property value */
865                         la = &ZAP_LEAF_CHUNK(l, blksft, le->le_value_chunk).l_array;
866
867                         *value = be64_to_cpu(la->la_array64);
868
869                         return ZFS_ERR_NONE;
870                 }
871         }
872
873         printf("couldn't find '%s'\n", name);
874         return ZFS_ERR_FILE_NOT_FOUND;
875 }
876
877
878 /* Verify if this is a fat zap header block */
879 static int
880 zap_verify(zap_phys_t *zap)
881 {
882         if (zap->zap_magic != (uint64_t) ZAP_MAGIC) {
883                 printf("bad ZAP magic\n");
884                 return ZFS_ERR_BAD_FS;
885         }
886
887         if (zap->zap_flags != 0) {
888                 printf("bad ZAP flags\n");
889                 return ZFS_ERR_BAD_FS;
890         }
891
892         if (zap->zap_salt == 0) {
893                 printf("bad ZAP salt\n");
894                 return ZFS_ERR_BAD_FS;
895         }
896
897         return ZFS_ERR_NONE;
898 }
899
900 /*
901  * Fat ZAP lookup
902  *
903  */
904 /* XXX */
905 static int
906 fzap_lookup(dnode_end_t *zap_dnode, zap_phys_t *zap,
907                         char *name, uint64_t *value, struct zfs_data *data)
908 {
909         void *l;
910         uint64_t hash, idx, blkid;
911         int blksft = zfs_log2(zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
912                                                                                         zap_dnode->endian) << DNODE_SHIFT);
913         int err;
914         zfs_endian_t leafendian;
915
916         err = zap_verify(zap);
917         if (err)
918                 return err;
919
920         hash = zap_hash(zap->zap_salt, name);
921
922         /* get block id from index */
923         if (zap->zap_ptrtbl.zt_numblks != 0) {
924                 printf("external pointer tables not supported\n");
925                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
926         }
927         idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
928         blkid = ((uint64_t *) zap)[idx + (1 << (blksft - 3 - 1))];
929
930         /* Get the leaf block */
931         if ((1U << blksft) < sizeof(zap_leaf_phys_t)) {
932                 printf("ZAP leaf is too small\n");
933                 return ZFS_ERR_BAD_FS;
934         }
935         err = dmu_read(zap_dnode, blkid, &l, &leafendian, data);
936         if (err)
937                 return err;
938
939         err = zap_leaf_lookup(l, leafendian, blksft, hash, name, value);
940         free(l);
941         return err;
942 }
943
944 /* XXX */
945 static int
946 fzap_iterate(dnode_end_t *zap_dnode, zap_phys_t *zap,
947                          int (*hook)(const char *name,
948                                                  uint64_t val,
949                                                  struct zfs_data *data),
950                          struct zfs_data *data)
951 {
952         zap_leaf_phys_t *l;
953         void *l_in;
954         uint64_t idx, blkid;
955         uint16_t chunk;
956         int blksft = zfs_log2(zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
957                                                                                         zap_dnode->endian) << DNODE_SHIFT);
958         int err;
959         zfs_endian_t endian;
960
961         if (zap_verify(zap))
962                 return 0;
963
964         /* get block id from index */
965         if (zap->zap_ptrtbl.zt_numblks != 0) {
966                 printf("external pointer tables not supported\n");
967                 return 0;
968         }
969         /* Get the leaf block */
970         if ((1U << blksft) < sizeof(zap_leaf_phys_t)) {
971                 printf("ZAP leaf is too small\n");
972                 return 0;
973         }
974         for (idx = 0; idx < zap->zap_ptrtbl.zt_numblks; idx++) {
975                 blkid = ((uint64_t *) zap)[idx + (1 << (blksft - 3 - 1))];
976
977                 err = dmu_read(zap_dnode, blkid, &l_in, &endian, data);
978                 l = l_in;
979                 if (err)
980                         continue;
981
982                 /* Verify if this is a valid leaf block */
983                 if (zfs_to_cpu64(l->l_hdr.lh_block_type, endian) != ZBT_LEAF) {
984                         free(l);
985                         continue;
986                 }
987                 if (zfs_to_cpu32(l->l_hdr.lh_magic, endian) != ZAP_LEAF_MAGIC) {
988                         free(l);
989                         continue;
990                 }
991
992                 for (chunk = 0; chunk < ZAP_LEAF_NUMCHUNKS(blksft); chunk++) {
993                         char *buf;
994                         struct zap_leaf_array *la;
995                         struct zap_leaf_entry *le;
996                         uint64_t val;
997                         le = ZAP_LEAF_ENTRY(l, blksft, chunk);
998
999                         /* Verify the chunk entry */
1000                         if (le->le_type != ZAP_CHUNK_ENTRY)
1001                                 continue;
1002
1003                         buf = malloc(zfs_to_cpu16(le->le_name_length, endian)
1004                                                  + 1);
1005                         if (zap_leaf_array_get(l, endian, blksft, le->le_name_chunk,
1006                                                                    le->le_name_length, buf)) {
1007                                 free(buf);
1008                                 continue;
1009                         }
1010                         buf[le->le_name_length] = 0;
1011
1012                         if (le->le_int_size != 8
1013                                 || zfs_to_cpu16(le->le_value_length, endian) != 1)
1014                                 continue;
1015
1016                         /* get the uint64_t property value */
1017                         la = &ZAP_LEAF_CHUNK(l, blksft, le->le_value_chunk).l_array;
1018                         val = be64_to_cpu(la->la_array64);
1019                         if (hook(buf, val, data))
1020                                 return 1;
1021                         free(buf);
1022                 }
1023         }
1024         return 0;
1025 }
1026
1027
1028 /*
1029  * Read in the data of a zap object and find the value for a matching
1030  * property name.
1031  *
1032  */
1033 static int
1034 zap_lookup(dnode_end_t *zap_dnode, char *name, uint64_t *val,
1035                    struct zfs_data *data)
1036 {
1037         uint64_t block_type;
1038         int size;
1039         void *zapbuf;
1040         int err;
1041         zfs_endian_t endian;
1042
1043         /* Read in the first block of the zap object data. */
1044         size = zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
1045                                                          zap_dnode->endian) << SPA_MINBLOCKSHIFT;
1046         err = dmu_read(zap_dnode, 0, &zapbuf, &endian, data);
1047         if (err)
1048                 return err;
1049         block_type = zfs_to_cpu64(*((uint64_t *) zapbuf), endian);
1050
1051         if (block_type == ZBT_MICRO) {
1052                 err = (mzap_lookup(zapbuf, endian, size, name, val));
1053                 free(zapbuf);
1054                 return err;
1055         } else if (block_type == ZBT_HEADER) {
1056                 /* this is a fat zap */
1057                 err = (fzap_lookup(zap_dnode, zapbuf, name, val, data));
1058                 free(zapbuf);
1059                 return err;
1060         }
1061
1062         printf("unknown ZAP type\n");
1063         free(zapbuf);
1064         return ZFS_ERR_BAD_FS;
1065 }
1066
1067 static int
1068 zap_iterate(dnode_end_t *zap_dnode,
1069                         int (*hook)(const char *name, uint64_t val,
1070                                                 struct zfs_data *data),
1071                         struct zfs_data *data)
1072 {
1073         uint64_t block_type;
1074         int size;
1075         void *zapbuf;
1076         int err;
1077         int ret;
1078         zfs_endian_t endian;
1079
1080         /* Read in the first block of the zap object data. */
1081         size = zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec, zap_dnode->endian) << SPA_MINBLOCKSHIFT;
1082         err = dmu_read(zap_dnode, 0, &zapbuf, &endian, data);
1083         if (err)
1084                 return 0;
1085         block_type = zfs_to_cpu64(*((uint64_t *) zapbuf), endian);
1086
1087         if (block_type == ZBT_MICRO) {
1088                 ret = mzap_iterate(zapbuf, endian, size, hook, data);
1089                 free(zapbuf);
1090                 return ret;
1091         } else if (block_type == ZBT_HEADER) {
1092                 /* this is a fat zap */
1093                 ret = fzap_iterate(zap_dnode, zapbuf, hook, data);
1094                 free(zapbuf);
1095                 return ret;
1096         }
1097         printf("unknown ZAP type\n");
1098         free(zapbuf);
1099         return 0;
1100 }
1101
1102
1103 /*
1104  * Get the dnode of an object number from the metadnode of an object set.
1105  *
1106  * Input
1107  *      mdn - metadnode to get the object dnode
1108  *      objnum - object number for the object dnode
1109  *      buf - data buffer that holds the returning dnode
1110  */
1111 static int
1112 dnode_get(dnode_end_t *mdn, uint64_t objnum, uint8_t type,
1113                   dnode_end_t *buf, struct zfs_data *data)
1114 {
1115         uint64_t blkid, blksz;  /* the block id this object dnode is in */
1116         int epbs;                       /* shift of number of dnodes in a block */
1117         int idx;                        /* index within a block */
1118         void *dnbuf;
1119         int err;
1120         zfs_endian_t endian;
1121
1122         blksz = zfs_to_cpu16(mdn->dn.dn_datablkszsec,
1123                                                           mdn->endian) << SPA_MINBLOCKSHIFT;
1124
1125         epbs = zfs_log2(blksz) - DNODE_SHIFT;
1126         blkid = objnum >> epbs;
1127         idx = objnum & ((1 << epbs) - 1);
1128
1129         if (data->dnode_buf != NULL && memcmp(data->dnode_mdn, mdn,
1130                                                                                   sizeof(*mdn)) == 0
1131                 && objnum >= data->dnode_start && objnum < data->dnode_end) {
1132                 memmove(&(buf->dn), &(data->dnode_buf)[idx], DNODE_SIZE);
1133                 buf->endian = data->dnode_endian;
1134                 if (type && buf->dn.dn_type != type)  {
1135                         printf("incorrect dnode type: %02X != %02x\n", buf->dn.dn_type, type);
1136                         return ZFS_ERR_BAD_FS;
1137                 }
1138                 return ZFS_ERR_NONE;
1139         }
1140
1141         err = dmu_read(mdn, blkid, &dnbuf, &endian, data);
1142         if (err)
1143                 return err;
1144
1145         free(data->dnode_buf);
1146         free(data->dnode_mdn);
1147         data->dnode_mdn = malloc(sizeof(*mdn));
1148         if (!data->dnode_mdn) {
1149                 data->dnode_buf = 0;
1150         } else {
1151                 memcpy(data->dnode_mdn, mdn, sizeof(*mdn));
1152                 data->dnode_buf = dnbuf;
1153                 data->dnode_start = blkid << epbs;
1154                 data->dnode_end = (blkid + 1) << epbs;
1155                 data->dnode_endian = endian;
1156         }
1157
1158         memmove(&(buf->dn), (dnode_phys_t *) dnbuf + idx, DNODE_SIZE);
1159         buf->endian = endian;
1160         if (type && buf->dn.dn_type != type) {
1161                 printf("incorrect dnode type\n");
1162                 return ZFS_ERR_BAD_FS;
1163         }
1164
1165         return ZFS_ERR_NONE;
1166 }
1167
1168 /*
1169  * Get the file dnode for a given file name where mdn is the meta dnode
1170  * for this ZFS object set. When found, place the file dnode in dn.
1171  * The 'path' argument will be mangled.
1172  *
1173  */
1174 static int
1175 dnode_get_path(dnode_end_t *mdn, const char *path_in, dnode_end_t *dn,
1176                            struct zfs_data *data)
1177 {
1178         uint64_t objnum, version;
1179         char *cname, ch;
1180         int err = ZFS_ERR_NONE;
1181         char *path, *path_buf;
1182         struct dnode_chain {
1183                 struct dnode_chain *next;
1184                 dnode_end_t dn;
1185         };
1186         struct dnode_chain *dnode_path = 0, *dn_new, *root;
1187
1188         dn_new = malloc(sizeof(*dn_new));
1189         if (!dn_new)
1190                 return ZFS_ERR_OUT_OF_MEMORY;
1191         dn_new->next = 0;
1192         dnode_path = root = dn_new;
1193
1194         err = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
1195                                         &(dnode_path->dn), data);
1196         if (err) {
1197                 free(dn_new);
1198                 return err;
1199         }
1200
1201         err = zap_lookup(&(dnode_path->dn), ZPL_VERSION_STR, &version, data);
1202         if (err) {
1203                 free(dn_new);
1204                 return err;
1205         }
1206         if (version > ZPL_VERSION) {
1207                 free(dn_new);
1208                 printf("too new ZPL version\n");
1209                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
1210         }
1211
1212         err = zap_lookup(&(dnode_path->dn), ZFS_ROOT_OBJ, &objnum, data);
1213         if (err) {
1214                 free(dn_new);
1215                 return err;
1216         }
1217
1218         err = dnode_get(mdn, objnum, 0, &(dnode_path->dn), data);
1219         if (err) {
1220                 free(dn_new);
1221                 return err;
1222         }
1223
1224         path = path_buf = strdup(path_in);
1225         if (!path_buf) {
1226                 free(dn_new);
1227                 return ZFS_ERR_OUT_OF_MEMORY;
1228         }
1229
1230         while (1) {
1231                 /* skip leading slashes */
1232                 while (*path == '/')
1233                         path++;
1234                 if (!*path)
1235                         break;
1236                 /* get the next component name */
1237                 cname = path;
1238                 while (*path && *path != '/')
1239                         path++;
1240                 /* Skip dot.  */
1241                 if (cname + 1 == path && cname[0] == '.')
1242                         continue;
1243                 /* Handle double dot.  */
1244                 if (cname + 2 == path && cname[0] == '.' && cname[1] == '.')  {
1245                         if (dn_new->next) {
1246                                 dn_new = dnode_path;
1247                                 dnode_path = dn_new->next;
1248                                 free(dn_new);
1249                         } else {
1250                                 printf("can't resolve ..\n");
1251                                 err = ZFS_ERR_FILE_NOT_FOUND;
1252                                 break;
1253                         }
1254                         continue;
1255                 }
1256
1257                 ch = *path;
1258                 *path = 0;              /* ensure null termination */
1259
1260                 if (dnode_path->dn.dn.dn_type != DMU_OT_DIRECTORY_CONTENTS) {
1261                         free(path_buf);
1262                         printf("not a directory\n");
1263                         return ZFS_ERR_BAD_FILE_TYPE;
1264                 }
1265                 err = zap_lookup(&(dnode_path->dn), cname, &objnum, data);
1266                 if (err)
1267                         break;
1268
1269                 dn_new = malloc(sizeof(*dn_new));
1270                 if (!dn_new) {
1271                         err = ZFS_ERR_OUT_OF_MEMORY;
1272                         break;
1273                 }
1274                 dn_new->next = dnode_path;
1275                 dnode_path = dn_new;
1276
1277                 objnum = ZFS_DIRENT_OBJ(objnum);
1278                 err = dnode_get(mdn, objnum, 0, &(dnode_path->dn), data);
1279                 if (err)
1280                         break;
1281
1282                 *path = ch;
1283         }
1284
1285         if (!err)
1286                 memcpy(dn, &(dnode_path->dn), sizeof(*dn));
1287
1288         while (dnode_path) {
1289                 dn_new = dnode_path->next;
1290                 free(dnode_path);
1291                 dnode_path = dn_new;
1292         }
1293         free(path_buf);
1294         return err;
1295 }
1296
1297
1298 /*
1299  * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1300  * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1301  * of pool/rootfs.
1302  *
1303  * If no fsname and no obj are given, return the DSL_DIR metadnode.
1304  * If fsname is given, return its metadnode and its matching object number.
1305  * If only obj is given, return the metadnode for this object number.
1306  *
1307  */
1308 static int
1309 get_filesystem_dnode(dnode_end_t *mosmdn, char *fsname,
1310                                          dnode_end_t *mdn, struct zfs_data *data)
1311 {
1312         uint64_t objnum;
1313         int err;
1314
1315         err = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1316                                         DMU_OT_OBJECT_DIRECTORY, mdn, data);
1317         if (err)
1318                 return err;
1319
1320         err = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum, data);
1321         if (err)
1322                 return err;
1323
1324         err = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, data);
1325         if (err)
1326                 return err;
1327
1328         while (*fsname) {
1329                 uint64_t childobj;
1330                 char *cname, ch;
1331
1332                 while (*fsname == '/')
1333                         fsname++;
1334
1335                 if (!*fsname || *fsname == '@')
1336                         break;
1337
1338                 cname = fsname;
1339                 while (*fsname && !isspace(*fsname) && *fsname != '/')
1340                         fsname++;
1341                 ch = *fsname;
1342                 *fsname = 0;
1343
1344                 childobj = zfs_to_cpu64((((dsl_dir_phys_t *) DN_BONUS(&mdn->dn)))->dd_child_dir_zapobj, mdn->endian);
1345                 err = dnode_get(mosmdn, childobj,
1346                                                 DMU_OT_DSL_DIR_CHILD_MAP, mdn, data);
1347                 if (err)
1348                         return err;
1349
1350                 err = zap_lookup(mdn, cname, &objnum, data);
1351                 if (err)
1352                         return err;
1353
1354                 err = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, data);
1355                 if (err)
1356                         return err;
1357
1358                 *fsname = ch;
1359         }
1360         return ZFS_ERR_NONE;
1361 }
1362
1363 static int
1364 make_mdn(dnode_end_t *mdn, struct zfs_data *data)
1365 {
1366         void *osp;
1367         blkptr_t *bp;
1368         size_t ospsize;
1369         int err;
1370
1371         bp = &(((dsl_dataset_phys_t *) DN_BONUS(&mdn->dn))->ds_bp);
1372         err = zio_read(bp, mdn->endian, &osp, &ospsize, data);
1373         if (err)
1374                 return err;
1375         if (ospsize < OBJSET_PHYS_SIZE_V14) {
1376                 free(osp);
1377                 printf("too small osp\n");
1378                 return ZFS_ERR_BAD_FS;
1379         }
1380
1381         mdn->endian = (zfs_to_cpu64(bp->blk_prop, mdn->endian)>>63) & 1;
1382         memmove((char *) &(mdn->dn),
1383                         (char *) &((objset_phys_t *) osp)->os_meta_dnode, DNODE_SIZE);
1384         free(osp);
1385         return ZFS_ERR_NONE;
1386 }
1387
1388 static int
1389 dnode_get_fullpath(const char *fullpath, dnode_end_t *mdn,
1390                                    uint64_t *mdnobj, dnode_end_t *dn, int *isfs,
1391                                    struct zfs_data *data)
1392 {
1393         char *fsname, *snapname;
1394         const char *ptr_at, *filename;
1395         uint64_t headobj;
1396         int err;
1397
1398         ptr_at = strchr(fullpath, '@');
1399         if (!ptr_at) {
1400                 *isfs = 1;
1401                 filename = 0;
1402                 snapname = 0;
1403                 fsname = strdup(fullpath);
1404         } else {
1405                 const char *ptr_slash = strchr(ptr_at, '/');
1406
1407                 *isfs = 0;
1408                 fsname = malloc(ptr_at - fullpath + 1);
1409                 if (!fsname)
1410                         return ZFS_ERR_OUT_OF_MEMORY;
1411                 memcpy(fsname, fullpath, ptr_at - fullpath);
1412                 fsname[ptr_at - fullpath] = 0;
1413                 if (ptr_at[1] && ptr_at[1] != '/') {
1414                         snapname = malloc(ptr_slash - ptr_at);
1415                         if (!snapname) {
1416                                 free(fsname);
1417                                 return ZFS_ERR_OUT_OF_MEMORY;
1418                         }
1419                         memcpy(snapname, ptr_at + 1, ptr_slash - ptr_at - 1);
1420                         snapname[ptr_slash - ptr_at - 1] = 0;
1421                 } else {
1422                         snapname = 0;
1423                 }
1424                 if (ptr_slash)
1425                         filename = ptr_slash;
1426                 else
1427                         filename = "/";
1428                 printf("zfs fsname = '%s' snapname='%s' filename = '%s'\n",
1429                            fsname, snapname, filename);
1430         }
1431
1432
1433         err = get_filesystem_dnode(&(data->mos), fsname, dn, data);
1434
1435         if (err) {
1436                 free(fsname);
1437                 free(snapname);
1438                 return err;
1439         }
1440
1441         headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&dn->dn))->dd_head_dataset_obj, dn->endian);
1442
1443         err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, mdn, data);
1444         if (err) {
1445                 free(fsname);
1446                 free(snapname);
1447                 return err;
1448         }
1449
1450         if (snapname) {
1451                 uint64_t snapobj;
1452
1453                 snapobj = zfs_to_cpu64(((dsl_dataset_phys_t *) DN_BONUS(&mdn->dn))->ds_snapnames_zapobj, mdn->endian);
1454
1455                 err = dnode_get(&(data->mos), snapobj,
1456                                                 DMU_OT_DSL_DS_SNAP_MAP, mdn, data);
1457                 if (!err)
1458                         err = zap_lookup(mdn, snapname, &headobj, data);
1459                 if (!err)
1460                         err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, mdn, data);
1461                 if (err) {
1462                         free(fsname);
1463                         free(snapname);
1464                         return err;
1465                 }
1466         }
1467
1468         if (mdnobj)
1469                 *mdnobj = headobj;
1470
1471         make_mdn(mdn, data);
1472
1473         if (*isfs) {
1474                 free(fsname);
1475                 free(snapname);
1476                 return ZFS_ERR_NONE;
1477         }
1478         err = dnode_get_path(mdn, filename, dn, data);
1479         free(fsname);
1480         free(snapname);
1481         return err;
1482 }
1483
1484 /*
1485  * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1486  *
1487  * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1488  *
1489  *              encoding method/host endian             (4 bytes)
1490  *              nvl_version                                             (4 bytes)
1491  *              nvl_nvflag                                              (4 bytes)
1492  *      encoded nvpairs:
1493  *              encoded size of the nvpair              (4 bytes)
1494  *              decoded size of the nvpair              (4 bytes)
1495  *              name string size                                (4 bytes)
1496  *              name string data                                (sizeof(NV_ALIGN4(string))
1497  *              data type                                               (4 bytes)
1498  *              # of elements in the nvpair             (4 bytes)
1499  *              data
1500  *              2 zero's for the last nvpair
1501  *              (end of the entire list)        (8 bytes)
1502  *
1503  */
1504
1505 static int
1506 nvlist_find_value(char *nvlist, char *name, int valtype, char **val,
1507                                   size_t *size_out, size_t *nelm_out)
1508 {
1509         int name_len, type, encode_size;
1510         char *nvpair, *nvp_name;
1511
1512         /* Verify if the 1st and 2nd byte in the nvlist are valid. */
1513         /* NOTE: independently of what endianness header announces all
1514            subsequent values are big-endian.  */
1515         if (nvlist[0] != NV_ENCODE_XDR || (nvlist[1] != NV_LITTLE_ENDIAN
1516                                                                            && nvlist[1] != NV_BIG_ENDIAN)) {
1517                 printf("zfs incorrect nvlist header\n");
1518                 return ZFS_ERR_BAD_FS;
1519         }
1520
1521         /* skip the header, nvl_version, and nvl_nvflag */
1522         nvlist = nvlist + 4 * 3;
1523         /*
1524          * Loop thru the nvpair list
1525          * The XDR representation of an integer is in big-endian byte order.
1526          */
1527         while ((encode_size = be32_to_cpu(*(uint32_t *) nvlist))) {
1528                 int nelm;
1529
1530                 nvpair = nvlist + 4 * 2;        /* skip the encode/decode size */
1531
1532                 name_len = be32_to_cpu(*(uint32_t *) nvpair);
1533                 nvpair += 4;
1534
1535                 nvp_name = nvpair;
1536                 nvpair = nvpair + ((name_len + 3) & ~3);        /* align */
1537
1538                 type = be32_to_cpu(*(uint32_t *) nvpair);
1539                 nvpair += 4;
1540
1541                 nelm = be32_to_cpu(*(uint32_t *) nvpair);
1542                 if (nelm < 1) {
1543                         printf("empty nvpair\n");
1544                         return ZFS_ERR_BAD_FS;
1545                 }
1546
1547                 nvpair += 4;
1548
1549                 if ((strncmp(nvp_name, name, name_len) == 0) && type == valtype) {
1550                         *val = nvpair;
1551                         *size_out = encode_size;
1552                         if (nelm_out)
1553                                 *nelm_out = nelm;
1554                         return 1;
1555                 }
1556
1557                 nvlist += encode_size;  /* goto the next nvpair */
1558         }
1559         return 0;
1560 }
1561
1562 int
1563 zfs_nvlist_lookup_uint64(char *nvlist, char *name, uint64_t *out)
1564 {
1565         char *nvpair;
1566         size_t size;
1567         int found;
1568
1569         found = nvlist_find_value(nvlist, name, DATA_TYPE_UINT64, &nvpair, &size, 0);
1570         if (!found)
1571                 return 0;
1572         if (size < sizeof(uint64_t)) {
1573                 printf("invalid uint64\n");
1574                 return ZFS_ERR_BAD_FS;
1575         }
1576
1577         *out = be64_to_cpu(*(uint64_t *) nvpair);
1578         return 1;
1579 }
1580
1581 char *
1582 zfs_nvlist_lookup_string(char *nvlist, char *name)
1583 {
1584         char *nvpair;
1585         char *ret;
1586         size_t slen;
1587         size_t size;
1588         int found;
1589
1590         found = nvlist_find_value(nvlist, name, DATA_TYPE_STRING, &nvpair, &size, 0);
1591         if (!found)
1592                 return 0;
1593         if (size < 4) {
1594                 printf("invalid string\n");
1595                 return 0;
1596         }
1597         slen = be32_to_cpu(*(uint32_t *) nvpair);
1598         if (slen > size - 4)
1599                 slen = size - 4;
1600         ret = malloc(slen + 1);
1601         if (!ret)
1602                 return 0;
1603         memcpy(ret, nvpair + 4, slen);
1604         ret[slen] = 0;
1605         return ret;
1606 }
1607
1608 char *
1609 zfs_nvlist_lookup_nvlist(char *nvlist, char *name)
1610 {
1611         char *nvpair;
1612         char *ret;
1613         size_t size;
1614         int found;
1615
1616         found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1617                                                           &size, 0);
1618         if (!found)
1619                 return 0;
1620         ret = calloc(1, size + 3 * sizeof(uint32_t));
1621         if (!ret)
1622                 return 0;
1623         memcpy(ret, nvlist, sizeof(uint32_t));
1624
1625         memcpy(ret + sizeof(uint32_t), nvpair, size);
1626         return ret;
1627 }
1628
1629 int
1630 zfs_nvlist_lookup_nvlist_array_get_nelm(char *nvlist, char *name)
1631 {
1632         char *nvpair;
1633         size_t nelm, size;
1634         int found;
1635
1636         found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1637                                                           &size, &nelm);
1638         if (!found)
1639                 return -1;
1640         return nelm;
1641 }
1642
1643 char *
1644 zfs_nvlist_lookup_nvlist_array(char *nvlist, char *name,
1645                                                                         size_t index)
1646 {
1647         char *nvpair, *nvpairptr;
1648         int found;
1649         char *ret;
1650         size_t size;
1651         unsigned i;
1652         size_t nelm;
1653
1654         found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1655                                                           &size, &nelm);
1656         if (!found)
1657                 return 0;
1658         if (index >= nelm) {
1659                 printf("trying to lookup past nvlist array\n");
1660                 return 0;
1661         }
1662
1663         nvpairptr = nvpair;
1664
1665         for (i = 0; i < index; i++) {
1666                 uint32_t encode_size;
1667
1668                 /* skip the header, nvl_version, and nvl_nvflag */
1669                 nvpairptr = nvpairptr + 4 * 2;
1670
1671                 while (nvpairptr < nvpair + size
1672                            && (encode_size = be32_to_cpu(*(uint32_t *) nvpairptr)))
1673                         nvlist += encode_size;  /* goto the next nvpair */
1674
1675                 nvlist = nvlist + 4 * 2;        /* skip the ending 2 zeros - 8 bytes */
1676         }
1677
1678         if (nvpairptr >= nvpair + size
1679                 || nvpairptr + be32_to_cpu(*(uint32_t *) (nvpairptr + 4 * 2))
1680                 >= nvpair + size) {
1681                 printf("incorrect nvlist array\n");
1682                 return 0;
1683         }
1684
1685         ret = calloc(1, be32_to_cpu(*(uint32_t *) (nvpairptr + 4 * 2))
1686                                  + 3 * sizeof(uint32_t));
1687         if (!ret)
1688                 return 0;
1689         memcpy(ret, nvlist, sizeof(uint32_t));
1690
1691         memcpy(ret + sizeof(uint32_t), nvpairptr, size);
1692         return ret;
1693 }
1694
1695 static int
1696 int_zfs_fetch_nvlist(struct zfs_data *data, char **nvlist)
1697 {
1698         int err;
1699
1700         *nvlist = malloc(VDEV_PHYS_SIZE);
1701         /* Read in the vdev name-value pair list (112K). */
1702         err = zfs_devread(data->vdev_phys_sector, 0, VDEV_PHYS_SIZE, *nvlist);
1703         if (err) {
1704                 free(*nvlist);
1705                 *nvlist = 0;
1706                 return err;
1707         }
1708         return ZFS_ERR_NONE;
1709 }
1710
1711 /*
1712  * Check the disk label information and retrieve needed vdev name-value pairs.
1713  *
1714  */
1715 static int
1716 check_pool_label(struct zfs_data *data)
1717 {
1718         uint64_t pool_state;
1719         char *nvlist;                   /* for the pool */
1720         char *vdevnvlist;               /* for the vdev */
1721         uint64_t diskguid;
1722         uint64_t version;
1723         int found;
1724         int err;
1725
1726         err = int_zfs_fetch_nvlist(data, &nvlist);
1727         if (err)
1728                 return err;
1729
1730         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_STATE,
1731                                                                                   &pool_state);
1732         if (!found) {
1733                 free(nvlist);
1734                 printf("zfs pool state not found\n");
1735                 return ZFS_ERR_BAD_FS;
1736         }
1737
1738         if (pool_state == POOL_STATE_DESTROYED) {
1739                 free(nvlist);
1740                 printf("zpool is marked as destroyed\n");
1741                 return ZFS_ERR_BAD_FS;
1742         }
1743
1744         data->label_txg = 0;
1745         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_TXG,
1746                                                                                   &data->label_txg);
1747         if (!found) {
1748                 free(nvlist);
1749                 printf("zfs pool txg not found\n");
1750                 return ZFS_ERR_BAD_FS;
1751         }
1752
1753         /* not an active device */
1754         if (data->label_txg == 0) {
1755                 free(nvlist);
1756                 printf("zpool is not active\n");
1757                 return ZFS_ERR_BAD_FS;
1758         }
1759
1760         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_VERSION,
1761                                                                                   &version);
1762         if (!found) {
1763                 free(nvlist);
1764                 printf("zpool config version not found\n");
1765                 return ZFS_ERR_BAD_FS;
1766         }
1767
1768         if (version > SPA_VERSION) {
1769                 free(nvlist);
1770                 printf("SPA version too new %llu > %llu\n",
1771                            (unsigned long long) version,
1772                            (unsigned long long) SPA_VERSION);
1773                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
1774         }
1775
1776         vdevnvlist = zfs_nvlist_lookup_nvlist(nvlist, ZPOOL_CONFIG_VDEV_TREE);
1777         if (!vdevnvlist) {
1778                 free(nvlist);
1779                 printf("ZFS config vdev tree not found\n");
1780                 return ZFS_ERR_BAD_FS;
1781         }
1782
1783         found = zfs_nvlist_lookup_uint64(vdevnvlist, ZPOOL_CONFIG_ASHIFT,
1784                                                                                   &data->vdev_ashift);
1785         free(vdevnvlist);
1786         if (!found) {
1787                 free(nvlist);
1788                 printf("ZPOOL config ashift not found\n");
1789                 return ZFS_ERR_BAD_FS;
1790         }
1791
1792         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_GUID, &diskguid);
1793         if (!found) {
1794                 free(nvlist);
1795                 printf("ZPOOL config guid not found\n");
1796                 return ZFS_ERR_BAD_FS;
1797         }
1798
1799         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_GUID, &data->pool_guid);
1800         if (!found) {
1801                 free(nvlist);
1802                 printf("ZPOOL config pool guid not found\n");
1803                 return ZFS_ERR_BAD_FS;
1804         }
1805
1806         free(nvlist);
1807
1808         printf("ZFS Pool GUID: %llu (%016llx) Label: GUID: %llu (%016llx), txg: %llu, SPA v%llu, ashift: %llu\n",
1809                    (unsigned long long) data->pool_guid,
1810                    (unsigned long long) data->pool_guid,
1811                    (unsigned long long) diskguid,
1812                    (unsigned long long) diskguid,
1813                    (unsigned long long) data->label_txg,
1814                    (unsigned long long) version,
1815                    (unsigned long long) data->vdev_ashift);
1816
1817         return ZFS_ERR_NONE;
1818 }
1819
1820 /*
1821  * vdev_label_start returns the physical disk offset (in bytes) of
1822  * label "l".
1823  */
1824 static uint64_t vdev_label_start(uint64_t psize, int l)
1825 {
1826         return (l * sizeof(vdev_label_t) + (l < VDEV_LABELS / 2 ?
1827                                                                                 0 : psize -
1828                                                                                 VDEV_LABELS * sizeof(vdev_label_t)));
1829 }
1830
1831 void
1832 zfs_unmount(struct zfs_data *data)
1833 {
1834         free(data->dnode_buf);
1835         free(data->dnode_mdn);
1836         free(data->file_buf);
1837         free(data);
1838 }
1839
1840 /*
1841  * zfs_mount() locates a valid uberblock of the root pool and read in its MOS
1842  * to the memory address MOS.
1843  *
1844  */
1845 struct zfs_data *
1846 zfs_mount(device_t dev)
1847 {
1848         struct zfs_data *data = 0;
1849         int label = 0, bestlabel = -1;
1850         char *ub_array;
1851         uberblock_t *ubbest;
1852         uberblock_t *ubcur = NULL;
1853         void *osp = 0;
1854         size_t ospsize;
1855         int err;
1856
1857         data = malloc(sizeof(*data));
1858         if (!data)
1859                 return 0;
1860         memset(data, 0, sizeof(*data));
1861
1862         ub_array = malloc(VDEV_UBERBLOCK_RING);
1863         if (!ub_array) {
1864                 zfs_unmount(data);
1865                 return 0;
1866         }
1867
1868         ubbest = malloc(sizeof(*ubbest));
1869         if (!ubbest) {
1870                 free(ub_array);
1871                 zfs_unmount(data);
1872                 return 0;
1873         }
1874         memset(ubbest, 0, sizeof(*ubbest));
1875
1876         /*
1877          * some eltorito stacks don't give us a size and
1878          * we end up setting the size to MAXUINT, further
1879          * some of these devices stop working once a single
1880          * read past the end has been issued. Checking
1881          * for a maximum part_length and skipping the backup
1882          * labels at the end of the slice/partition/device
1883          * avoids breaking down on such devices.
1884          */
1885         const int vdevnum =
1886                 dev->part_length == 0 ?
1887                 VDEV_LABELS / 2 : VDEV_LABELS;
1888
1889         /* Size in bytes of the device (disk or partition) aligned to label size*/
1890         uint64_t device_size =
1891                 dev->part_length << SECTOR_BITS;
1892
1893         const uint64_t alignedbytes =
1894                 P2ALIGN(device_size, (uint64_t) sizeof(vdev_label_t));
1895
1896         for (label = 0; label < vdevnum; label++) {
1897                 uint64_t labelstartbytes = vdev_label_start(alignedbytes, label);
1898                 uint64_t labelstart = labelstartbytes >> SECTOR_BITS;
1899
1900                 debug("zfs reading label %d at sector %llu (byte %llu)\n",
1901                           label, (unsigned long long) labelstart,
1902                           (unsigned long long) labelstartbytes);
1903
1904                 data->vdev_phys_sector = labelstart +
1905                         ((VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE) >> SECTOR_BITS);
1906
1907                 err = check_pool_label(data);
1908                 if (err) {
1909                         printf("zfs error checking label %d\n", label);
1910                         continue;
1911                 }
1912
1913                 /* Read in the uberblock ring (128K). */
1914                 err = zfs_devread(data->vdev_phys_sector  +
1915                                                   (VDEV_PHYS_SIZE >> SECTOR_BITS),
1916                                                   0, VDEV_UBERBLOCK_RING, ub_array);
1917                 if (err) {
1918                         printf("zfs error reading uberblock ring for label %d\n", label);
1919                         continue;
1920                 }
1921
1922                 ubcur = find_bestub(ub_array, data);
1923                 if (!ubcur) {
1924                         printf("zfs No good uberblocks found in label %d\n", label);
1925                         continue;
1926                 }
1927
1928                 if (vdev_uberblock_compare(ubcur, ubbest) > 0) {
1929                         /* Looks like the block is good, so use it.*/
1930                         memcpy(ubbest, ubcur, sizeof(*ubbest));
1931                         bestlabel = label;
1932                         debug("zfs Current best uberblock found in label %d\n", label);
1933                 }
1934         }
1935         free(ub_array);
1936
1937         /* We zero'd the structure to begin with.  If we never assigned to it,
1938            magic will still be zero. */
1939         if (!ubbest->ub_magic) {
1940                 printf("couldn't find a valid ZFS label\n");
1941                 zfs_unmount(data);
1942                 free(ubbest);
1943                 return 0;
1944         }
1945
1946         debug("zfs ubbest %p in label %d\n", ubbest, bestlabel);
1947
1948         zfs_endian_t ub_endian =
1949                 zfs_to_cpu64(ubbest->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC
1950                 ? LITTLE_ENDIAN : BIG_ENDIAN;
1951
1952         debug("zfs endian set to %s\n", !ub_endian ? "big" : "little");
1953
1954         err = zio_read(&ubbest->ub_rootbp, ub_endian, &osp, &ospsize, data);
1955
1956         if (err) {
1957                 printf("couldn't zio_read object directory\n");
1958                 zfs_unmount(data);
1959                 free(osp);
1960                 free(ubbest);
1961                 return 0;
1962         }
1963
1964         if (ospsize < OBJSET_PHYS_SIZE_V14) {
1965                 printf("osp too small\n");
1966                 zfs_unmount(data);
1967                 free(osp);
1968                 free(ubbest);
1969                 return 0;
1970         }
1971
1972         /* Got the MOS. Save it at the memory addr MOS. */
1973         memmove(&(data->mos.dn), &((objset_phys_t *) osp)->os_meta_dnode, DNODE_SIZE);
1974         data->mos.endian =
1975                 (zfs_to_cpu64(ubbest->ub_rootbp.blk_prop, ub_endian) >> 63) & 1;
1976         memmove(&(data->current_uberblock), ubbest, sizeof(uberblock_t));
1977
1978         free(osp);
1979         free(ubbest);
1980
1981         return data;
1982 }
1983
1984 int
1985 zfs_fetch_nvlist(device_t dev, char **nvlist)
1986 {
1987         struct zfs_data *zfs;
1988         int err;
1989
1990         zfs = zfs_mount(dev);
1991         if (!zfs)
1992                 return ZFS_ERR_BAD_FS;
1993         err = int_zfs_fetch_nvlist(zfs, nvlist);
1994         zfs_unmount(zfs);
1995         return err;
1996 }
1997
1998 /*
1999  * zfs_open() locates a file in the rootpool by following the
2000  * MOS and places the dnode of the file in the memory address DNODE.
2001  */
2002 int
2003 zfs_open(struct zfs_file *file, const char *fsfilename)
2004 {
2005         struct zfs_data *data;
2006         int err;
2007         int isfs;
2008
2009         data = zfs_mount(file->device);
2010         if (!data)
2011                 return ZFS_ERR_BAD_FS;
2012
2013         err = dnode_get_fullpath(fsfilename, &(data->mdn), 0,
2014                                                          &(data->dnode), &isfs, data);
2015         if (err) {
2016                 zfs_unmount(data);
2017                 return err;
2018         }
2019
2020         if (isfs) {
2021                 zfs_unmount(data);
2022                 printf("Missing @ or / separator\n");
2023                 return ZFS_ERR_FILE_NOT_FOUND;
2024         }
2025
2026         /* We found the dnode for this file. Verify if it is a plain file. */
2027         if (data->dnode.dn.dn_type != DMU_OT_PLAIN_FILE_CONTENTS) {
2028                 zfs_unmount(data);
2029                 printf("not a file\n");
2030                 return ZFS_ERR_BAD_FILE_TYPE;
2031         }
2032
2033         /* get the file size and set the file position to 0 */
2034
2035         /*
2036          * For DMU_OT_SA we will need to locate the SIZE attribute
2037          * attribute, which could be either in the bonus buffer
2038          * or the "spill" block.
2039          */
2040         if (data->dnode.dn.dn_bonustype == DMU_OT_SA) {
2041                 void *sahdrp;
2042                 int hdrsize;
2043
2044                 if (data->dnode.dn.dn_bonuslen != 0) {
2045                         sahdrp = (sa_hdr_phys_t *) DN_BONUS(&data->dnode.dn);
2046                 } else if (data->dnode.dn.dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
2047                         blkptr_t *bp = &data->dnode.dn.dn_spill;
2048
2049                         err = zio_read(bp, data->dnode.endian, &sahdrp, NULL, data);
2050                         if (err)
2051                                 return err;
2052                 } else {
2053                         printf("filesystem is corrupt :(\n");
2054                         return ZFS_ERR_BAD_FS;
2055                 }
2056
2057                 hdrsize = SA_HDR_SIZE(((sa_hdr_phys_t *) sahdrp));
2058                 file->size = *(uint64_t *) ((char *) sahdrp + hdrsize + SA_SIZE_OFFSET);
2059                 if ((data->dnode.dn.dn_bonuslen == 0) &&
2060                         (data->dnode.dn.dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2061                         free(sahdrp);
2062         } else {
2063                 file->size = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&data->dnode.dn))->zp_size, data->dnode.endian);
2064         }
2065
2066         file->data = data;
2067         file->offset = 0;
2068
2069         return ZFS_ERR_NONE;
2070 }
2071
2072 uint64_t
2073 zfs_read(zfs_file_t file, char *buf, uint64_t len)
2074 {
2075         struct zfs_data *data = (struct zfs_data *) file->data;
2076         int blksz, movesize;
2077         uint64_t length;
2078         int64_t red;
2079         int err;
2080
2081         if (data->file_buf == NULL) {
2082                 data->file_buf = malloc(SPA_MAXBLOCKSIZE);
2083                 if (!data->file_buf)
2084                         return -1;
2085                 data->file_start = data->file_end = 0;
2086         }
2087
2088         /*
2089          * If offset is in memory, move it into the buffer provided and return.
2090          */
2091         if (file->offset >= data->file_start
2092                 && file->offset + len <= data->file_end) {
2093                 memmove(buf, data->file_buf + file->offset - data->file_start,
2094                                 len);
2095                 return len;
2096         }
2097
2098         blksz = zfs_to_cpu16(data->dnode.dn.dn_datablkszsec,
2099                                                           data->dnode.endian) << SPA_MINBLOCKSHIFT;
2100
2101         /*
2102          * Entire Dnode is too big to fit into the space available.      We
2103          * will need to read it in chunks.      This could be optimized to
2104          * read in as large a chunk as there is space available, but for
2105          * now, this only reads in one data block at a time.
2106          */
2107         length = len;
2108         red = 0;
2109         while (length) {
2110                 void *t;
2111                 /*
2112                  * Find requested blkid and the offset within that block.
2113                  */
2114                 uint64_t blkid = file->offset + red;
2115                 blkid = do_div(blkid, blksz);
2116                 free(data->file_buf);
2117                 data->file_buf = 0;
2118
2119                 err = dmu_read(&(data->dnode), blkid, &t,
2120                                            0, data);
2121                 data->file_buf = t;
2122                 if (err)
2123                         return -1;
2124
2125                 data->file_start = blkid * blksz;
2126                 data->file_end = data->file_start + blksz;
2127
2128                 movesize = min(length, data->file_end - (int)file->offset - red);
2129
2130                 memmove(buf, data->file_buf + file->offset + red
2131                                 - data->file_start, movesize);
2132                 buf += movesize;
2133                 length -= movesize;
2134                 red += movesize;
2135         }
2136
2137         return len;
2138 }
2139
2140 int
2141 zfs_close(zfs_file_t file)
2142 {
2143         zfs_unmount((struct zfs_data *) file->data);
2144         return ZFS_ERR_NONE;
2145 }
2146
2147 int
2148 zfs_getmdnobj(device_t dev, const char *fsfilename,
2149                                    uint64_t *mdnobj)
2150 {
2151         struct zfs_data *data;
2152         int err;
2153         int isfs;
2154
2155         data = zfs_mount(dev);
2156         if (!data)
2157                 return ZFS_ERR_BAD_FS;
2158
2159         err = dnode_get_fullpath(fsfilename, &(data->mdn), mdnobj,
2160                                                          &(data->dnode), &isfs, data);
2161         zfs_unmount(data);
2162         return err;
2163 }
2164
2165 static void
2166 fill_fs_info(struct zfs_dirhook_info *info,
2167                          dnode_end_t mdn, struct zfs_data *data)
2168 {
2169         int err;
2170         dnode_end_t dn;
2171         uint64_t objnum;
2172         uint64_t headobj;
2173
2174         memset(info, 0, sizeof(*info));
2175
2176         info->dir = 1;
2177
2178         if (mdn.dn.dn_type == DMU_OT_DSL_DIR) {
2179                 headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&mdn.dn))->dd_head_dataset_obj, mdn.endian);
2180
2181                 err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, &mdn, data);
2182                 if (err) {
2183                         printf("zfs failed here 1\n");
2184                         return;
2185                 }
2186         }
2187         make_mdn(&mdn, data);
2188         err = dnode_get(&mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
2189                                         &dn, data);
2190         if (err) {
2191                 printf("zfs failed here 2\n");
2192                 return;
2193         }
2194
2195         err = zap_lookup(&dn, ZFS_ROOT_OBJ, &objnum, data);
2196         if (err) {
2197                 printf("zfs failed here 3\n");
2198                 return;
2199         }
2200
2201         err = dnode_get(&mdn, objnum, 0, &dn, data);
2202         if (err) {
2203                 printf("zfs failed here 4\n");
2204                 return;
2205         }
2206
2207         info->mtimeset = 1;
2208         info->mtime = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&dn.dn))->zp_mtime[0], dn.endian);
2209
2210         return;
2211 }
2212
2213 static int iterate_zap(const char *name, uint64_t val, struct zfs_data *data)
2214 {
2215         struct zfs_dirhook_info info;
2216         dnode_end_t dn;
2217
2218         memset(&info, 0, sizeof(info));
2219
2220         dnode_get(&(data->mdn), val, 0, &dn, data);
2221         info.mtimeset = 1;
2222         info.mtime = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&dn.dn))->zp_mtime[0], dn.endian);
2223         info.dir = (dn.dn.dn_type == DMU_OT_DIRECTORY_CONTENTS);
2224         debug("zfs type=%d, name=%s\n",
2225                   (int)dn.dn.dn_type, (char *)name);
2226         if (!data->userhook)
2227                 return 0;
2228         return data->userhook(name, &info);
2229 }
2230
2231 static int iterate_zap_fs(const char *name, uint64_t val, struct zfs_data *data)
2232 {
2233         struct zfs_dirhook_info info;
2234         dnode_end_t mdn;
2235         int err;
2236         err = dnode_get(&(data->mos), val, 0, &mdn, data);
2237         if (err)
2238                 return 0;
2239         if (mdn.dn.dn_type != DMU_OT_DSL_DIR)
2240                 return 0;
2241
2242         fill_fs_info(&info, mdn, data);
2243
2244         if (!data->userhook)
2245                 return 0;
2246         return data->userhook(name, &info);
2247 }
2248
2249 static int iterate_zap_snap(const char *name, uint64_t val, struct zfs_data *data)
2250 {
2251         struct zfs_dirhook_info info;
2252         char *name2;
2253         int ret = 0;
2254         dnode_end_t mdn;
2255         int err;
2256
2257         err = dnode_get(&(data->mos), val, 0, &mdn, data);
2258         if (err)
2259                 return 0;
2260
2261         if (mdn.dn.dn_type != DMU_OT_DSL_DATASET)
2262                 return 0;
2263
2264         fill_fs_info(&info, mdn, data);
2265
2266         name2 = malloc(strlen(name) + 2);
2267         name2[0] = '@';
2268         memcpy(name2 + 1, name, strlen(name) + 1);
2269         if (data->userhook)
2270                 ret = data->userhook(name2, &info);
2271         free(name2);
2272         return ret;
2273 }
2274
2275 int
2276 zfs_ls(device_t device, const char *path,
2277            int (*hook)(const char *, const struct zfs_dirhook_info *))
2278 {
2279         struct zfs_data *data;
2280         int err;
2281         int isfs;
2282
2283         data = zfs_mount(device);
2284         if (!data)
2285                 return ZFS_ERR_BAD_FS;
2286
2287         data->userhook = hook;
2288
2289         err = dnode_get_fullpath(path, &(data->mdn), 0, &(data->dnode), &isfs, data);
2290         if (err) {
2291                 zfs_unmount(data);
2292                 return err;
2293         }
2294         if (isfs) {
2295                 uint64_t childobj, headobj;
2296                 uint64_t snapobj;
2297                 dnode_end_t dn;
2298                 struct zfs_dirhook_info info;
2299
2300                 fill_fs_info(&info, data->dnode, data);
2301                 hook("@", &info);
2302
2303                 childobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&data->dnode.dn))->dd_child_dir_zapobj, data->dnode.endian);
2304                 headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&data->dnode.dn))->dd_head_dataset_obj, data->dnode.endian);
2305                 err = dnode_get(&(data->mos), childobj,
2306                                                 DMU_OT_DSL_DIR_CHILD_MAP, &dn, data);
2307                 if (err) {
2308                         zfs_unmount(data);
2309                         return err;
2310                 }
2311
2312
2313                 zap_iterate(&dn, iterate_zap_fs, data);
2314
2315                 err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, &dn, data);
2316                 if (err) {
2317                         zfs_unmount(data);
2318                         return err;
2319                 }
2320
2321                 snapobj = zfs_to_cpu64(((dsl_dataset_phys_t *) DN_BONUS(&dn.dn))->ds_snapnames_zapobj, dn.endian);
2322
2323                 err = dnode_get(&(data->mos), snapobj,
2324                                                 DMU_OT_DSL_DS_SNAP_MAP, &dn, data);
2325                 if (err) {
2326                         zfs_unmount(data);
2327                         return err;
2328                 }
2329
2330                 zap_iterate(&dn, iterate_zap_snap, data);
2331         } else {
2332                 if (data->dnode.dn.dn_type != DMU_OT_DIRECTORY_CONTENTS) {
2333                         zfs_unmount(data);
2334                         printf("not a directory\n");
2335                         return ZFS_ERR_BAD_FILE_TYPE;
2336                 }
2337                 zap_iterate(&(data->dnode), iterate_zap, data);
2338         }
2339         zfs_unmount(data);
2340         return ZFS_ERR_NONE;
2341 }