rockchip: make_fit_atf: use elf entry point
[platform/kernel/u-boot.git] / fs / zfs / zfs.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  *
4  * ZFS filesystem ported to u-boot by
5  * Jorgen Lundman <lundman at lundman.net>
6  *
7  *      GRUB  --  GRand Unified Bootloader
8  *      Copyright (C) 1999,2000,2001,2002,2003,2004
9  *      Free Software Foundation, Inc.
10  *      Copyright 2004  Sun Microsystems, Inc.
11  */
12
13 #include <common.h>
14 #include <malloc.h>
15 #include <linux/stat.h>
16 #include <linux/time.h>
17 #include <linux/ctype.h>
18 #include <asm/byteorder.h>
19 #include "zfs_common.h"
20 #include "div64.h"
21
22 struct blk_desc *zfs_dev_desc;
23
24 /*
25  * The zfs plug-in routines for GRUB are:
26  *
27  * zfs_mount() - locates a valid uberblock of the root pool and reads
28  *              in its MOS at the memory address MOS.
29  *
30  * zfs_open() - locates a plain file object by following the MOS
31  *              and places its dnode at the memory address DNODE.
32  *
33  * zfs_read() - read in the data blocks pointed by the DNODE.
34  *
35  */
36
37 #include <zfs/zfs.h>
38 #include <zfs/zio.h>
39 #include <zfs/dnode.h>
40 #include <zfs/uberblock_impl.h>
41 #include <zfs/vdev_impl.h>
42 #include <zfs/zio_checksum.h>
43 #include <zfs/zap_impl.h>
44 #include <zfs/zap_leaf.h>
45 #include <zfs/zfs_znode.h>
46 #include <zfs/dmu.h>
47 #include <zfs/dmu_objset.h>
48 #include <zfs/sa_impl.h>
49 #include <zfs/dsl_dir.h>
50 #include <zfs/dsl_dataset.h>
51
52
53 #define ZPOOL_PROP_BOOTFS               "bootfs"
54
55
56 /*
57  * For nvlist manipulation. (from nvpair.h)
58  */
59 #define NV_ENCODE_NATIVE        0
60 #define NV_ENCODE_XDR           1
61 #define NV_BIG_ENDIAN                   0
62 #define NV_LITTLE_ENDIAN        1
63 #define DATA_TYPE_UINT64        8
64 #define DATA_TYPE_STRING        9
65 #define DATA_TYPE_NVLIST        19
66 #define DATA_TYPE_NVLIST_ARRAY  20
67
68
69 /*
70  * Macros to get fields in a bp or DVA.
71  */
72 #define P2PHASE(x, align)               ((x) & ((align) - 1))
73 #define DVA_OFFSET_TO_PHYS_SECTOR(offset)                                       \
74         ((offset + VDEV_LABEL_START_SIZE) >> SPA_MINBLOCKSHIFT)
75
76 /*
77  * return x rounded down to an align boundary
78  * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
79  * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
80  * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
81  * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
82  */
83 #define P2ALIGN(x, align)               ((x) & -(align))
84
85 /*
86  * FAT ZAP data structures
87  */
88 #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL    /* ECMA-182, reflected form */
89 #define ZAP_HASH_IDX(hash, n)   (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
90 #define CHAIN_END       0xffff  /* end of the chunk chain */
91
92 /*
93  * The amount of space within the chunk available for the array is:
94  * chunk size - space for type (1) - space for next pointer (2)
95  */
96 #define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
97
98 #define ZAP_LEAF_HASH_SHIFT(bs) (bs - 5)
99 #define ZAP_LEAF_HASH_NUMENTRIES(bs) (1 << ZAP_LEAF_HASH_SHIFT(bs))
100 #define LEAF_HASH(bs, h)                                                                                                \
101         ((ZAP_LEAF_HASH_NUMENTRIES(bs)-1) &                                                                     \
102          ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(bs)-l->l_hdr.lh_prefix_len)))
103
/*
 * The number of chunks that fit in a leaf block is:
 * (block size - hash entry size (2) * number of hash entries)
 * / chunk size - header space (2 chunks)
 */
109 #define ZAP_LEAF_NUMCHUNKS(bs)                                          \
110         (((1<<bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(bs)) /   \
111          ZAP_LEAF_CHUNKSIZE - 2)
112
113 /*
114  * The chunks start immediately after the hash table.  The end of the
115  * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
116  * chunk_t.
117  */
118 #define ZAP_LEAF_CHUNK(l, bs, idx)                                                                              \
119         ((zap_leaf_chunk_t *)(l->l_hash + ZAP_LEAF_HASH_NUMENTRIES(bs)))[idx]
120 #define ZAP_LEAF_ENTRY(l, bs, idx) (&ZAP_LEAF_CHUNK(l, bs, idx).l_entry)
121
122
123 /*
124  * Decompression Entry - lzjb
125  */
126 #ifndef NBBY
127 #define NBBY    8
128 #endif
129
130
131
/*
 * Signature shared by every block decompressor: read s_len bytes from
 * s_start and write the expanded data into d_start, which holds d_len
 * bytes.  zio_read() treats a non-zero return as an error code and
 * passes it straight back to its caller.
 */
typedef int zfs_decomp_func_t(void *s_start, void *d_start,
                                                          uint32_t s_len, uint32_t d_len);
/* One row of decomp_table: property name plus the routine that undoes it. */
typedef struct decomp_entry {
        char *name;                     /* name printed in error messages */
        zfs_decomp_func_t *decomp_func; /* NULL when no decompressor exists */
} decomp_entry_t;
138
/*
 * A dnode bundled with the byte order its fields should be decoded with
 * (dmu_read() passes 'endian' to zfs_to_cpu16() when reading dn fields).
 */
typedef struct dnode_end {
        dnode_phys_t dn;
        zfs_endian_t endian;
} dnode_end_t;
143
/*
 * State handed to the reader routines in this file as 'data'.
 */
struct zfs_data {
        /* cache for a file block of the currently zfs_open()-ed file */
        char *file_buf;
        uint64_t file_start;
        uint64_t file_end;

        /* XXX: ashift is per vdev, not per pool.  We currently only ever touch
         * a single vdev, but when/if raid-z or stripes are supported, this
         * may need revision.
         */
        uint64_t vdev_ashift;   /* feeds UBERBLOCK_SIZE()/UBERBLOCK_COUNT() */
        uint64_t label_txg;     /* uberblocks with a smaller txg are rejected */
        uint64_t pool_guid;

        /* cache for a dnode block */
        dnode_phys_t *dnode_buf;
        dnode_phys_t *dnode_mdn;
        uint64_t dnode_start;
        uint64_t dnode_end;
        zfs_endian_t dnode_endian;

        uberblock_t current_uberblock;

        dnode_end_t mos;
        dnode_end_t mdn;
        dnode_end_t dnode;

        /* base sector used by find_bestub() to compute uberblock offsets */
        uint64_t vdev_phys_sector;

        /* NOTE(review): presumably the directory-listing callback and its
         * scratch info; neither is used in the part of the file shown here.
         */
        int (*userhook)(const char *, const struct zfs_dirhook_info *);
        struct zfs_dirhook_info *dirinfo;

};
177
178
179
180
181 static int
182 zlib_decompress(void *s, void *d,
183                                 uint32_t slen, uint32_t dlen)
184 {
185         if (zlib_decompress(s, d, slen, dlen) < 0)
186                 return ZFS_ERR_BAD_FS;
187         return ZFS_ERR_NONE;
188 }
189
/*
 * Decompressor lookup, indexed by the compression id that zio_read()
 * extracts from blk_prop.  A NULL decomp_func means "nothing to call":
 * zio_read() accepts that only for ZIO_COMPRESS_OFF and reports every
 * other such id as unsupported.
 */
static decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] = {
        {"inherit", NULL},              /* ZIO_COMPRESS_INHERIT */
        {"on", lzjb_decompress},        /* ZIO_COMPRESS_ON */
        {"off", NULL},          /* ZIO_COMPRESS_OFF */
        {"lzjb", lzjb_decompress},      /* ZIO_COMPRESS_LZJB */
        {"empty", NULL},                /* ZIO_COMPRESS_EMPTY */
        {"gzip-1", zlib_decompress},  /* ZIO_COMPRESS_GZIP1 */
        {"gzip-2", zlib_decompress},  /* ZIO_COMPRESS_GZIP2 */
        {"gzip-3", zlib_decompress},  /* ZIO_COMPRESS_GZIP3 */
        {"gzip-4", zlib_decompress},  /* ZIO_COMPRESS_GZIP4 */
        {"gzip-5", zlib_decompress},  /* ZIO_COMPRESS_GZIP5 */
        {"gzip-6", zlib_decompress},  /* ZIO_COMPRESS_GZIP6 */
        {"gzip-7", zlib_decompress},  /* ZIO_COMPRESS_GZIP7 */
        {"gzip-8", zlib_decompress},  /* ZIO_COMPRESS_GZIP8 */
        {"gzip-9", zlib_decompress},  /* ZIO_COMPRESS_GZIP9 */
};
206
207
208
209 static int zio_read_data(blkptr_t *bp, zfs_endian_t endian,
210                                                  void *buf, struct zfs_data *data);
211
212 static int
213 zio_read(blkptr_t *bp, zfs_endian_t endian, void **buf,
214                  size_t *size, struct zfs_data *data);
215
/*
 * Integer base-2 logarithm, rounded down: the number of times num can be
 * halved before it reaches 1.  Both 0 and 1 yield 0.
 */
static int
zfs_log2(uint64_t num)
{
        int bits;

        for (bits = 0; num > 1; num >>= 1)
                bits++;

        return bits;
}
231
232
233 /* Checksum Functions */
/*
 * Checksum routine for the "off" algorithm: ignores the buffer, its size
 * and the byte order, and sets all four words of *zcp to zero.
 */
static void
zio_checksum_off(const void *buf __attribute__ ((unused)),
                                 uint64_t size __attribute__ ((unused)),
                                 zfs_endian_t endian __attribute__ ((unused)),
                                 zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
242
/*
 * Checksum Table and Values
 *
 * Indexed by the checksum id passed to zio_checksum_verify().  Rows whose
 * function pointer is NULL are rejected there as unknown.
 * NOTE(review): the two integer columns are presumably the "correctable"
 * and "embedded checksum (ci_eck)" flags -- confirm against the
 * zio_checksum_info_t declaration.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {NULL, 0, 0, "inherit"},
        {NULL, 0, 0, "on"},
        {zio_checksum_off, 0, 0, "off"},
        {zio_checksum_SHA256, 1, 1, "label"},
        {zio_checksum_SHA256, 1, 1, "gang_header"},
        {NULL, 0, 0, "zilog"},
        {fletcher_2_endian, 0, 0, "fletcher2"},
        {fletcher_4_endian, 1, 0, "fletcher4"},
        {zio_checksum_SHA256, 1, 0, "SHA256"},
        {NULL, 0, 0, "zilog2"},
};
256
257 /*
258  * zio_checksum_verify: Provides support for checksum verification.
259  *
260  * Fletcher2, Fletcher4, and SHA256 are supported.
261  *
262  */
263 static int
264 zio_checksum_verify(zio_cksum_t zc, uint32_t checksum,
265                                         zfs_endian_t endian, char *buf, int size)
266 {
267         zio_eck_t *zec = (zio_eck_t *) (buf + size) - 1;
268         zio_checksum_info_t *ci = &zio_checksum_table[checksum];
269         zio_cksum_t actual_cksum, expected_cksum;
270
271         if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func == NULL) {
272                 printf("zfs unknown checksum function %d\n", checksum);
273                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
274         }
275
276         if (ci->ci_eck) {
277                 expected_cksum = zec->zec_cksum;
278                 zec->zec_cksum = zc;
279                 ci->ci_func(buf, size, endian, &actual_cksum);
280                 zec->zec_cksum = expected_cksum;
281                 zc = expected_cksum;
282         } else {
283                 ci->ci_func(buf, size, endian, &actual_cksum);
284         }
285
286         if ((actual_cksum.zc_word[0] != zc.zc_word[0])
287                 || (actual_cksum.zc_word[1] != zc.zc_word[1])
288                 || (actual_cksum.zc_word[2] != zc.zc_word[2])
289                 || (actual_cksum.zc_word[3] != zc.zc_word[3])) {
290                 return ZFS_ERR_BAD_FS;
291         }
292
293         return ZFS_ERR_NONE;
294 }
295
296 /*
297  * vdev_uberblock_compare takes two uberblock structures and returns an integer
298  * indicating the more recent of the two.
299  *      Return Value = 1 if ub2 is more recent
300  *      Return Value = -1 if ub1 is more recent
301  * The most recent uberblock is determined using its transaction number and
302  * timestamp.  The uberblock with the highest transaction number is
303  * considered "newer".  If the transaction numbers of the two blocks match, the
304  * timestamps are compared to determine the "newer" of the two.
305  */
306 static int
307 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
308 {
309         zfs_endian_t ub1_endian, ub2_endian;
310         if (zfs_to_cpu64(ub1->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC)
311                 ub1_endian = LITTLE_ENDIAN;
312         else
313                 ub1_endian = BIG_ENDIAN;
314         if (zfs_to_cpu64(ub2->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC)
315                 ub2_endian = LITTLE_ENDIAN;
316         else
317                 ub2_endian = BIG_ENDIAN;
318
319         if (zfs_to_cpu64(ub1->ub_txg, ub1_endian)
320                 < zfs_to_cpu64(ub2->ub_txg, ub2_endian))
321                 return -1;
322         if (zfs_to_cpu64(ub1->ub_txg, ub1_endian)
323                 > zfs_to_cpu64(ub2->ub_txg, ub2_endian))
324                 return 1;
325
326         if (zfs_to_cpu64(ub1->ub_timestamp, ub1_endian)
327                 < zfs_to_cpu64(ub2->ub_timestamp, ub2_endian))
328                 return -1;
329         if (zfs_to_cpu64(ub1->ub_timestamp, ub1_endian)
330                 > zfs_to_cpu64(ub2->ub_timestamp, ub2_endian))
331                 return 1;
332
333         return 0;
334 }
335
336 /*
337  * Three pieces of information are needed to verify an uberblock: the magic
338  * number, the version number, and the checksum.
339  *
340  * Currently Implemented: version number, magic number, label txg
341  * Need to Implement: checksum
342  *
343  */
344 static int
345 uberblock_verify(uberblock_t *uber, int offset, struct zfs_data *data)
346 {
347         int err;
348         zfs_endian_t endian = UNKNOWN_ENDIAN;
349         zio_cksum_t zc;
350
351         if (uber->ub_txg < data->label_txg) {
352                 debug("ignoring partially written label: uber_txg < label_txg %llu %llu\n",
353                           uber->ub_txg, data->label_txg);
354                 return ZFS_ERR_BAD_FS;
355         }
356
357         if (zfs_to_cpu64(uber->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC
358                 && zfs_to_cpu64(uber->ub_version, LITTLE_ENDIAN) > 0
359                 && zfs_to_cpu64(uber->ub_version, LITTLE_ENDIAN) <= SPA_VERSION)
360                 endian = LITTLE_ENDIAN;
361
362         if (zfs_to_cpu64(uber->ub_magic, BIG_ENDIAN) == UBERBLOCK_MAGIC
363                 && zfs_to_cpu64(uber->ub_version, BIG_ENDIAN) > 0
364                 && zfs_to_cpu64(uber->ub_version, BIG_ENDIAN) <= SPA_VERSION)
365                 endian = BIG_ENDIAN;
366
367         if (endian == UNKNOWN_ENDIAN) {
368                 printf("invalid uberblock magic\n");
369                 return ZFS_ERR_BAD_FS;
370         }
371
372         memset(&zc, 0, sizeof(zc));
373         zc.zc_word[0] = cpu_to_zfs64(offset, endian);
374         err = zio_checksum_verify(zc, ZIO_CHECKSUM_LABEL, endian,
375                                                           (char *) uber, UBERBLOCK_SIZE(data->vdev_ashift));
376
377         if (!err) {
378                 /* Check that the data pointed by the rootbp is usable. */
379                 void *osp = NULL;
380                 size_t ospsize;
381                 err = zio_read(&uber->ub_rootbp, endian, &osp, &ospsize, data);
382                 free(osp);
383
384                 if (!err && ospsize < OBJSET_PHYS_SIZE_V14) {
385                         printf("uberblock rootbp points to invalid data\n");
386                         return ZFS_ERR_BAD_FS;
387                 }
388         }
389
390         return err;
391 }
392
393 /*
394  * Find the best uberblock.
395  * Return:
396  *        Success - Pointer to the best uberblock.
397  *        Failure - NULL
398  */
399 static uberblock_t *find_bestub(char *ub_array, struct zfs_data *data)
400 {
401         const uint64_t sector = data->vdev_phys_sector;
402         uberblock_t *ubbest = NULL;
403         uberblock_t *ubnext;
404         unsigned int i, offset, pickedub = 0;
405         int err = ZFS_ERR_NONE;
406
407         const unsigned int UBCOUNT = UBERBLOCK_COUNT(data->vdev_ashift);
408         const uint64_t UBBYTES = UBERBLOCK_SIZE(data->vdev_ashift);
409
410         for (i = 0; i < UBCOUNT; i++) {
411                 ubnext = (uberblock_t *) (i * UBBYTES + ub_array);
412                 offset = (sector << SPA_MINBLOCKSHIFT) + VDEV_PHYS_SIZE + (i * UBBYTES);
413
414                 err = uberblock_verify(ubnext, offset, data);
415                 if (err)
416                         continue;
417
418                 if (ubbest == NULL || vdev_uberblock_compare(ubnext, ubbest) > 0) {
419                         ubbest = ubnext;
420                         pickedub = i;
421                 }
422         }
423
424         if (ubbest)
425                 debug("zfs Found best uberblock at idx %d, txg %llu\n",
426                           pickedub, (unsigned long long) ubbest->ub_txg);
427
428         return ubbest;
429 }
430
431 static inline size_t
432 get_psize(blkptr_t *bp, zfs_endian_t endian)
433 {
434         return (((zfs_to_cpu64((bp)->blk_prop, endian) >> 16) & 0xffff) + 1)
435                         << SPA_MINBLOCKSHIFT;
436 }
437
438 static uint64_t
439 dva_get_offset(dva_t *dva, zfs_endian_t endian)
440 {
441         return zfs_to_cpu64((dva)->dva_word[1],
442                                                          endian) << SPA_MINBLOCKSHIFT;
443 }
444
445 /*
446  * Read a block of data based on the gang block address dva,
447  * and put its data in buf.
448  *
449  */
450 static int
451 zio_read_gang(blkptr_t *bp, zfs_endian_t endian, dva_t *dva, void *buf,
452                           struct zfs_data *data)
453 {
454         zio_gbh_phys_t *zio_gb;
455         uint64_t offset, sector;
456         unsigned i;
457         int err;
458         zio_cksum_t zc;
459
460         memset(&zc, 0, sizeof(zc));
461
462         zio_gb = malloc(SPA_GANGBLOCKSIZE);
463         if (!zio_gb)
464                 return ZFS_ERR_OUT_OF_MEMORY;
465
466         offset = dva_get_offset(dva, endian);
467         sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
468
469         /* read in the gang block header */
470         err = zfs_devread(sector, 0, SPA_GANGBLOCKSIZE, (char *) zio_gb);
471
472         if (err) {
473                 free(zio_gb);
474                 return err;
475         }
476
477         /* XXX */
478         /* self checksuming the gang block header */
479         ZIO_SET_CHECKSUM(&zc, DVA_GET_VDEV(dva),
480                                          dva_get_offset(dva, endian), bp->blk_birth, 0);
481         err = zio_checksum_verify(zc, ZIO_CHECKSUM_GANG_HEADER, endian,
482                                                           (char *) zio_gb, SPA_GANGBLOCKSIZE);
483         if (err) {
484                 free(zio_gb);
485                 return err;
486         }
487
488         endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
489
490         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
491                 if (zio_gb->zg_blkptr[i].blk_birth == 0)
492                         continue;
493
494                 err = zio_read_data(&zio_gb->zg_blkptr[i], endian, buf, data);
495                 if (err) {
496                         free(zio_gb);
497                         return err;
498                 }
499                 buf = (char *) buf + get_psize(&zio_gb->zg_blkptr[i], endian);
500         }
501         free(zio_gb);
502         return ZFS_ERR_NONE;
503 }
504
505 /*
506  * Read in a block of raw data to buf.
507  */
508 static int
509 zio_read_data(blkptr_t *bp, zfs_endian_t endian, void *buf,
510                           struct zfs_data *data)
511 {
512         int i, psize;
513         int err = ZFS_ERR_NONE;
514
515         psize = get_psize(bp, endian);
516
517         /* pick a good dva from the block pointer */
518         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
519                 uint64_t offset, sector;
520
521                 if (bp->blk_dva[i].dva_word[0] == 0 && bp->blk_dva[i].dva_word[1] == 0)
522                         continue;
523
524                 if ((zfs_to_cpu64(bp->blk_dva[i].dva_word[1], endian)>>63) & 1) {
525                         err = zio_read_gang(bp, endian, &bp->blk_dva[i], buf, data);
526                 } else {
527                         /* read in a data block */
528                         offset = dva_get_offset(&bp->blk_dva[i], endian);
529                         sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
530
531                         err = zfs_devread(sector, 0, psize, buf);
532                 }
533
534                 if (!err) {
535                         /*Check the underlying checksum before we rule this DVA as "good"*/
536                         uint32_t checkalgo = (zfs_to_cpu64((bp)->blk_prop, endian) >> 40) & 0xff;
537
538                         err = zio_checksum_verify(bp->blk_cksum, checkalgo, endian, buf, psize);
539                         if (!err)
540                                 return ZFS_ERR_NONE;
541                 }
542
543                 /* If read failed or checksum bad, reset the error.      Hopefully we've got some more DVA's to try.*/
544         }
545
546         if (!err) {
547                 printf("couldn't find a valid DVA\n");
548                 err = ZFS_ERR_BAD_FS;
549         }
550
551         return err;
552 }
553
554 /*
555  * Read in a block of data, verify its checksum, decompress if needed,
556  * and put the uncompressed data in buf.
557  */
558 static int
559 zio_read(blkptr_t *bp, zfs_endian_t endian, void **buf,
560                  size_t *size, struct zfs_data *data)
561 {
562         size_t lsize, psize;
563         unsigned int comp;
564         char *compbuf = NULL;
565         int err;
566
567         *buf = NULL;
568
569         comp = (zfs_to_cpu64((bp)->blk_prop, endian)>>32) & 0xff;
570         lsize = (BP_IS_HOLE(bp) ? 0 :
571                          (((zfs_to_cpu64((bp)->blk_prop, endian) & 0xffff) + 1)
572                           << SPA_MINBLOCKSHIFT));
573         psize = get_psize(bp, endian);
574
575         if (size)
576                 *size = lsize;
577
578         if (comp >= ZIO_COMPRESS_FUNCTIONS) {
579                 printf("compression algorithm %u not supported\n", (unsigned int) comp);
580                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
581         }
582
583         if (comp != ZIO_COMPRESS_OFF && decomp_table[comp].decomp_func == NULL) {
584                 printf("compression algorithm %s not supported\n", decomp_table[comp].name);
585                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
586         }
587
588         if (comp != ZIO_COMPRESS_OFF) {
589                 compbuf = malloc(psize);
590                 if (!compbuf)
591                         return ZFS_ERR_OUT_OF_MEMORY;
592         } else {
593                 compbuf = *buf = malloc(lsize);
594         }
595
596         err = zio_read_data(bp, endian, compbuf, data);
597         if (err) {
598                 free(compbuf);
599                 *buf = NULL;
600                 return err;
601         }
602
603         if (comp != ZIO_COMPRESS_OFF) {
604                 *buf = malloc(lsize);
605                 if (!*buf) {
606                         free(compbuf);
607                         return ZFS_ERR_OUT_OF_MEMORY;
608                 }
609
610                 err = decomp_table[comp].decomp_func(compbuf, *buf, psize, lsize);
611                 free(compbuf);
612                 if (err) {
613                         free(*buf);
614                         *buf = NULL;
615                         return err;
616                 }
617         }
618
619         return ZFS_ERR_NONE;
620 }
621
622 /*
623  * Get the block from a block id.
624  * push the block onto the stack.
625  *
626  */
627 static int
628 dmu_read(dnode_end_t *dn, uint64_t blkid, void **buf,
629                  zfs_endian_t *endian_out, struct zfs_data *data)
630 {
631         int idx, level;
632         blkptr_t *bp_array = dn->dn.dn_blkptr;
633         int epbs = dn->dn.dn_indblkshift - SPA_BLKPTRSHIFT;
634         blkptr_t *bp;
635         void *tmpbuf = 0;
636         zfs_endian_t endian;
637         int err = ZFS_ERR_NONE;
638
639         bp = malloc(sizeof(blkptr_t));
640         if (!bp)
641                 return ZFS_ERR_OUT_OF_MEMORY;
642
643         endian = dn->endian;
644         for (level = dn->dn.dn_nlevels - 1; level >= 0; level--) {
645                 idx = (blkid >> (epbs * level)) & ((1 << epbs) - 1);
646                 *bp = bp_array[idx];
647                 if (bp_array != dn->dn.dn_blkptr) {
648                         free(bp_array);
649                         bp_array = 0;
650                 }
651
652                 if (BP_IS_HOLE(bp)) {
653                         size_t size = zfs_to_cpu16(dn->dn.dn_datablkszsec,
654                                                                                         dn->endian)
655                                 << SPA_MINBLOCKSHIFT;
656                         *buf = malloc(size);
657                         if (*buf) {
658                                 err = ZFS_ERR_OUT_OF_MEMORY;
659                                 break;
660                         }
661                         memset(*buf, 0, size);
662                         endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
663                         break;
664                 }
665                 if (level == 0) {
666                         err = zio_read(bp, endian, buf, 0, data);
667                         endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
668                         break;
669                 }
670                 err = zio_read(bp, endian, &tmpbuf, 0, data);
671                 endian = (zfs_to_cpu64(bp->blk_prop, endian) >> 63) & 1;
672                 if (err)
673                         break;
674                 bp_array = tmpbuf;
675         }
676         if (bp_array != dn->dn.dn_blkptr)
677                 free(bp_array);
678         if (endian_out)
679                 *endian_out = endian;
680
681         free(bp);
682         return err;
683 }
684
685 /*
686  * mzap_lookup: Looks up property described by "name" and returns the value
687  * in "value".
688  */
689 static int
690 mzap_lookup(mzap_phys_t *zapobj, zfs_endian_t endian,
691                         int objsize, char *name, uint64_t * value)
692 {
693         int i, chunks;
694         mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
695
696         chunks = objsize / MZAP_ENT_LEN - 1;
697         for (i = 0; i < chunks; i++) {
698                 if (strcmp(mzap_ent[i].mze_name, name) == 0) {
699                         *value = zfs_to_cpu64(mzap_ent[i].mze_value, endian);
700                         return ZFS_ERR_NONE;
701                 }
702         }
703
704         printf("couldn't find '%s'\n", name);
705         return ZFS_ERR_FILE_NOT_FOUND;
706 }
707
708 static int
709 mzap_iterate(mzap_phys_t *zapobj, zfs_endian_t endian, int objsize,
710                          int (*hook)(const char *name,
711                                                  uint64_t val,
712                                                  struct zfs_data *data),
713                          struct zfs_data *data)
714 {
715         int i, chunks;
716         mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
717
718         chunks = objsize / MZAP_ENT_LEN - 1;
719         for (i = 0; i < chunks; i++) {
720                 if (hook(mzap_ent[i].mze_name,
721                                  zfs_to_cpu64(mzap_ent[i].mze_value, endian),
722                                  data))
723                         return 1;
724         }
725
726         return 0;
727 }
728
729 static uint64_t
730 zap_hash(uint64_t salt, const char *name)
731 {
732         static uint64_t table[256];
733         const uint8_t *cp;
734         uint8_t c;
735         uint64_t crc = salt;
736
737         if (table[128] == 0) {
738                 uint64_t *ct = NULL;
739                 int i, j;
740                 for (i = 0; i < 256; i++) {
741                         for (ct = table + i, *ct = i, j = 8; j > 0; j--)
742                                 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
743                 }
744         }
745
746         for (cp = (const uint8_t *) name; (c = *cp) != '\0'; cp++)
747                 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
748
749         /*
750          * Only use 28 bits, since we need 4 bits in the cookie for the
751          * collision differentiator.  We MUST use the high bits, since
752          * those are the onces that we first pay attention to when
753          * chosing the bucket.
754          */
755         crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
756
757         return crc;
758 }
759
760 /*
761  * Only to be used on 8-bit arrays.
762  * array_len is actual len in bytes (not encoded le_value_length).
763  * buf is null-terminated.
764  */
765 /* XXX */
766 static int
767 zap_leaf_array_equal(zap_leaf_phys_t *l, zfs_endian_t endian,
768                                          int blksft, int chunk, int array_len, const char *buf)
769 {
770         int bseen = 0;
771
772         while (bseen < array_len) {
773                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
774                 int toread = min(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
775
776                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
777                         return 0;
778
779                 if (memcmp(la->la_array, buf + bseen, toread) != 0)
780                         break;
781                 chunk = zfs_to_cpu16(la->la_next, endian);
782                 bseen += toread;
783         }
784         return (bseen == array_len);
785 }
786
787 /* XXX */
788 static int
789 zap_leaf_array_get(zap_leaf_phys_t *l, zfs_endian_t endian, int blksft,
790                                    int chunk, int array_len, char *buf)
791 {
792         int bseen = 0;
793
794         while (bseen < array_len) {
795                 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
796                 int toread = min(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
797
798                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
799                         /* Don't use errno because this error is to be ignored.  */
800                         return ZFS_ERR_BAD_FS;
801
802                 memcpy(buf + bseen, la->la_array,  toread);
803                 chunk = zfs_to_cpu16(la->la_next, endian);
804                 bseen += toread;
805         }
806         return ZFS_ERR_NONE;
807 }
808
809
810 /*
811  * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the
812  * value for the property "name".
813  *
814  */
815 /* XXX */
816 static int
817 zap_leaf_lookup(zap_leaf_phys_t *l, zfs_endian_t endian,
818                                 int blksft, uint64_t h,
819                                 const char *name, uint64_t *value)
820 {
821         uint16_t chunk;
822         struct zap_leaf_entry *le;
823
824         /* Verify if this is a valid leaf block */
825         if (zfs_to_cpu64(l->l_hdr.lh_block_type, endian) != ZBT_LEAF) {
826                 printf("invalid leaf type\n");
827                 return ZFS_ERR_BAD_FS;
828         }
829         if (zfs_to_cpu32(l->l_hdr.lh_magic, endian) != ZAP_LEAF_MAGIC) {
830                 printf("invalid leaf magic\n");
831                 return ZFS_ERR_BAD_FS;
832         }
833
834         for (chunk = zfs_to_cpu16(l->l_hash[LEAF_HASH(blksft, h)], endian);
835                  chunk != CHAIN_END; chunk = le->le_next) {
836
837                 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft)) {
838                         printf("invalid chunk number\n");
839                         return ZFS_ERR_BAD_FS;
840                 }
841
842                 le = ZAP_LEAF_ENTRY(l, blksft, chunk);
843
844                 /* Verify the chunk entry */
845                 if (le->le_type != ZAP_CHUNK_ENTRY) {
846                         printf("invalid chunk entry\n");
847                         return ZFS_ERR_BAD_FS;
848                 }
849
850                 if (zfs_to_cpu64(le->le_hash, endian) != h)
851                         continue;
852
853                 if (zap_leaf_array_equal(l, endian, blksft,
854                                                                  zfs_to_cpu16(le->le_name_chunk, endian),
855                                                                  zfs_to_cpu16(le->le_name_length, endian),
856                                                                  name)) {
857                         struct zap_leaf_array *la;
858
859                         if (le->le_int_size != 8 || le->le_value_length != 1) {
860                                 printf("invalid leaf chunk entry\n");
861                                 return ZFS_ERR_BAD_FS;
862                         }
863                         /* get the uint64_t property value */
864                         la = &ZAP_LEAF_CHUNK(l, blksft, le->le_value_chunk).l_array;
865
866                         *value = be64_to_cpu(la->la_array64);
867
868                         return ZFS_ERR_NONE;
869                 }
870         }
871
872         printf("couldn't find '%s'\n", name);
873         return ZFS_ERR_FILE_NOT_FOUND;
874 }
875
876
877 /* Verify if this is a fat zap header block */
878 static int
879 zap_verify(zap_phys_t *zap)
880 {
881         if (zap->zap_magic != (uint64_t) ZAP_MAGIC) {
882                 printf("bad ZAP magic\n");
883                 return ZFS_ERR_BAD_FS;
884         }
885
886         if (zap->zap_flags != 0) {
887                 printf("bad ZAP flags\n");
888                 return ZFS_ERR_BAD_FS;
889         }
890
891         if (zap->zap_salt == 0) {
892                 printf("bad ZAP salt\n");
893                 return ZFS_ERR_BAD_FS;
894         }
895
896         return ZFS_ERR_NONE;
897 }
898
899 /*
900  * Fat ZAP lookup
901  *
902  */
903 /* XXX */
904 static int
905 fzap_lookup(dnode_end_t *zap_dnode, zap_phys_t *zap,
906                         char *name, uint64_t *value, struct zfs_data *data)
907 {
908         void *l;
909         uint64_t hash, idx, blkid;
910         int blksft = zfs_log2(zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
911                                                                                         zap_dnode->endian) << DNODE_SHIFT);
912         int err;
913         zfs_endian_t leafendian;
914
915         err = zap_verify(zap);
916         if (err)
917                 return err;
918
919         hash = zap_hash(zap->zap_salt, name);
920
921         /* get block id from index */
922         if (zap->zap_ptrtbl.zt_numblks != 0) {
923                 printf("external pointer tables not supported\n");
924                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
925         }
926         idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
927         blkid = ((uint64_t *) zap)[idx + (1 << (blksft - 3 - 1))];
928
929         /* Get the leaf block */
930         if ((1U << blksft) < sizeof(zap_leaf_phys_t)) {
931                 printf("ZAP leaf is too small\n");
932                 return ZFS_ERR_BAD_FS;
933         }
934         err = dmu_read(zap_dnode, blkid, &l, &leafendian, data);
935         if (err)
936                 return err;
937
938         err = zap_leaf_lookup(l, leafendian, blksft, hash, name, value);
939         free(l);
940         return err;
941 }
942
943 /* XXX */
944 static int
945 fzap_iterate(dnode_end_t *zap_dnode, zap_phys_t *zap,
946                          int (*hook)(const char *name,
947                                                  uint64_t val,
948                                                  struct zfs_data *data),
949                          struct zfs_data *data)
950 {
951         zap_leaf_phys_t *l;
952         void *l_in;
953         uint64_t idx, blkid;
954         uint16_t chunk;
955         int blksft = zfs_log2(zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
956                                                                                         zap_dnode->endian) << DNODE_SHIFT);
957         int err;
958         zfs_endian_t endian;
959
960         if (zap_verify(zap))
961                 return 0;
962
963         /* get block id from index */
964         if (zap->zap_ptrtbl.zt_numblks != 0) {
965                 printf("external pointer tables not supported\n");
966                 return 0;
967         }
968         /* Get the leaf block */
969         if ((1U << blksft) < sizeof(zap_leaf_phys_t)) {
970                 printf("ZAP leaf is too small\n");
971                 return 0;
972         }
973         for (idx = 0; idx < zap->zap_ptrtbl.zt_numblks; idx++) {
974                 blkid = ((uint64_t *) zap)[idx + (1 << (blksft - 3 - 1))];
975
976                 err = dmu_read(zap_dnode, blkid, &l_in, &endian, data);
977                 l = l_in;
978                 if (err)
979                         continue;
980
981                 /* Verify if this is a valid leaf block */
982                 if (zfs_to_cpu64(l->l_hdr.lh_block_type, endian) != ZBT_LEAF) {
983                         free(l);
984                         continue;
985                 }
986                 if (zfs_to_cpu32(l->l_hdr.lh_magic, endian) != ZAP_LEAF_MAGIC) {
987                         free(l);
988                         continue;
989                 }
990
991                 for (chunk = 0; chunk < ZAP_LEAF_NUMCHUNKS(blksft); chunk++) {
992                         char *buf;
993                         struct zap_leaf_array *la;
994                         struct zap_leaf_entry *le;
995                         uint64_t val;
996                         le = ZAP_LEAF_ENTRY(l, blksft, chunk);
997
998                         /* Verify the chunk entry */
999                         if (le->le_type != ZAP_CHUNK_ENTRY)
1000                                 continue;
1001
1002                         buf = malloc(zfs_to_cpu16(le->le_name_length, endian)
1003                                                  + 1);
1004                         if (zap_leaf_array_get(l, endian, blksft, le->le_name_chunk,
1005                                                                    le->le_name_length, buf)) {
1006                                 free(buf);
1007                                 continue;
1008                         }
1009                         buf[le->le_name_length] = 0;
1010
1011                         if (le->le_int_size != 8
1012                                 || zfs_to_cpu16(le->le_value_length, endian) != 1)
1013                                 continue;
1014
1015                         /* get the uint64_t property value */
1016                         la = &ZAP_LEAF_CHUNK(l, blksft, le->le_value_chunk).l_array;
1017                         val = be64_to_cpu(la->la_array64);
1018                         if (hook(buf, val, data))
1019                                 return 1;
1020                         free(buf);
1021                 }
1022         }
1023         return 0;
1024 }
1025
1026
1027 /*
1028  * Read in the data of a zap object and find the value for a matching
1029  * property name.
1030  *
1031  */
1032 static int
1033 zap_lookup(dnode_end_t *zap_dnode, char *name, uint64_t *val,
1034                    struct zfs_data *data)
1035 {
1036         uint64_t block_type;
1037         int size;
1038         void *zapbuf;
1039         int err;
1040         zfs_endian_t endian;
1041
1042         /* Read in the first block of the zap object data. */
1043         size = zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec,
1044                                                          zap_dnode->endian) << SPA_MINBLOCKSHIFT;
1045         err = dmu_read(zap_dnode, 0, &zapbuf, &endian, data);
1046         if (err)
1047                 return err;
1048         block_type = zfs_to_cpu64(*((uint64_t *) zapbuf), endian);
1049
1050         if (block_type == ZBT_MICRO) {
1051                 err = (mzap_lookup(zapbuf, endian, size, name, val));
1052                 free(zapbuf);
1053                 return err;
1054         } else if (block_type == ZBT_HEADER) {
1055                 /* this is a fat zap */
1056                 err = (fzap_lookup(zap_dnode, zapbuf, name, val, data));
1057                 free(zapbuf);
1058                 return err;
1059         }
1060
1061         printf("unknown ZAP type\n");
1062         free(zapbuf);
1063         return ZFS_ERR_BAD_FS;
1064 }
1065
1066 static int
1067 zap_iterate(dnode_end_t *zap_dnode,
1068                         int (*hook)(const char *name, uint64_t val,
1069                                                 struct zfs_data *data),
1070                         struct zfs_data *data)
1071 {
1072         uint64_t block_type;
1073         int size;
1074         void *zapbuf;
1075         int err;
1076         int ret;
1077         zfs_endian_t endian;
1078
1079         /* Read in the first block of the zap object data. */
1080         size = zfs_to_cpu16(zap_dnode->dn.dn_datablkszsec, zap_dnode->endian) << SPA_MINBLOCKSHIFT;
1081         err = dmu_read(zap_dnode, 0, &zapbuf, &endian, data);
1082         if (err)
1083                 return 0;
1084         block_type = zfs_to_cpu64(*((uint64_t *) zapbuf), endian);
1085
1086         if (block_type == ZBT_MICRO) {
1087                 ret = mzap_iterate(zapbuf, endian, size, hook, data);
1088                 free(zapbuf);
1089                 return ret;
1090         } else if (block_type == ZBT_HEADER) {
1091                 /* this is a fat zap */
1092                 ret = fzap_iterate(zap_dnode, zapbuf, hook, data);
1093                 free(zapbuf);
1094                 return ret;
1095         }
1096         printf("unknown ZAP type\n");
1097         free(zapbuf);
1098         return 0;
1099 }
1100
1101
1102 /*
1103  * Get the dnode of an object number from the metadnode of an object set.
1104  *
1105  * Input
1106  *      mdn - metadnode to get the object dnode
1107  *      objnum - object number for the object dnode
1108  *      buf - data buffer that holds the returning dnode
1109  */
1110 static int
1111 dnode_get(dnode_end_t *mdn, uint64_t objnum, uint8_t type,
1112                   dnode_end_t *buf, struct zfs_data *data)
1113 {
1114         uint64_t blkid, blksz;  /* the block id this object dnode is in */
1115         int epbs;                       /* shift of number of dnodes in a block */
1116         int idx;                        /* index within a block */
1117         void *dnbuf;
1118         int err;
1119         zfs_endian_t endian;
1120
1121         blksz = zfs_to_cpu16(mdn->dn.dn_datablkszsec,
1122                                                           mdn->endian) << SPA_MINBLOCKSHIFT;
1123
1124         epbs = zfs_log2(blksz) - DNODE_SHIFT;
1125         blkid = objnum >> epbs;
1126         idx = objnum & ((1 << epbs) - 1);
1127
1128         if (data->dnode_buf != NULL && memcmp(data->dnode_mdn, mdn,
1129                                                                                   sizeof(*mdn)) == 0
1130                 && objnum >= data->dnode_start && objnum < data->dnode_end) {
1131                 memmove(&(buf->dn), &(data->dnode_buf)[idx], DNODE_SIZE);
1132                 buf->endian = data->dnode_endian;
1133                 if (type && buf->dn.dn_type != type)  {
1134                         printf("incorrect dnode type: %02X != %02x\n", buf->dn.dn_type, type);
1135                         return ZFS_ERR_BAD_FS;
1136                 }
1137                 return ZFS_ERR_NONE;
1138         }
1139
1140         err = dmu_read(mdn, blkid, &dnbuf, &endian, data);
1141         if (err)
1142                 return err;
1143
1144         free(data->dnode_buf);
1145         free(data->dnode_mdn);
1146         data->dnode_mdn = malloc(sizeof(*mdn));
1147         if (!data->dnode_mdn) {
1148                 data->dnode_buf = 0;
1149         } else {
1150                 memcpy(data->dnode_mdn, mdn, sizeof(*mdn));
1151                 data->dnode_buf = dnbuf;
1152                 data->dnode_start = blkid << epbs;
1153                 data->dnode_end = (blkid + 1) << epbs;
1154                 data->dnode_endian = endian;
1155         }
1156
1157         memmove(&(buf->dn), (dnode_phys_t *) dnbuf + idx, DNODE_SIZE);
1158         buf->endian = endian;
1159         if (type && buf->dn.dn_type != type) {
1160                 printf("incorrect dnode type\n");
1161                 return ZFS_ERR_BAD_FS;
1162         }
1163
1164         return ZFS_ERR_NONE;
1165 }
1166
1167 /*
1168  * Get the file dnode for a given file name where mdn is the meta dnode
1169  * for this ZFS object set. When found, place the file dnode in dn.
1170  * The 'path' argument will be mangled.
1171  *
1172  */
1173 static int
1174 dnode_get_path(dnode_end_t *mdn, const char *path_in, dnode_end_t *dn,
1175                            struct zfs_data *data)
1176 {
1177         uint64_t objnum, version;
1178         char *cname, ch;
1179         int err = ZFS_ERR_NONE;
1180         char *path, *path_buf;
1181         struct dnode_chain {
1182                 struct dnode_chain *next;
1183                 dnode_end_t dn;
1184         };
1185         struct dnode_chain *dnode_path = 0, *dn_new, *root;
1186
1187         dn_new = malloc(sizeof(*dn_new));
1188         if (!dn_new)
1189                 return ZFS_ERR_OUT_OF_MEMORY;
1190         dn_new->next = 0;
1191         dnode_path = root = dn_new;
1192
1193         err = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
1194                                         &(dnode_path->dn), data);
1195         if (err) {
1196                 free(dn_new);
1197                 return err;
1198         }
1199
1200         err = zap_lookup(&(dnode_path->dn), ZPL_VERSION_STR, &version, data);
1201         if (err) {
1202                 free(dn_new);
1203                 return err;
1204         }
1205         if (version > ZPL_VERSION) {
1206                 free(dn_new);
1207                 printf("too new ZPL version\n");
1208                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
1209         }
1210
1211         err = zap_lookup(&(dnode_path->dn), ZFS_ROOT_OBJ, &objnum, data);
1212         if (err) {
1213                 free(dn_new);
1214                 return err;
1215         }
1216
1217         err = dnode_get(mdn, objnum, 0, &(dnode_path->dn), data);
1218         if (err) {
1219                 free(dn_new);
1220                 return err;
1221         }
1222
1223         path = path_buf = strdup(path_in);
1224         if (!path_buf) {
1225                 free(dn_new);
1226                 return ZFS_ERR_OUT_OF_MEMORY;
1227         }
1228
1229         while (1) {
1230                 /* skip leading slashes */
1231                 while (*path == '/')
1232                         path++;
1233                 if (!*path)
1234                         break;
1235                 /* get the next component name */
1236                 cname = path;
1237                 while (*path && *path != '/')
1238                         path++;
1239                 /* Skip dot.  */
1240                 if (cname + 1 == path && cname[0] == '.')
1241                         continue;
1242                 /* Handle double dot.  */
1243                 if (cname + 2 == path && cname[0] == '.' && cname[1] == '.')  {
1244                         if (dn_new->next) {
1245                                 dn_new = dnode_path;
1246                                 dnode_path = dn_new->next;
1247                                 free(dn_new);
1248                         } else {
1249                                 printf("can't resolve ..\n");
1250                                 err = ZFS_ERR_FILE_NOT_FOUND;
1251                                 break;
1252                         }
1253                         continue;
1254                 }
1255
1256                 ch = *path;
1257                 *path = 0;              /* ensure null termination */
1258
1259                 if (dnode_path->dn.dn.dn_type != DMU_OT_DIRECTORY_CONTENTS) {
1260                         free(path_buf);
1261                         printf("not a directory\n");
1262                         return ZFS_ERR_BAD_FILE_TYPE;
1263                 }
1264                 err = zap_lookup(&(dnode_path->dn), cname, &objnum, data);
1265                 if (err)
1266                         break;
1267
1268                 dn_new = malloc(sizeof(*dn_new));
1269                 if (!dn_new) {
1270                         err = ZFS_ERR_OUT_OF_MEMORY;
1271                         break;
1272                 }
1273                 dn_new->next = dnode_path;
1274                 dnode_path = dn_new;
1275
1276                 objnum = ZFS_DIRENT_OBJ(objnum);
1277                 err = dnode_get(mdn, objnum, 0, &(dnode_path->dn), data);
1278                 if (err)
1279                         break;
1280
1281                 *path = ch;
1282         }
1283
1284         if (!err)
1285                 memcpy(dn, &(dnode_path->dn), sizeof(*dn));
1286
1287         while (dnode_path) {
1288                 dn_new = dnode_path->next;
1289                 free(dnode_path);
1290                 dnode_path = dn_new;
1291         }
1292         free(path_buf);
1293         return err;
1294 }
1295
1296
1297 /*
1298  * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1299  * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1300  * of pool/rootfs.
1301  *
1302  * If no fsname and no obj are given, return the DSL_DIR metadnode.
1303  * If fsname is given, return its metadnode and its matching object number.
1304  * If only obj is given, return the metadnode for this object number.
1305  *
1306  */
1307 static int
1308 get_filesystem_dnode(dnode_end_t *mosmdn, char *fsname,
1309                                          dnode_end_t *mdn, struct zfs_data *data)
1310 {
1311         uint64_t objnum;
1312         int err;
1313
1314         err = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1315                                         DMU_OT_OBJECT_DIRECTORY, mdn, data);
1316         if (err)
1317                 return err;
1318
1319         err = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum, data);
1320         if (err)
1321                 return err;
1322
1323         err = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, data);
1324         if (err)
1325                 return err;
1326
1327         while (*fsname) {
1328                 uint64_t childobj;
1329                 char *cname, ch;
1330
1331                 while (*fsname == '/')
1332                         fsname++;
1333
1334                 if (!*fsname || *fsname == '@')
1335                         break;
1336
1337                 cname = fsname;
1338                 while (*fsname && !isspace(*fsname) && *fsname != '/')
1339                         fsname++;
1340                 ch = *fsname;
1341                 *fsname = 0;
1342
1343                 childobj = zfs_to_cpu64((((dsl_dir_phys_t *) DN_BONUS(&mdn->dn)))->dd_child_dir_zapobj, mdn->endian);
1344                 err = dnode_get(mosmdn, childobj,
1345                                                 DMU_OT_DSL_DIR_CHILD_MAP, mdn, data);
1346                 if (err)
1347                         return err;
1348
1349                 err = zap_lookup(mdn, cname, &objnum, data);
1350                 if (err)
1351                         return err;
1352
1353                 err = dnode_get(mosmdn, objnum, DMU_OT_DSL_DIR, mdn, data);
1354                 if (err)
1355                         return err;
1356
1357                 *fsname = ch;
1358         }
1359         return ZFS_ERR_NONE;
1360 }
1361
1362 static int
1363 make_mdn(dnode_end_t *mdn, struct zfs_data *data)
1364 {
1365         void *osp;
1366         blkptr_t *bp;
1367         size_t ospsize;
1368         int err;
1369
1370         bp = &(((dsl_dataset_phys_t *) DN_BONUS(&mdn->dn))->ds_bp);
1371         err = zio_read(bp, mdn->endian, &osp, &ospsize, data);
1372         if (err)
1373                 return err;
1374         if (ospsize < OBJSET_PHYS_SIZE_V14) {
1375                 free(osp);
1376                 printf("too small osp\n");
1377                 return ZFS_ERR_BAD_FS;
1378         }
1379
1380         mdn->endian = (zfs_to_cpu64(bp->blk_prop, mdn->endian)>>63) & 1;
1381         memmove((char *) &(mdn->dn),
1382                         (char *) &((objset_phys_t *) osp)->os_meta_dnode, DNODE_SIZE);
1383         free(osp);
1384         return ZFS_ERR_NONE;
1385 }
1386
1387 static int
1388 dnode_get_fullpath(const char *fullpath, dnode_end_t *mdn,
1389                                    uint64_t *mdnobj, dnode_end_t *dn, int *isfs,
1390                                    struct zfs_data *data)
1391 {
1392         char *fsname, *snapname;
1393         const char *ptr_at, *filename;
1394         uint64_t headobj;
1395         int err;
1396
1397         ptr_at = strchr(fullpath, '@');
1398         if (!ptr_at) {
1399                 *isfs = 1;
1400                 filename = 0;
1401                 snapname = 0;
1402                 fsname = strdup(fullpath);
1403         } else {
1404                 const char *ptr_slash = strchr(ptr_at, '/');
1405
1406                 *isfs = 0;
1407                 fsname = malloc(ptr_at - fullpath + 1);
1408                 if (!fsname)
1409                         return ZFS_ERR_OUT_OF_MEMORY;
1410                 memcpy(fsname, fullpath, ptr_at - fullpath);
1411                 fsname[ptr_at - fullpath] = 0;
1412                 if (ptr_at[1] && ptr_at[1] != '/') {
1413                         snapname = malloc(ptr_slash - ptr_at);
1414                         if (!snapname) {
1415                                 free(fsname);
1416                                 return ZFS_ERR_OUT_OF_MEMORY;
1417                         }
1418                         memcpy(snapname, ptr_at + 1, ptr_slash - ptr_at - 1);
1419                         snapname[ptr_slash - ptr_at - 1] = 0;
1420                 } else {
1421                         snapname = 0;
1422                 }
1423                 if (ptr_slash)
1424                         filename = ptr_slash;
1425                 else
1426                         filename = "/";
1427                 printf("zfs fsname = '%s' snapname='%s' filename = '%s'\n",
1428                            fsname, snapname, filename);
1429         }
1430
1431
1432         err = get_filesystem_dnode(&(data->mos), fsname, dn, data);
1433
1434         if (err) {
1435                 free(fsname);
1436                 free(snapname);
1437                 return err;
1438         }
1439
1440         headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&dn->dn))->dd_head_dataset_obj, dn->endian);
1441
1442         err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, mdn, data);
1443         if (err) {
1444                 free(fsname);
1445                 free(snapname);
1446                 return err;
1447         }
1448
1449         if (snapname) {
1450                 uint64_t snapobj;
1451
1452                 snapobj = zfs_to_cpu64(((dsl_dataset_phys_t *) DN_BONUS(&mdn->dn))->ds_snapnames_zapobj, mdn->endian);
1453
1454                 err = dnode_get(&(data->mos), snapobj,
1455                                                 DMU_OT_DSL_DS_SNAP_MAP, mdn, data);
1456                 if (!err)
1457                         err = zap_lookup(mdn, snapname, &headobj, data);
1458                 if (!err)
1459                         err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, mdn, data);
1460                 if (err) {
1461                         free(fsname);
1462                         free(snapname);
1463                         return err;
1464                 }
1465         }
1466
1467         if (mdnobj)
1468                 *mdnobj = headobj;
1469
1470         make_mdn(mdn, data);
1471
1472         if (*isfs) {
1473                 free(fsname);
1474                 free(snapname);
1475                 return ZFS_ERR_NONE;
1476         }
1477         err = dnode_get_path(mdn, filename, dn, data);
1478         free(fsname);
1479         free(snapname);
1480         return err;
1481 }
1482
1483 /*
1484  * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1485  *
1486  * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1487  *
1488  *              encoding method/host endian             (4 bytes)
1489  *              nvl_version                                             (4 bytes)
1490  *              nvl_nvflag                                              (4 bytes)
1491  *      encoded nvpairs:
1492  *              encoded size of the nvpair              (4 bytes)
1493  *              decoded size of the nvpair              (4 bytes)
1494  *              name string size                                (4 bytes)
1495  *              name string data                                (sizeof(NV_ALIGN4(string))
1496  *              data type                                               (4 bytes)
1497  *              # of elements in the nvpair             (4 bytes)
1498  *              data
1499  *              2 zero's for the last nvpair
1500  *              (end of the entire list)        (8 bytes)
1501  *
1502  */
1503
1504 static int
1505 nvlist_find_value(char *nvlist, char *name, int valtype, char **val,
1506                                   size_t *size_out, size_t *nelm_out)
1507 {
1508         int name_len, type, encode_size;
1509         char *nvpair, *nvp_name;
1510
1511         /* Verify if the 1st and 2nd byte in the nvlist are valid. */
1512         /* NOTE: independently of what endianness header announces all
1513            subsequent values are big-endian.  */
1514         if (nvlist[0] != NV_ENCODE_XDR || (nvlist[1] != NV_LITTLE_ENDIAN
1515                                                                            && nvlist[1] != NV_BIG_ENDIAN)) {
1516                 printf("zfs incorrect nvlist header\n");
1517                 return ZFS_ERR_BAD_FS;
1518         }
1519
1520         /* skip the header, nvl_version, and nvl_nvflag */
1521         nvlist = nvlist + 4 * 3;
1522         /*
1523          * Loop thru the nvpair list
1524          * The XDR representation of an integer is in big-endian byte order.
1525          */
1526         while ((encode_size = be32_to_cpu(*(uint32_t *) nvlist))) {
1527                 int nelm;
1528
1529                 nvpair = nvlist + 4 * 2;        /* skip the encode/decode size */
1530
1531                 name_len = be32_to_cpu(*(uint32_t *) nvpair);
1532                 nvpair += 4;
1533
1534                 nvp_name = nvpair;
1535                 nvpair = nvpair + ((name_len + 3) & ~3);        /* align */
1536
1537                 type = be32_to_cpu(*(uint32_t *) nvpair);
1538                 nvpair += 4;
1539
1540                 nelm = be32_to_cpu(*(uint32_t *) nvpair);
1541                 if (nelm < 1) {
1542                         printf("empty nvpair\n");
1543                         return ZFS_ERR_BAD_FS;
1544                 }
1545
1546                 nvpair += 4;
1547
1548                 if ((strncmp(nvp_name, name, name_len) == 0) && type == valtype) {
1549                         *val = nvpair;
1550                         *size_out = encode_size;
1551                         if (nelm_out)
1552                                 *nelm_out = nelm;
1553                         return 1;
1554                 }
1555
1556                 nvlist += encode_size;  /* goto the next nvpair */
1557         }
1558         return 0;
1559 }
1560
1561 int
1562 zfs_nvlist_lookup_uint64(char *nvlist, char *name, uint64_t *out)
1563 {
1564         char *nvpair;
1565         size_t size;
1566         int found;
1567
1568         found = nvlist_find_value(nvlist, name, DATA_TYPE_UINT64, &nvpair, &size, 0);
1569         if (!found)
1570                 return 0;
1571         if (size < sizeof(uint64_t)) {
1572                 printf("invalid uint64\n");
1573                 return ZFS_ERR_BAD_FS;
1574         }
1575
1576         *out = be64_to_cpu(*(uint64_t *) nvpair);
1577         return 1;
1578 }
1579
1580 char *
1581 zfs_nvlist_lookup_string(char *nvlist, char *name)
1582 {
1583         char *nvpair;
1584         char *ret;
1585         size_t slen;
1586         size_t size;
1587         int found;
1588
1589         found = nvlist_find_value(nvlist, name, DATA_TYPE_STRING, &nvpair, &size, 0);
1590         if (!found)
1591                 return 0;
1592         if (size < 4) {
1593                 printf("invalid string\n");
1594                 return 0;
1595         }
1596         slen = be32_to_cpu(*(uint32_t *) nvpair);
1597         if (slen > size - 4)
1598                 slen = size - 4;
1599         ret = malloc(slen + 1);
1600         if (!ret)
1601                 return 0;
1602         memcpy(ret, nvpair + 4, slen);
1603         ret[slen] = 0;
1604         return ret;
1605 }
1606
1607 char *
1608 zfs_nvlist_lookup_nvlist(char *nvlist, char *name)
1609 {
1610         char *nvpair;
1611         char *ret;
1612         size_t size;
1613         int found;
1614
1615         found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1616                                                           &size, 0);
1617         if (!found)
1618                 return 0;
1619         ret = calloc(1, size + 3 * sizeof(uint32_t));
1620         if (!ret)
1621                 return 0;
1622         memcpy(ret, nvlist, sizeof(uint32_t));
1623
1624         memcpy(ret + sizeof(uint32_t), nvpair, size);
1625         return ret;
1626 }
1627
1628 int
1629 zfs_nvlist_lookup_nvlist_array_get_nelm(char *nvlist, char *name)
1630 {
1631         char *nvpair;
1632         size_t nelm, size;
1633         int found;
1634
1635         found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1636                                                           &size, &nelm);
1637         if (!found)
1638                 return -1;
1639         return nelm;
1640 }
1641
1642 char *
1643 zfs_nvlist_lookup_nvlist_array(char *nvlist, char *name,
1644                                                                         size_t index)
1645 {
1646         char *nvpair, *nvpairptr;
1647         int found;
1648         char *ret;
1649         size_t size;
1650         unsigned i;
1651         size_t nelm;
1652
1653         found = nvlist_find_value(nvlist, name, DATA_TYPE_NVLIST, &nvpair,
1654                                                           &size, &nelm);
1655         if (!found)
1656                 return 0;
1657         if (index >= nelm) {
1658                 printf("trying to lookup past nvlist array\n");
1659                 return 0;
1660         }
1661
1662         nvpairptr = nvpair;
1663
1664         for (i = 0; i < index; i++) {
1665                 uint32_t encode_size;
1666
1667                 /* skip the header, nvl_version, and nvl_nvflag */
1668                 nvpairptr = nvpairptr + 4 * 2;
1669
1670                 while (nvpairptr < nvpair + size
1671                            && (encode_size = be32_to_cpu(*(uint32_t *) nvpairptr)))
1672                         nvlist += encode_size;  /* goto the next nvpair */
1673
1674                 nvlist = nvlist + 4 * 2;        /* skip the ending 2 zeros - 8 bytes */
1675         }
1676
1677         if (nvpairptr >= nvpair + size
1678                 || nvpairptr + be32_to_cpu(*(uint32_t *) (nvpairptr + 4 * 2))
1679                 >= nvpair + size) {
1680                 printf("incorrect nvlist array\n");
1681                 return 0;
1682         }
1683
1684         ret = calloc(1, be32_to_cpu(*(uint32_t *) (nvpairptr + 4 * 2))
1685                                  + 3 * sizeof(uint32_t));
1686         if (!ret)
1687                 return 0;
1688         memcpy(ret, nvlist, sizeof(uint32_t));
1689
1690         memcpy(ret + sizeof(uint32_t), nvpairptr, size);
1691         return ret;
1692 }
1693
1694 static int
1695 int_zfs_fetch_nvlist(struct zfs_data *data, char **nvlist)
1696 {
1697         int err;
1698
1699         *nvlist = malloc(VDEV_PHYS_SIZE);
1700         /* Read in the vdev name-value pair list (112K). */
1701         err = zfs_devread(data->vdev_phys_sector, 0, VDEV_PHYS_SIZE, *nvlist);
1702         if (err) {
1703                 free(*nvlist);
1704                 *nvlist = 0;
1705                 return err;
1706         }
1707         return ZFS_ERR_NONE;
1708 }
1709
1710 /*
1711  * Check the disk label information and retrieve needed vdev name-value pairs.
1712  *
1713  */
1714 static int
1715 check_pool_label(struct zfs_data *data)
1716 {
1717         uint64_t pool_state;
1718         char *nvlist;                   /* for the pool */
1719         char *vdevnvlist;               /* for the vdev */
1720         uint64_t diskguid;
1721         uint64_t version;
1722         int found;
1723         int err;
1724
1725         err = int_zfs_fetch_nvlist(data, &nvlist);
1726         if (err)
1727                 return err;
1728
1729         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_STATE,
1730                                                                                   &pool_state);
1731         if (!found) {
1732                 free(nvlist);
1733                 printf("zfs pool state not found\n");
1734                 return ZFS_ERR_BAD_FS;
1735         }
1736
1737         if (pool_state == POOL_STATE_DESTROYED) {
1738                 free(nvlist);
1739                 printf("zpool is marked as destroyed\n");
1740                 return ZFS_ERR_BAD_FS;
1741         }
1742
1743         data->label_txg = 0;
1744         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_TXG,
1745                                                                                   &data->label_txg);
1746         if (!found) {
1747                 free(nvlist);
1748                 printf("zfs pool txg not found\n");
1749                 return ZFS_ERR_BAD_FS;
1750         }
1751
1752         /* not an active device */
1753         if (data->label_txg == 0) {
1754                 free(nvlist);
1755                 printf("zpool is not active\n");
1756                 return ZFS_ERR_BAD_FS;
1757         }
1758
1759         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_VERSION,
1760                                                                                   &version);
1761         if (!found) {
1762                 free(nvlist);
1763                 printf("zpool config version not found\n");
1764                 return ZFS_ERR_BAD_FS;
1765         }
1766
1767         if (version > SPA_VERSION) {
1768                 free(nvlist);
1769                 printf("SPA version too new %llu > %llu\n",
1770                            (unsigned long long) version,
1771                            (unsigned long long) SPA_VERSION);
1772                 return ZFS_ERR_NOT_IMPLEMENTED_YET;
1773         }
1774
1775         vdevnvlist = zfs_nvlist_lookup_nvlist(nvlist, ZPOOL_CONFIG_VDEV_TREE);
1776         if (!vdevnvlist) {
1777                 free(nvlist);
1778                 printf("ZFS config vdev tree not found\n");
1779                 return ZFS_ERR_BAD_FS;
1780         }
1781
1782         found = zfs_nvlist_lookup_uint64(vdevnvlist, ZPOOL_CONFIG_ASHIFT,
1783                                                                                   &data->vdev_ashift);
1784         free(vdevnvlist);
1785         if (!found) {
1786                 free(nvlist);
1787                 printf("ZPOOL config ashift not found\n");
1788                 return ZFS_ERR_BAD_FS;
1789         }
1790
1791         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_GUID, &diskguid);
1792         if (!found) {
1793                 free(nvlist);
1794                 printf("ZPOOL config guid not found\n");
1795                 return ZFS_ERR_BAD_FS;
1796         }
1797
1798         found = zfs_nvlist_lookup_uint64(nvlist, ZPOOL_CONFIG_POOL_GUID, &data->pool_guid);
1799         if (!found) {
1800                 free(nvlist);
1801                 printf("ZPOOL config pool guid not found\n");
1802                 return ZFS_ERR_BAD_FS;
1803         }
1804
1805         free(nvlist);
1806
1807         printf("ZFS Pool GUID: %llu (%016llx) Label: GUID: %llu (%016llx), txg: %llu, SPA v%llu, ashift: %llu\n",
1808                    (unsigned long long) data->pool_guid,
1809                    (unsigned long long) data->pool_guid,
1810                    (unsigned long long) diskguid,
1811                    (unsigned long long) diskguid,
1812                    (unsigned long long) data->label_txg,
1813                    (unsigned long long) version,
1814                    (unsigned long long) data->vdev_ashift);
1815
1816         return ZFS_ERR_NONE;
1817 }
1818
1819 /*
1820  * vdev_label_start returns the physical disk offset (in bytes) of
1821  * label "l".
1822  */
1823 static uint64_t vdev_label_start(uint64_t psize, int l)
1824 {
1825         return (l * sizeof(vdev_label_t) + (l < VDEV_LABELS / 2 ?
1826                                                                                 0 : psize -
1827                                                                                 VDEV_LABELS * sizeof(vdev_label_t)));
1828 }
1829
1830 void
1831 zfs_unmount(struct zfs_data *data)
1832 {
1833         free(data->dnode_buf);
1834         free(data->dnode_mdn);
1835         free(data->file_buf);
1836         free(data);
1837 }
1838
1839 /*
1840  * zfs_mount() locates a valid uberblock of the root pool and read in its MOS
1841  * to the memory address MOS.
1842  *
1843  */
1844 struct zfs_data *
1845 zfs_mount(device_t dev)
1846 {
1847         struct zfs_data *data = 0;
1848         int label = 0, bestlabel = -1;
1849         char *ub_array;
1850         uberblock_t *ubbest;
1851         uberblock_t *ubcur = NULL;
1852         void *osp = 0;
1853         size_t ospsize;
1854         int err;
1855
1856         data = malloc(sizeof(*data));
1857         if (!data)
1858                 return 0;
1859         memset(data, 0, sizeof(*data));
1860
1861         ub_array = malloc(VDEV_UBERBLOCK_RING);
1862         if (!ub_array) {
1863                 zfs_unmount(data);
1864                 return 0;
1865         }
1866
1867         ubbest = malloc(sizeof(*ubbest));
1868         if (!ubbest) {
1869                 free(ub_array);
1870                 zfs_unmount(data);
1871                 return 0;
1872         }
1873         memset(ubbest, 0, sizeof(*ubbest));
1874
1875         /*
1876          * some eltorito stacks don't give us a size and
1877          * we end up setting the size to MAXUINT, further
1878          * some of these devices stop working once a single
1879          * read past the end has been issued. Checking
1880          * for a maximum part_length and skipping the backup
1881          * labels at the end of the slice/partition/device
1882          * avoids breaking down on such devices.
1883          */
1884         const int vdevnum =
1885                 dev->part_length == 0 ?
1886                 VDEV_LABELS / 2 : VDEV_LABELS;
1887
1888         /* Size in bytes of the device (disk or partition) aligned to label size*/
1889         uint64_t device_size =
1890                 dev->part_length << SECTOR_BITS;
1891
1892         const uint64_t alignedbytes =
1893                 P2ALIGN(device_size, (uint64_t) sizeof(vdev_label_t));
1894
1895         for (label = 0; label < vdevnum; label++) {
1896                 uint64_t labelstartbytes = vdev_label_start(alignedbytes, label);
1897                 uint64_t labelstart = labelstartbytes >> SECTOR_BITS;
1898
1899                 debug("zfs reading label %d at sector %llu (byte %llu)\n",
1900                           label, (unsigned long long) labelstart,
1901                           (unsigned long long) labelstartbytes);
1902
1903                 data->vdev_phys_sector = labelstart +
1904                         ((VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE) >> SECTOR_BITS);
1905
1906                 err = check_pool_label(data);
1907                 if (err) {
1908                         printf("zfs error checking label %d\n", label);
1909                         continue;
1910                 }
1911
1912                 /* Read in the uberblock ring (128K). */
1913                 err = zfs_devread(data->vdev_phys_sector  +
1914                                                   (VDEV_PHYS_SIZE >> SECTOR_BITS),
1915                                                   0, VDEV_UBERBLOCK_RING, ub_array);
1916                 if (err) {
1917                         printf("zfs error reading uberblock ring for label %d\n", label);
1918                         continue;
1919                 }
1920
1921                 ubcur = find_bestub(ub_array, data);
1922                 if (!ubcur) {
1923                         printf("zfs No good uberblocks found in label %d\n", label);
1924                         continue;
1925                 }
1926
1927                 if (vdev_uberblock_compare(ubcur, ubbest) > 0) {
1928                         /* Looks like the block is good, so use it.*/
1929                         memcpy(ubbest, ubcur, sizeof(*ubbest));
1930                         bestlabel = label;
1931                         debug("zfs Current best uberblock found in label %d\n", label);
1932                 }
1933         }
1934         free(ub_array);
1935
1936         /* We zero'd the structure to begin with.  If we never assigned to it,
1937            magic will still be zero. */
1938         if (!ubbest->ub_magic) {
1939                 printf("couldn't find a valid ZFS label\n");
1940                 zfs_unmount(data);
1941                 free(ubbest);
1942                 return 0;
1943         }
1944
1945         debug("zfs ubbest %p in label %d\n", ubbest, bestlabel);
1946
1947         zfs_endian_t ub_endian =
1948                 zfs_to_cpu64(ubbest->ub_magic, LITTLE_ENDIAN) == UBERBLOCK_MAGIC
1949                 ? LITTLE_ENDIAN : BIG_ENDIAN;
1950
1951         debug("zfs endian set to %s\n", !ub_endian ? "big" : "little");
1952
1953         err = zio_read(&ubbest->ub_rootbp, ub_endian, &osp, &ospsize, data);
1954
1955         if (err) {
1956                 printf("couldn't zio_read object directory\n");
1957                 zfs_unmount(data);
1958                 free(osp);
1959                 free(ubbest);
1960                 return 0;
1961         }
1962
1963         if (ospsize < OBJSET_PHYS_SIZE_V14) {
1964                 printf("osp too small\n");
1965                 zfs_unmount(data);
1966                 free(osp);
1967                 free(ubbest);
1968                 return 0;
1969         }
1970
1971         /* Got the MOS. Save it at the memory addr MOS. */
1972         memmove(&(data->mos.dn), &((objset_phys_t *) osp)->os_meta_dnode, DNODE_SIZE);
1973         data->mos.endian =
1974                 (zfs_to_cpu64(ubbest->ub_rootbp.blk_prop, ub_endian) >> 63) & 1;
1975         memmove(&(data->current_uberblock), ubbest, sizeof(uberblock_t));
1976
1977         free(osp);
1978         free(ubbest);
1979
1980         return data;
1981 }
1982
1983 int
1984 zfs_fetch_nvlist(device_t dev, char **nvlist)
1985 {
1986         struct zfs_data *zfs;
1987         int err;
1988
1989         zfs = zfs_mount(dev);
1990         if (!zfs)
1991                 return ZFS_ERR_BAD_FS;
1992         err = int_zfs_fetch_nvlist(zfs, nvlist);
1993         zfs_unmount(zfs);
1994         return err;
1995 }
1996
1997 /*
1998  * zfs_open() locates a file in the rootpool by following the
1999  * MOS and places the dnode of the file in the memory address DNODE.
2000  */
2001 int
2002 zfs_open(struct zfs_file *file, const char *fsfilename)
2003 {
2004         struct zfs_data *data;
2005         int err;
2006         int isfs;
2007
2008         data = zfs_mount(file->device);
2009         if (!data)
2010                 return ZFS_ERR_BAD_FS;
2011
2012         err = dnode_get_fullpath(fsfilename, &(data->mdn), 0,
2013                                                          &(data->dnode), &isfs, data);
2014         if (err) {
2015                 zfs_unmount(data);
2016                 return err;
2017         }
2018
2019         if (isfs) {
2020                 zfs_unmount(data);
2021                 printf("Missing @ or / separator\n");
2022                 return ZFS_ERR_FILE_NOT_FOUND;
2023         }
2024
2025         /* We found the dnode for this file. Verify if it is a plain file. */
2026         if (data->dnode.dn.dn_type != DMU_OT_PLAIN_FILE_CONTENTS) {
2027                 zfs_unmount(data);
2028                 printf("not a file\n");
2029                 return ZFS_ERR_BAD_FILE_TYPE;
2030         }
2031
2032         /* get the file size and set the file position to 0 */
2033
2034         /*
2035          * For DMU_OT_SA we will need to locate the SIZE attribute
2036          * attribute, which could be either in the bonus buffer
2037          * or the "spill" block.
2038          */
2039         if (data->dnode.dn.dn_bonustype == DMU_OT_SA) {
2040                 void *sahdrp;
2041                 int hdrsize;
2042
2043                 if (data->dnode.dn.dn_bonuslen != 0) {
2044                         sahdrp = (sa_hdr_phys_t *) DN_BONUS(&data->dnode.dn);
2045                 } else if (data->dnode.dn.dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
2046                         blkptr_t *bp = &data->dnode.dn.dn_spill;
2047
2048                         err = zio_read(bp, data->dnode.endian, &sahdrp, NULL, data);
2049                         if (err)
2050                                 return err;
2051                 } else {
2052                         printf("filesystem is corrupt :(\n");
2053                         return ZFS_ERR_BAD_FS;
2054                 }
2055
2056                 hdrsize = SA_HDR_SIZE(((sa_hdr_phys_t *) sahdrp));
2057                 file->size = *(uint64_t *) ((char *) sahdrp + hdrsize + SA_SIZE_OFFSET);
2058                 if ((data->dnode.dn.dn_bonuslen == 0) &&
2059                         (data->dnode.dn.dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2060                         free(sahdrp);
2061         } else {
2062                 file->size = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&data->dnode.dn))->zp_size, data->dnode.endian);
2063         }
2064
2065         file->data = data;
2066         file->offset = 0;
2067
2068         return ZFS_ERR_NONE;
2069 }
2070
2071 uint64_t
2072 zfs_read(zfs_file_t file, char *buf, uint64_t len)
2073 {
2074         struct zfs_data *data = (struct zfs_data *) file->data;
2075         int blksz, movesize;
2076         uint64_t length;
2077         int64_t red;
2078         int err;
2079
2080         if (data->file_buf == NULL) {
2081                 data->file_buf = malloc(SPA_MAXBLOCKSIZE);
2082                 if (!data->file_buf)
2083                         return -1;
2084                 data->file_start = data->file_end = 0;
2085         }
2086
2087         /*
2088          * If offset is in memory, move it into the buffer provided and return.
2089          */
2090         if (file->offset >= data->file_start
2091                 && file->offset + len <= data->file_end) {
2092                 memmove(buf, data->file_buf + file->offset - data->file_start,
2093                                 len);
2094                 return len;
2095         }
2096
2097         blksz = zfs_to_cpu16(data->dnode.dn.dn_datablkszsec,
2098                                                           data->dnode.endian) << SPA_MINBLOCKSHIFT;
2099
2100         /*
2101          * Entire Dnode is too big to fit into the space available.      We
2102          * will need to read it in chunks.      This could be optimized to
2103          * read in as large a chunk as there is space available, but for
2104          * now, this only reads in one data block at a time.
2105          */
2106         length = len;
2107         red = 0;
2108         while (length) {
2109                 void *t;
2110                 /*
2111                  * Find requested blkid and the offset within that block.
2112                  */
2113                 uint64_t blkid = file->offset + red;
2114                 blkid = do_div(blkid, blksz);
2115                 free(data->file_buf);
2116                 data->file_buf = 0;
2117
2118                 err = dmu_read(&(data->dnode), blkid, &t,
2119                                            0, data);
2120                 data->file_buf = t;
2121                 if (err)
2122                         return -1;
2123
2124                 data->file_start = blkid * blksz;
2125                 data->file_end = data->file_start + blksz;
2126
2127                 movesize = min(length, data->file_end - (int)file->offset - red);
2128
2129                 memmove(buf, data->file_buf + file->offset + red
2130                                 - data->file_start, movesize);
2131                 buf += movesize;
2132                 length -= movesize;
2133                 red += movesize;
2134         }
2135
2136         return len;
2137 }
2138
2139 int
2140 zfs_close(zfs_file_t file)
2141 {
2142         zfs_unmount((struct zfs_data *) file->data);
2143         return ZFS_ERR_NONE;
2144 }
2145
2146 int
2147 zfs_getmdnobj(device_t dev, const char *fsfilename,
2148                                    uint64_t *mdnobj)
2149 {
2150         struct zfs_data *data;
2151         int err;
2152         int isfs;
2153
2154         data = zfs_mount(dev);
2155         if (!data)
2156                 return ZFS_ERR_BAD_FS;
2157
2158         err = dnode_get_fullpath(fsfilename, &(data->mdn), mdnobj,
2159                                                          &(data->dnode), &isfs, data);
2160         zfs_unmount(data);
2161         return err;
2162 }
2163
2164 static void
2165 fill_fs_info(struct zfs_dirhook_info *info,
2166                          dnode_end_t mdn, struct zfs_data *data)
2167 {
2168         int err;
2169         dnode_end_t dn;
2170         uint64_t objnum;
2171         uint64_t headobj;
2172
2173         memset(info, 0, sizeof(*info));
2174
2175         info->dir = 1;
2176
2177         if (mdn.dn.dn_type == DMU_OT_DSL_DIR) {
2178                 headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&mdn.dn))->dd_head_dataset_obj, mdn.endian);
2179
2180                 err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, &mdn, data);
2181                 if (err) {
2182                         printf("zfs failed here 1\n");
2183                         return;
2184                 }
2185         }
2186         make_mdn(&mdn, data);
2187         err = dnode_get(&mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
2188                                         &dn, data);
2189         if (err) {
2190                 printf("zfs failed here 2\n");
2191                 return;
2192         }
2193
2194         err = zap_lookup(&dn, ZFS_ROOT_OBJ, &objnum, data);
2195         if (err) {
2196                 printf("zfs failed here 3\n");
2197                 return;
2198         }
2199
2200         err = dnode_get(&mdn, objnum, 0, &dn, data);
2201         if (err) {
2202                 printf("zfs failed here 4\n");
2203                 return;
2204         }
2205
2206         info->mtimeset = 1;
2207         info->mtime = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&dn.dn))->zp_mtime[0], dn.endian);
2208
2209         return;
2210 }
2211
2212 static int iterate_zap(const char *name, uint64_t val, struct zfs_data *data)
2213 {
2214         struct zfs_dirhook_info info;
2215         dnode_end_t dn;
2216
2217         memset(&info, 0, sizeof(info));
2218
2219         dnode_get(&(data->mdn), val, 0, &dn, data);
2220         info.mtimeset = 1;
2221         info.mtime = zfs_to_cpu64(((znode_phys_t *) DN_BONUS(&dn.dn))->zp_mtime[0], dn.endian);
2222         info.dir = (dn.dn.dn_type == DMU_OT_DIRECTORY_CONTENTS);
2223         debug("zfs type=%d, name=%s\n",
2224                   (int)dn.dn.dn_type, (char *)name);
2225         if (!data->userhook)
2226                 return 0;
2227         return data->userhook(name, &info);
2228 }
2229
2230 static int iterate_zap_fs(const char *name, uint64_t val, struct zfs_data *data)
2231 {
2232         struct zfs_dirhook_info info;
2233         dnode_end_t mdn;
2234         int err;
2235         err = dnode_get(&(data->mos), val, 0, &mdn, data);
2236         if (err)
2237                 return 0;
2238         if (mdn.dn.dn_type != DMU_OT_DSL_DIR)
2239                 return 0;
2240
2241         fill_fs_info(&info, mdn, data);
2242
2243         if (!data->userhook)
2244                 return 0;
2245         return data->userhook(name, &info);
2246 }
2247
2248 static int iterate_zap_snap(const char *name, uint64_t val, struct zfs_data *data)
2249 {
2250         struct zfs_dirhook_info info;
2251         char *name2;
2252         int ret = 0;
2253         dnode_end_t mdn;
2254         int err;
2255
2256         err = dnode_get(&(data->mos), val, 0, &mdn, data);
2257         if (err)
2258                 return 0;
2259
2260         if (mdn.dn.dn_type != DMU_OT_DSL_DATASET)
2261                 return 0;
2262
2263         fill_fs_info(&info, mdn, data);
2264
2265         name2 = malloc(strlen(name) + 2);
2266         name2[0] = '@';
2267         memcpy(name2 + 1, name, strlen(name) + 1);
2268         if (data->userhook)
2269                 ret = data->userhook(name2, &info);
2270         free(name2);
2271         return ret;
2272 }
2273
2274 int
2275 zfs_ls(device_t device, const char *path,
2276            int (*hook)(const char *, const struct zfs_dirhook_info *))
2277 {
2278         struct zfs_data *data;
2279         int err;
2280         int isfs;
2281
2282         data = zfs_mount(device);
2283         if (!data)
2284                 return ZFS_ERR_BAD_FS;
2285
2286         data->userhook = hook;
2287
2288         err = dnode_get_fullpath(path, &(data->mdn), 0, &(data->dnode), &isfs, data);
2289         if (err) {
2290                 zfs_unmount(data);
2291                 return err;
2292         }
2293         if (isfs) {
2294                 uint64_t childobj, headobj;
2295                 uint64_t snapobj;
2296                 dnode_end_t dn;
2297                 struct zfs_dirhook_info info;
2298
2299                 fill_fs_info(&info, data->dnode, data);
2300                 hook("@", &info);
2301
2302                 childobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&data->dnode.dn))->dd_child_dir_zapobj, data->dnode.endian);
2303                 headobj = zfs_to_cpu64(((dsl_dir_phys_t *) DN_BONUS(&data->dnode.dn))->dd_head_dataset_obj, data->dnode.endian);
2304                 err = dnode_get(&(data->mos), childobj,
2305                                                 DMU_OT_DSL_DIR_CHILD_MAP, &dn, data);
2306                 if (err) {
2307                         zfs_unmount(data);
2308                         return err;
2309                 }
2310
2311
2312                 zap_iterate(&dn, iterate_zap_fs, data);
2313
2314                 err = dnode_get(&(data->mos), headobj, DMU_OT_DSL_DATASET, &dn, data);
2315                 if (err) {
2316                         zfs_unmount(data);
2317                         return err;
2318                 }
2319
2320                 snapobj = zfs_to_cpu64(((dsl_dataset_phys_t *) DN_BONUS(&dn.dn))->ds_snapnames_zapobj, dn.endian);
2321
2322                 err = dnode_get(&(data->mos), snapobj,
2323                                                 DMU_OT_DSL_DS_SNAP_MAP, &dn, data);
2324                 if (err) {
2325                         zfs_unmount(data);
2326                         return err;
2327                 }
2328
2329                 zap_iterate(&dn, iterate_zap_snap, data);
2330         } else {
2331                 if (data->dnode.dn.dn_type != DMU_OT_DIRECTORY_CONTENTS) {
2332                         zfs_unmount(data);
2333                         printf("not a directory\n");
2334                         return ZFS_ERR_BAD_FILE_TYPE;
2335                 }
2336                 zap_iterate(&(data->dnode), iterate_zap, data);
2337         }
2338         zfs_unmount(data);
2339         return ZFS_ERR_NONE;
2340 }