Merge tag 'zonefs-6.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs
author: Linus Torvalds <torvalds@linux-foundation.org>
Wed, 22 Feb 2023 22:11:54 +0000 (14:11 -0800)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Wed, 22 Feb 2023 22:11:54 +0000 (14:11 -0800)
Pull zonefs updates from Damien Le Moal:

 - Reorganize zonefs code to split file related operations to a new
   fs/zonefs/file.c file (me)

 - Modify zonefs to use dynamically allocated inodes and dentries (using
   the inode and dentry caches) instead of statically allocating
   everything on mount. This saves a significant amount of memory for
   very large zoned block devices with 10s of thousands of zones (me)

 - Make zonefs_sb_ktype a const struct kobj_type (Thomas)

* tag 'zonefs-6.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dlemoal/zonefs:
  zonefs: make kobj_type structure constant
  zonefs: Cache zone group directory inodes
  zonefs: Dynamically create file inodes when needed
  zonefs: Separate zone information from inode information
  zonefs: Reduce struct zonefs_inode_info size
  zonefs: Simplify IO error handling
  zonefs: Reorganize code

1  2 
fs/zonefs/super.c

@@@ -526,85 -402,145 +402,145 @@@ void __zonefs_io_error(struct inode *in
        memalloc_noio_restore(noio_flag);
  }
  
- static void zonefs_io_error(struct inode *inode, bool write)
+ static struct kmem_cache *zonefs_inode_cachep;
+ static struct inode *zonefs_alloc_inode(struct super_block *sb)
  {
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       struct zonefs_inode_info *zi;
+       zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
+       if (!zi)
+               return NULL;
+       inode_init_once(&zi->i_vnode);
+       mutex_init(&zi->i_truncate_mutex);
+       zi->i_wr_refcnt = 0;
  
-       mutex_lock(&zi->i_truncate_mutex);
-       __zonefs_io_error(inode, write);
-       mutex_unlock(&zi->i_truncate_mutex);
+       return &zi->i_vnode;
  }
  
- static int zonefs_file_truncate(struct inode *inode, loff_t isize)
+ static void zonefs_free_inode(struct inode *inode)
  {
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       loff_t old_isize;
-       enum req_op op;
-       int ret = 0;
+       kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode));
+ }
  
-       /*
-        * Only sequential zone files can be truncated and truncation is allowed
-        * only down to a 0 size, which is equivalent to a zone reset, and to
-        * the maximum file size, which is equivalent to a zone finish.
-        */
-       if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
-               return -EPERM;
+ /*
+  * File system stat.
+  */
+ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
+ {
+       struct super_block *sb = dentry->d_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       enum zonefs_ztype t;
  
-       if (!isize)
-               op = REQ_OP_ZONE_RESET;
-       else if (isize == zi->i_max_size)
-               op = REQ_OP_ZONE_FINISH;
+       buf->f_type = ZONEFS_MAGIC;
+       buf->f_bsize = sb->s_blocksize;
+       buf->f_namelen = ZONEFS_NAME_MAX;
+       spin_lock(&sbi->s_lock);
+       buf->f_blocks = sbi->s_blocks;
+       if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks))
+               buf->f_bfree = 0;
        else
-               return -EPERM;
+               buf->f_bfree = buf->f_blocks - sbi->s_used_blocks;
+       buf->f_bavail = buf->f_bfree;
+       for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
+               if (sbi->s_zgroup[t].g_nr_zones)
+                       buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1;
+       }
+       buf->f_ffree = 0;
  
-       inode_dio_wait(inode);
+       spin_unlock(&sbi->s_lock);
  
-       /* Serialize against page faults */
-       filemap_invalidate_lock(inode->i_mapping);
+       buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
  
-       /* Serialize against zonefs_iomap_begin() */
-       mutex_lock(&zi->i_truncate_mutex);
+       return 0;
+ }
  
-       old_isize = i_size_read(inode);
-       if (isize == old_isize)
-               goto unlock;
+ enum {
+       Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
+       Opt_explicit_open, Opt_err,
+ };
  
-       ret = zonefs_zone_mgmt(inode, op);
-       if (ret)
-               goto unlock;
+ static const match_table_t tokens = {
+       { Opt_errors_ro,        "errors=remount-ro"},
+       { Opt_errors_zro,       "errors=zone-ro"},
+       { Opt_errors_zol,       "errors=zone-offline"},
+       { Opt_errors_repair,    "errors=repair"},
+       { Opt_explicit_open,    "explicit-open" },
+       { Opt_err,              NULL}
+ };
  
-       /*
-        * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
-        * take care of open zones.
-        */
-       if (zi->i_flags & ZONEFS_ZONE_OPEN) {
-               /*
-                * Truncating a zone to EMPTY or FULL is the equivalent of
-                * closing the zone. For a truncation to 0, we need to
-                * re-open the zone to ensure new writes can be processed.
-                * For a truncation to the maximum file size, the zone is
-                * closed and writes cannot be accepted anymore, so clear
-                * the open flag.
-                */
-               if (!isize)
-                       ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
-               else
-                       zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+ static int zonefs_parse_options(struct super_block *sb, char *options)
+ {
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       substring_t args[MAX_OPT_ARGS];
+       char *p;
+       if (!options)
+               return 0;
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+               if (!*p)
+                       continue;
+               token = match_token(p, tokens, args);
+               switch (token) {
+               case Opt_errors_ro:
+                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
+                       break;
+               case Opt_errors_zro:
+                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
+                       break;
+               case Opt_errors_zol:
+                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
+                       break;
+               case Opt_errors_repair:
+                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
+                       break;
+               case Opt_explicit_open:
+                       sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
+                       break;
+               default:
+                       return -EINVAL;
+               }
        }
  
-       zonefs_update_stats(inode, isize);
-       truncate_setsize(inode, isize);
-       zi->i_wpoffset = isize;
-       zonefs_account_active(inode);
+       return 0;
+ }
+ static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
+ {
+       struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb);
+       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO)
+               seq_puts(seq, ",errors=remount-ro");
+       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)
+               seq_puts(seq, ",errors=zone-ro");
+       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)
+               seq_puts(seq, ",errors=zone-offline");
+       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
+               seq_puts(seq, ",errors=repair");
  
- unlock:
-       mutex_unlock(&zi->i_truncate_mutex);
-       filemap_invalidate_unlock(inode->i_mapping);
+       return 0;
+ }
  
-       return ret;
+ static int zonefs_remount(struct super_block *sb, int *flags, char *data)
+ {
+       sync_filesystem(sb);
+       return zonefs_parse_options(sb, data);
  }
  
 -static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
 +static int zonefs_inode_setattr(struct mnt_idmap *idmap,
                                struct dentry *dentry, struct iattr *iattr)
  {
        struct inode *inode = d_inode(dentry);
                        return ret;
        }
  
 -      setattr_copy(&init_user_ns, inode, iattr);
 +      setattr_copy(&nop_mnt_idmap, inode, iattr);
  
+       if (S_ISREG(inode->i_mode)) {
+               struct zonefs_zone *z = zonefs_inode_zone(inode);
+               z->z_mode = inode->i_mode;
+               z->z_uid = inode->i_uid;
+               z->z_gid = inode->i_gid;
+       }
        return 0;
  }
  
@@@ -650,753 -594,194 +594,194 @@@ static const struct inode_operations zo
        .setattr        = zonefs_inode_setattr,
  };
  
- static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
-                            int datasync)
+ static long zonefs_fname_to_fno(const struct qstr *fname)
  {
-       struct inode *inode = file_inode(file);
-       int ret = 0;
-       if (unlikely(IS_IMMUTABLE(inode)))
-               return -EPERM;
+       const char *name = fname->name;
+       unsigned int len = fname->len;
+       long fno = 0, shift = 1;
+       const char *rname;
+       char c = *name;
+       unsigned int i;
  
        /*
-        * Since only direct writes are allowed in sequential files, page cache
-        * flush is needed only for conventional zone files.
+        * File names are always a base-10 number string without any
+        * leading 0s.
         */
-       if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
-               ret = file_write_and_wait_range(file, start, end);
-       if (!ret)
-               ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+       if (!isdigit(c))
+               return -ENOENT;
  
-       if (ret)
-               zonefs_io_error(inode, true);
+       if (len > 1 && c == '0')
+               return -ENOENT;
  
-       return ret;
- }
+       if (len == 1)
+               return c - '0';
  
- static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
- {
-       struct inode *inode = file_inode(vmf->vma->vm_file);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       vm_fault_t ret;
-       if (unlikely(IS_IMMUTABLE(inode)))
-               return VM_FAULT_SIGBUS;
-       /*
-        * Sanity check: only conventional zone files can have shared
-        * writeable mappings.
-        */
-       if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
-               return VM_FAULT_NOPAGE;
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vmf->vma->vm_file);
-       /* Serialize against truncates */
-       filemap_invalidate_lock_shared(inode->i_mapping);
-       ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
-       filemap_invalidate_unlock_shared(inode->i_mapping);
-       sb_end_pagefault(inode->i_sb);
-       return ret;
- }
- static const struct vm_operations_struct zonefs_file_vm_ops = {
-       .fault          = filemap_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = zonefs_filemap_page_mkwrite,
- };
- static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
- {
-       /*
-        * Conventional zones accept random writes, so their files can support
-        * shared writable mappings. For sequential zone files, only read
-        * mappings are possible since there are no guarantees for write
-        * ordering between msync() and page cache writeback.
-        */
-       if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
-           (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
-               return -EINVAL;
-       file_accessed(file);
-       vma->vm_ops = &zonefs_file_vm_ops;
-       return 0;
- }
- static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
- {
-       loff_t isize = i_size_read(file_inode(file));
-       /*
-        * Seeks are limited to below the zone size for conventional zones
-        * and below the zone write pointer for sequential zones. In both
-        * cases, this limit is the inode size.
-        */
-       return generic_file_llseek_size(file, offset, whence, isize, isize);
- }
- static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
-                                       int error, unsigned int flags)
- {
-       struct inode *inode = file_inode(iocb->ki_filp);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       if (error) {
-               zonefs_io_error(inode, true);
-               return error;
-       }
-       if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
-               /*
-                * Note that we may be seeing completions out of order,
-                * but that is not a problem since a write completed
-                * successfully necessarily means that all preceding writes
-                * were also successful. So we can safely increase the inode
-                * size to the write end location.
-                */
-               mutex_lock(&zi->i_truncate_mutex);
-               if (i_size_read(inode) < iocb->ki_pos + size) {
-                       zonefs_update_stats(inode, iocb->ki_pos + size);
-                       zonefs_i_size_write(inode, iocb->ki_pos + size);
-               }
-               mutex_unlock(&zi->i_truncate_mutex);
-       }
-       return 0;
- }
- static const struct iomap_dio_ops zonefs_write_dio_ops = {
-       .end_io                 = zonefs_file_write_dio_end_io,
- };
- static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
- {
-       struct inode *inode = file_inode(iocb->ki_filp);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       struct block_device *bdev = inode->i_sb->s_bdev;
-       unsigned int max = bdev_max_zone_append_sectors(bdev);
-       struct bio *bio;
-       ssize_t size;
-       int nr_pages;
-       ssize_t ret;
-       max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
-       iov_iter_truncate(from, max);
-       nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
-       if (!nr_pages)
-               return 0;
-       bio = bio_alloc(bdev, nr_pages,
-                       REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
-       bio->bi_iter.bi_sector = zi->i_zsector;
-       bio->bi_ioprio = iocb->ki_ioprio;
-       if (iocb_is_dsync(iocb))
-               bio->bi_opf |= REQ_FUA;
-       ret = bio_iov_iter_get_pages(bio, from);
-       if (unlikely(ret))
-               goto out_release;
-       size = bio->bi_iter.bi_size;
-       task_io_account_write(size);
-       if (iocb->ki_flags & IOCB_HIPRI)
-               bio_set_polled(bio, iocb);
-       ret = submit_bio_wait(bio);
-       /*
-        * If the file zone was written underneath the file system, the zone
-        * write pointer may not be where we expect it to be, but the zone
-        * append write can still succeed. So check manually that we wrote where
-        * we intended to, that is, at zi->i_wpoffset.
-        */
-       if (!ret) {
-               sector_t wpsector =
-                       zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
-               if (bio->bi_iter.bi_sector != wpsector) {
-                       zonefs_warn(inode->i_sb,
-                               "Corrupted write pointer %llu for zone at %llu\n",
-                               wpsector, zi->i_zsector);
-                       ret = -EIO;
-               }
-       }
-       zonefs_file_write_dio_end_io(iocb, size, ret, 0);
-       trace_zonefs_file_dio_append(inode, size, ret);
- out_release:
-       bio_release_pages(bio, false);
-       bio_put(bio);
-       if (ret >= 0) {
-               iocb->ki_pos += size;
-               return size;
-       }
-       return ret;
- }
- /*
-  * Do not exceed the LFS limits nor the file zone size. If pos is under the
-  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
-  */
- static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
-                                       loff_t count)
- {
-       struct inode *inode = file_inode(file);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       loff_t limit = rlimit(RLIMIT_FSIZE);
-       loff_t max_size = zi->i_max_size;
-       if (limit != RLIM_INFINITY) {
-               if (pos >= limit) {
-                       send_sig(SIGXFSZ, current, 0);
-                       return -EFBIG;
-               }
-               count = min(count, limit - pos);
-       }
-       if (!(file->f_flags & O_LARGEFILE))
-               max_size = min_t(loff_t, MAX_NON_LFS, max_size);
-       if (unlikely(pos >= max_size))
-               return -EFBIG;
-       return min(count, max_size - pos);
- }
- static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
- {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file_inode(file);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       loff_t count;
-       if (IS_SWAPFILE(inode))
-               return -ETXTBSY;
-       if (!iov_iter_count(from))
-               return 0;
-       if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
-               return -EINVAL;
-       if (iocb->ki_flags & IOCB_APPEND) {
-               if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
-                       return -EINVAL;
-               mutex_lock(&zi->i_truncate_mutex);
-               iocb->ki_pos = zi->i_wpoffset;
-               mutex_unlock(&zi->i_truncate_mutex);
+       for (i = 0, rname = name + len - 1; i < len; i++, rname--) {
+               c = *rname;
+               if (!isdigit(c))
+                       return -ENOENT;
+               fno += (c - '0') * shift;
+               shift *= 10;
        }
  
-       count = zonefs_write_check_limits(file, iocb->ki_pos,
-                                         iov_iter_count(from));
-       if (count < 0)
-               return count;
-       iov_iter_truncate(from, count);
-       return iov_iter_count(from);
- }
- /*
-  * Handle direct writes. For sequential zone files, this is the only possible
-  * write path. For these files, check that the user is issuing writes
-  * sequentially from the end of the file. This code assumes that the block layer
-  * delivers write requests to the device in sequential order. This is always the
-  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
-  * elevator feature is being used (e.g. mq-deadline). The block layer always
-  * automatically select such an elevator for zoned block devices during the
-  * device initialization.
-  */
- static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
- {
-       struct inode *inode = file_inode(iocb->ki_filp);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       struct super_block *sb = inode->i_sb;
-       bool sync = is_sync_kiocb(iocb);
-       bool append = false;
-       ssize_t ret, count;
-       /*
-        * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
-        * as this can cause write reordering (e.g. the first aio gets EAGAIN
-        * on the inode lock but the second goes through but is now unaligned).
-        */
-       if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
-           (iocb->ki_flags & IOCB_NOWAIT))
-               return -EOPNOTSUPP;
-       if (iocb->ki_flags & IOCB_NOWAIT) {
-               if (!inode_trylock(inode))
-                       return -EAGAIN;
-       } else {
-               inode_lock(inode);
-       }
-       count = zonefs_write_checks(iocb, from);
-       if (count <= 0) {
-               ret = count;
-               goto inode_unlock;
-       }
-       if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
-               ret = -EINVAL;
-               goto inode_unlock;
-       }
-       /* Enforce sequential writes (append only) in sequential zones */
-       if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
-               mutex_lock(&zi->i_truncate_mutex);
-               if (iocb->ki_pos != zi->i_wpoffset) {
-                       mutex_unlock(&zi->i_truncate_mutex);
-                       ret = -EINVAL;
-                       goto inode_unlock;
-               }
-               mutex_unlock(&zi->i_truncate_mutex);
-               append = sync;
-       }
-       if (append)
-               ret = zonefs_file_dio_append(iocb, from);
-       else
-               ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
-                                  &zonefs_write_dio_ops, 0, NULL, 0);
-       if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
-           (ret > 0 || ret == -EIOCBQUEUED)) {
-               if (ret > 0)
-                       count = ret;
-               /*
-                * Update the zone write pointer offset assuming the write
-                * operation succeeded. If it did not, the error recovery path
-                * will correct it. Also do active seq file accounting.
-                */
-               mutex_lock(&zi->i_truncate_mutex);
-               zi->i_wpoffset += count;
-               zonefs_account_active(inode);
-               mutex_unlock(&zi->i_truncate_mutex);
-       }
- inode_unlock:
-       inode_unlock(inode);
-       return ret;
- }
- static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
-                                         struct iov_iter *from)
- {
-       struct inode *inode = file_inode(iocb->ki_filp);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       ssize_t ret;
-       /*
-        * Direct IO writes are mandatory for sequential zone files so that the
-        * write IO issuing order is preserved.
-        */
-       if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
-               return -EIO;
-       if (iocb->ki_flags & IOCB_NOWAIT) {
-               if (!inode_trylock(inode))
-                       return -EAGAIN;
-       } else {
-               inode_lock(inode);
-       }
-       ret = zonefs_write_checks(iocb, from);
-       if (ret <= 0)
-               goto inode_unlock;
-       ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
-       if (ret > 0)
-               iocb->ki_pos += ret;
-       else if (ret == -EIO)
-               zonefs_io_error(inode, true);
- inode_unlock:
-       inode_unlock(inode);
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
-       return ret;
+       return fno;
  }
  
- static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ static struct inode *zonefs_get_file_inode(struct inode *dir,
+                                          struct dentry *dentry)
  {
-       struct inode *inode = file_inode(iocb->ki_filp);
-       if (unlikely(IS_IMMUTABLE(inode)))
-               return -EPERM;
-       if (sb_rdonly(inode->i_sb))
-               return -EROFS;
-       /* Write operations beyond the zone size are not allowed */
-       if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
-               return -EFBIG;
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               ssize_t ret = zonefs_file_dio_write(iocb, from);
-               if (ret != -ENOTBLK)
-                       return ret;
-       }
-       return zonefs_file_buffered_write(iocb, from);
- }
- static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
-                                      int error, unsigned int flags)
- {
-       if (error) {
-               zonefs_io_error(file_inode(iocb->ki_filp), false);
-               return error;
-       }
-       return 0;
- }
- static const struct iomap_dio_ops zonefs_read_dio_ops = {
-       .end_io                 = zonefs_file_read_dio_end_io,
- };
- static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
- {
-       struct inode *inode = file_inode(iocb->ki_filp);
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       struct super_block *sb = inode->i_sb;
-       loff_t isize;
-       ssize_t ret;
-       /* Offline zones cannot be read */
-       if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
-               return -EPERM;
-       if (iocb->ki_pos >= zi->i_max_size)
-               return 0;
-       if (iocb->ki_flags & IOCB_NOWAIT) {
-               if (!inode_trylock_shared(inode))
-                       return -EAGAIN;
-       } else {
-               inode_lock_shared(inode);
-       }
+       struct zonefs_zone_group *zgroup = dir->i_private;
+       struct super_block *sb = dir->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       struct zonefs_zone *z;
+       struct inode *inode;
+       ino_t ino;
+       long fno;
  
-       /* Limit read operations to written data */
-       mutex_lock(&zi->i_truncate_mutex);
-       isize = i_size_read(inode);
-       if (iocb->ki_pos >= isize) {
-               mutex_unlock(&zi->i_truncate_mutex);
-               ret = 0;
-               goto inode_unlock;
-       }
-       iov_iter_truncate(to, isize - iocb->ki_pos);
-       mutex_unlock(&zi->i_truncate_mutex);
+       /* Get the file number from the file name */
+       fno = zonefs_fname_to_fno(&dentry->d_name);
+       if (fno < 0)
+               return ERR_PTR(fno);
  
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               size_t count = iov_iter_count(to);
+       if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones)
+               return ERR_PTR(-ENOENT);
  
-               if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
-                       ret = -EINVAL;
-                       goto inode_unlock;
-               }
-               file_accessed(iocb->ki_filp);
-               ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
-                                  &zonefs_read_dio_ops, 0, NULL, 0);
-       } else {
-               ret = generic_file_read_iter(iocb, to);
-               if (ret == -EIO)
-                       zonefs_io_error(inode, false);
+       z = &zgroup->g_zones[fno];
+       ino = z->z_sector >> sbi->s_zone_sectors_shift;
+       inode = iget_locked(sb, ino);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW)) {
+               WARN_ON_ONCE(inode->i_private != z);
+               return inode;
        }
  
- inode_unlock:
-       inode_unlock_shared(inode);
-       return ret;
- }
+       inode->i_ino = ino;
+       inode->i_mode = z->z_mode;
+       inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
+       inode->i_uid = z->z_uid;
+       inode->i_gid = z->z_gid;
+       inode->i_size = z->z_wpoffset;
+       inode->i_blocks = z->z_capacity >> SECTOR_SHIFT;
+       inode->i_private = z;
  
- /*
-  * Write open accounting is done only for sequential files.
-  */
- static inline bool zonefs_seq_file_need_wro(struct inode *inode,
-                                           struct file *file)
- {
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       inode->i_op = &zonefs_file_inode_operations;
+       inode->i_fop = &zonefs_file_operations;
+       inode->i_mapping->a_ops = &zonefs_file_aops;
  
-       if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
-               return false;
+       /* Update the inode access rights depending on the zone condition */
+       zonefs_inode_update_mode(inode);
  
-       if (!(file->f_mode & FMODE_WRITE))
-               return false;
+       unlock_new_inode(inode);
  
-       return true;
+       return inode;
  }
  
- static int zonefs_seq_file_write_open(struct inode *inode)
+ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
+                                            enum zonefs_ztype ztype)
  {
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       int ret = 0;
-       mutex_lock(&zi->i_truncate_mutex);
-       if (!zi->i_wr_refcnt) {
-               struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
-               unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
-               if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
-                       if (sbi->s_max_wro_seq_files
-                           && wro > sbi->s_max_wro_seq_files) {
-                               atomic_dec(&sbi->s_wro_seq_files);
-                               ret = -EBUSY;
-                               goto unlock;
-                       }
+       struct inode *root = d_inode(sb->s_root);
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       struct inode *inode;
+       ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1;
  
-                       if (i_size_read(inode) < zi->i_max_size) {
-                               ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
-                               if (ret) {
-                                       atomic_dec(&sbi->s_wro_seq_files);
-                                       goto unlock;
-                               }
-                               zi->i_flags |= ZONEFS_ZONE_OPEN;
-                               zonefs_account_active(inode);
-                       }
-               }
-       }
+       inode = iget_locked(sb, ino);
+       if (!inode)
+               return ERR_PTR(-ENOMEM);
+       if (!(inode->i_state & I_NEW))
+               return inode;
+       inode->i_ino = ino;
 -      inode_init_owner(&init_user_ns, inode, root, S_IFDIR | 0555);
++      inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
+       inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
+       inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime;
+       inode->i_private = &sbi->s_zgroup[ztype];
+       set_nlink(inode, 2);
  
-       zi->i_wr_refcnt++;
+       inode->i_op = &zonefs_dir_inode_operations;
+       inode->i_fop = &zonefs_dir_operations;
  
- unlock:
-       mutex_unlock(&zi->i_truncate_mutex);
+       unlock_new_inode(inode);
  
-       return ret;
+       return inode;
  }
  
- static int zonefs_file_open(struct inode *inode, struct file *file)
- {
-       int ret;
-       ret = generic_file_open(inode, file);
-       if (ret)
-               return ret;
-       if (zonefs_seq_file_need_wro(inode, file))
-               return zonefs_seq_file_write_open(inode);
-       return 0;
- }
  
- static void zonefs_seq_file_write_close(struct inode *inode)
+ static struct inode *zonefs_get_dir_inode(struct inode *dir,
+                                         struct dentry *dentry)
  {
-       struct zonefs_inode_info *zi = ZONEFS_I(inode);
-       struct super_block *sb = inode->i_sb;
+       struct super_block *sb = dir->i_sb;
        struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
-       int ret = 0;
-       mutex_lock(&zi->i_truncate_mutex);
-       zi->i_wr_refcnt--;
-       if (zi->i_wr_refcnt)
-               goto unlock;
+       const char *name = dentry->d_name.name;
+       enum zonefs_ztype ztype;
  
        /*
-        * The file zone may not be open anymore (e.g. the file was truncated to
-        * its maximum size or it was fully written). For this case, we only
-        * need to decrement the write open count.
+        * We only need to check for the "seq" directory and
+        * the "cnv" directory if we have conventional zones.
         */
-       if (zi->i_flags & ZONEFS_ZONE_OPEN) {
-               ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
-               if (ret) {
-                       __zonefs_io_error(inode, false);
-                       /*
-                        * Leaving zones explicitly open may lead to a state
-                        * where most zones cannot be written (zone resources
-                        * exhausted). So take preventive action by remounting
-                        * read-only.
-                        */
-                       if (zi->i_flags & ZONEFS_ZONE_OPEN &&
-                           !(sb->s_flags & SB_RDONLY)) {
-                               zonefs_warn(sb,
-                                       "closing zone at %llu failed %d\n",
-                                       zi->i_zsector, ret);
-                               zonefs_warn(sb,
-                                       "remounting filesystem read-only\n");
-                               sb->s_flags |= SB_RDONLY;
-                       }
-                       goto unlock;
-               }
+       if (dentry->d_name.len != 3)
+               return ERR_PTR(-ENOENT);
  
-               zi->i_flags &= ~ZONEFS_ZONE_OPEN;
-               zonefs_account_active(inode);
+       for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
+               if (sbi->s_zgroup[ztype].g_nr_zones &&
+                   memcmp(name, zonefs_zgroup_name(ztype), 3) == 0)
+                       break;
        }
+       if (ztype == ZONEFS_ZTYPE_MAX)
+               return ERR_PTR(-ENOENT);
  
-       atomic_dec(&sbi->s_wro_seq_files);
- unlock:
-       mutex_unlock(&zi->i_truncate_mutex);
- }
- static int zonefs_file_release(struct inode *inode, struct file *file)
- {
-       /*
-        * If we explicitly open a zone we must close it again as well, but the
-        * zone management operation can fail (either due to an IO error or as
-        * the zone has gone offline or read-only). Make sure we don't fail the
-        * close(2) for user-space.
-        */
-       if (zonefs_seq_file_need_wro(inode, file))
-               zonefs_seq_file_write_close(inode);
-       return 0;
+       return zonefs_get_zgroup_inode(sb, ztype);
  }
  
- static const struct file_operations zonefs_file_operations = {
-       .open           = zonefs_file_open,
-       .release        = zonefs_file_release,
-       .fsync          = zonefs_file_fsync,
-       .mmap           = zonefs_file_mmap,
-       .llseek         = zonefs_file_llseek,
-       .read_iter      = zonefs_file_read_iter,
-       .write_iter     = zonefs_file_write_iter,
-       .splice_read    = generic_file_splice_read,
-       .splice_write   = iter_file_splice_write,
-       .iopoll         = iocb_bio_iopoll,
- };
- static struct kmem_cache *zonefs_inode_cachep;
- static struct inode *zonefs_alloc_inode(struct super_block *sb)
+ static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry,
+                                   unsigned int flags)
  {
-       struct zonefs_inode_info *zi;
-       zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
-       if (!zi)
-               return NULL;
-       inode_init_once(&zi->i_vnode);
-       mutex_init(&zi->i_truncate_mutex);
-       zi->i_wr_refcnt = 0;
-       zi->i_flags = 0;
-       return &zi->i_vnode;
- }
- static void zonefs_free_inode(struct inode *inode)
- {
-       kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode));
- }
- /*
-  * File system stat.
-  */
- static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
- {
-       struct super_block *sb = dentry->d_sb;
-       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
-       enum zonefs_ztype t;
+       struct inode *inode;
  
-       buf->f_type = ZONEFS_MAGIC;
-       buf->f_bsize = sb->s_blocksize;
-       buf->f_namelen = ZONEFS_NAME_MAX;
+       if (dentry->d_name.len > ZONEFS_NAME_MAX)
+               return ERR_PTR(-ENAMETOOLONG);
  
-       spin_lock(&sbi->s_lock);
-       buf->f_blocks = sbi->s_blocks;
-       if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks))
-               buf->f_bfree = 0;
+       if (dir == d_inode(dir->i_sb->s_root))
+               inode = zonefs_get_dir_inode(dir, dentry);
        else
-               buf->f_bfree = buf->f_blocks - sbi->s_used_blocks;
-       buf->f_bavail = buf->f_bfree;
+               inode = zonefs_get_file_inode(dir, dentry);
+       if (IS_ERR(inode))
+               return ERR_CAST(inode);
  
-       for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
-               if (sbi->s_nr_files[t])
-                       buf->f_files += sbi->s_nr_files[t] + 1;
-       }
-       buf->f_ffree = 0;
-       spin_unlock(&sbi->s_lock);
-       buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
-       return 0;
+       return d_splice_alias(inode, dentry);
  }
  
- enum {
-       Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
-       Opt_explicit_open, Opt_err,
- };
- static const match_table_t tokens = {
-       { Opt_errors_ro,        "errors=remount-ro"},
-       { Opt_errors_zro,       "errors=zone-ro"},
-       { Opt_errors_zol,       "errors=zone-offline"},
-       { Opt_errors_repair,    "errors=repair"},
-       { Opt_explicit_open,    "explicit-open" },
-       { Opt_err,              NULL}
- };
- static int zonefs_parse_options(struct super_block *sb, char *options)
+ static int zonefs_readdir_root(struct file *file, struct dir_context *ctx)
  {
+       struct inode *inode = file_inode(file);
+       struct super_block *sb = inode->i_sb;
        struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
-       substring_t args[MAX_OPT_ARGS];
-       char *p;
+       enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV;
+       ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1;
  
-       if (!options)
+       if (ctx->pos >= inode->i_size)
                return 0;
  
-       while ((p = strsep(&options, ",")) != NULL) {
-               int token;
+       if (!dir_emit_dots(file, ctx))
+               return 0;
  
-               if (!*p)
-                       continue;
+       if (ctx->pos == 2) {
+               if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones)
+                       ztype = ZONEFS_ZTYPE_SEQ;
  
-               token = match_token(p, tokens, args);
-               switch (token) {
-               case Opt_errors_ro:
-                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
-                       break;
-               case Opt_errors_zro:
-                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
-                       break;
-               case Opt_errors_zol:
-                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
-                       break;
-               case Opt_errors_repair:
-                       sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-                       sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
-                       break;
-               case Opt_explicit_open:
-                       sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
-                       break;
-               default:
-                       return -EINVAL;
-               }
+               if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+                             base_ino + ztype, DT_DIR))
+                       return 0;
+               ctx->pos++;
        }
  
-       return 0;
- }
- static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
- {
-       struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb);
-       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO)
-               seq_puts(seq, ",errors=remount-ro");
-       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)
-               seq_puts(seq, ",errors=zone-ro");
-       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)
-               seq_puts(seq, ",errors=zone-offline");
-       if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
-               seq_puts(seq, ",errors=repair");
+       if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) {
+               ztype = ZONEFS_ZTYPE_SEQ;
+               if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3,
+                             base_ino + ztype, DT_DIR))
+                       return 0;
+               ctx->pos++;
+       }
  
        return 0;
  }