zonefs: use iomap for synchronous direct writes

author Damien Le Moal <dlemoal@kernel.org>

Thu, 1 Jun 2023 08:15:41 +0000 (17:15 +0900)

committer Damien Le Moal <dlemoal@kernel.org>

Tue, 13 Jun 2023 23:51:05 +0000 (08:51 +0900)
author Damien Le Moal <dlemoal@kernel.org>
Thu, 1 Jun 2023 08:15:41 +0000 (17:15 +0900)
committer Damien Le Moal <dlemoal@kernel.org>
Tue, 13 Jun 2023 23:51:05 +0000 (08:51 +0900)
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c

index 132f01d..c34ec5b 100644 (file)
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -342,6 +342,77 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
         return generic_file_llseek_size(file, offset, whence, isize, isize);
  }
  
+struct zonefs_zone_append_bio {
+       /* The target inode of the BIO */
+       struct inode *inode;
+
+       /* For sync writes, the target append write offset */
+       u64 append_offset;
+
+       /*
+        * This member must come last, bio_alloc_bioset will allocate enough
+        * bytes for entire zonefs_bio but relies on bio being last.
+        */
+       struct bio bio;
+};
+
+static inline struct zonefs_zone_append_bio *
+zonefs_zone_append_bio(struct bio *bio)
+{
+       return container_of(bio, struct zonefs_zone_append_bio, bio);
+}
+
+static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio)
+{
+       struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
+       struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode);
+       sector_t za_sector;
+
+       if (bio->bi_status != BLK_STS_OK)
+               goto bio_end;
+
+       /*
+        * If the file zone was written underneath the file system, the zone
+        * append operation can still succedd (if the zone is not full) but
+        * the write append location will not be where we expect it to be.
+        * Check that we wrote where we intended to, that is, at z->z_wpoffset.
+        */
+       za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT);
+       if (bio->bi_iter.bi_sector != za_sector) {
+               zonefs_warn(za_bio->inode->i_sb,
+                           "Invalid write sector %llu for zone at %llu\n",
+                           bio->bi_iter.bi_sector, z->z_sector);
+               bio->bi_status = BLK_STS_IOERR;
+       }
+
+bio_end:
+       iomap_dio_bio_end_io(bio);
+}
+
+static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter,
+                                                 struct bio *bio,
+                                                 loff_t file_offset)
+{
+       struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio);
+       struct inode *inode = iter->inode;
+       struct zonefs_zone *z = zonefs_inode_zone(inode);
+
+       /*
+        * Issue a zone append BIO to process sync dio writes. The append
+        * file offset is saved to check the zone append write location
+        * on completion of the BIO.
+        */
+       za_bio->inode = inode;
+       za_bio->append_offset = file_offset;
+
+       bio->bi_opf &= ~REQ_OP_WRITE;
+       bio->bi_opf |= REQ_OP_ZONE_APPEND;
+       bio->bi_iter.bi_sector = z->z_sector;
+       bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io;
+
+       submit_bio(bio);
+}
+
  static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
                                         int error, unsigned int flags)
  {
@@ -372,93 +443,17 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
         return 0;
  }
  
-static const struct iomap_dio_ops zonefs_write_dio_ops = {
-       .end_io                 = zonefs_file_write_dio_end_io,
-};
+static struct bio_set zonefs_zone_append_bio_set;
  
-static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct inode *inode = file_inode(iocb->ki_filp);
-       struct zonefs_zone *z = zonefs_inode_zone(inode);
-       struct block_device *bdev = inode->i_sb->s_bdev;
-       unsigned int max = bdev_max_zone_append_sectors(bdev);
-       pgoff_t start, end;
-       struct bio *bio;
-       ssize_t size = 0;
-       int nr_pages;
-       ssize_t ret;
-
-       max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
-       iov_iter_truncate(from, max);
-
-       /*
-        * If the inode block size (zone write granularity) is smaller than the
-        * page size, we may be appending data belonging to the last page of the
-        * inode straddling inode->i_size, with that page already cached due to
-        * a buffered read or readahead. So make sure to invalidate that page.
-        * This will always be a no-op for the case where the block size is
-        * equal to the page size.
-        */
-       start = iocb->ki_pos >> PAGE_SHIFT;
-       end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT;
-       if (invalidate_inode_pages2_range(inode->i_mapping, start, end))
-               return -EBUSY;
-
-       nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
-       if (!nr_pages)
-               return 0;
-
-       bio = bio_alloc(bdev, nr_pages,
-                       REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
-       bio->bi_iter.bi_sector = z->z_sector;
-       bio->bi_ioprio = iocb->ki_ioprio;
-       if (iocb_is_dsync(iocb))
-               bio->bi_opf |= REQ_FUA;
-
-       ret = bio_iov_iter_get_pages(bio, from);
-       if (unlikely(ret))
-               goto out_release;
-
-       size = bio->bi_iter.bi_size;
-       task_io_account_write(size);
-
-       if (iocb->ki_flags & IOCB_HIPRI)
-               bio_set_polled(bio, iocb);
-
-       ret = submit_bio_wait(bio);
-
-       /*
-        * If the file zone was written underneath the file system, the zone
-        * write pointer may not be where we expect it to be, but the zone
-        * append write can still succeed. So check manually that we wrote where
-        * we intended to, that is, at zi->i_wpoffset.
-        */
-       if (!ret) {
-               sector_t wpsector =
-                       z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT);
-
-               if (bio->bi_iter.bi_sector != wpsector) {
-                       zonefs_warn(inode->i_sb,
-                               "Corrupted write pointer %llu for zone at %llu\n",
-                               bio->bi_iter.bi_sector, z->z_sector);
-                       ret = -EIO;
-               }
-       }
-
-       zonefs_file_write_dio_end_io(iocb, size, ret, 0);
-       trace_zonefs_file_dio_append(inode, size, ret);
-
-out_release:
-       bio_release_pages(bio, false);
-       bio_put(bio);
-
-       if (ret >= 0) {
-               iocb->ki_pos += size;
-               return size;
-       }
+static const struct iomap_dio_ops zonefs_zone_append_dio_ops = {
+       .submit_io      = zonefs_file_zone_append_dio_submit_io,
+       .end_io         = zonefs_file_write_dio_end_io,
+       .bio_set        = &zonefs_zone_append_bio_set,
+};
  
-       return ret;
-}
+static const struct iomap_dio_ops zonefs_write_dio_ops = {
+       .end_io         = zonefs_file_write_dio_end_io,
+};
  
  /*
   * Do not exceed the LFS limits nor the file zone size. If pos is under the
@@ -539,6 +534,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
         struct zonefs_inode_info *zi = ZONEFS_I(inode);
         struct zonefs_zone *z = zonefs_inode_zone(inode);
         struct super_block *sb = inode->i_sb;
+       const struct iomap_dio_ops *dio_ops;
         bool sync = is_sync_kiocb(iocb);
         bool append = false;
         ssize_t ret, count;
@@ -582,20 +578,26 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
         }
  
         if (append) {
-               ret = zonefs_file_dio_append(iocb, from);
+               unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev);
+
+               max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize);
+               iov_iter_truncate(from, max);
+
+               dio_ops = &zonefs_zone_append_dio_ops;
         } else {
-               /*
-                * iomap_dio_rw() may return ENOTBLK if there was an issue with
-                * page invalidation. Overwrite that error code with EBUSY to
-                * be consistent with zonefs_file_dio_append() return value for
-                * similar issues.
-                */
-               ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
-                                  &zonefs_write_dio_ops, 0, NULL, 0);
-               if (ret == -ENOTBLK)
-                       ret = -EBUSY;
+               dio_ops = &zonefs_write_dio_ops;
         }
  
+       /*
+        * iomap_dio_rw() may return ENOTBLK if there was an issue with
+        * page invalidation. Overwrite that error code with EBUSY so that
+        * the user can make sense of the error.
+        */
+       ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
+                          dio_ops, 0, NULL, 0);
+       if (ret == -ENOTBLK)
+               ret = -EBUSY;
+
         if (zonefs_zone_is_seq(z) &&
             (ret > 0 || ret == -EIOCBQUEUED)) {
                 if (ret > 0)
@@ -900,3 +902,15 @@ const struct file_operations zonefs_file_operations = {
         .splice_write   = iter_file_splice_write,
         .iopoll         = iocb_bio_iopoll,
  };
+
+int zonefs_file_bioset_init(void)
+{
+       return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE,
+                          offsetof(struct zonefs_zone_append_bio, bio),
+                          BIOSET_NEED_BVECS);
+}
+
+void zonefs_file_bioset_exit(void)
+{
+       bioset_exit(&zonefs_zone_append_bio_set);
+}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c

index 23b8b29..56c0011 100644 (file)
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1412,10 +1412,14 @@ static int __init zonefs_init(void)
  
         BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);
  
-       ret = zonefs_init_inodecache();
+       ret = zonefs_file_bioset_init();
         if (ret)
                 return ret;
  
+       ret = zonefs_init_inodecache();
+       if (ret)
+               goto destroy_bioset;
+
         ret = zonefs_sysfs_init();
         if (ret)
                 goto destroy_inodecache;
@@ -1430,6 +1434,8 @@ sysfs_exit:
         zonefs_sysfs_exit();
  destroy_inodecache:
         zonefs_destroy_inodecache();
+destroy_bioset:
+       zonefs_file_bioset_exit();
  
         return ret;
  }
@@ -1439,6 +1445,7 @@ static void __exit zonefs_exit(void)
         unregister_filesystem(&zonefs_type);
         zonefs_sysfs_exit();
         zonefs_destroy_inodecache();
+       zonefs_file_bioset_exit();
  }
  
  MODULE_AUTHOR("Damien Le Moal");
diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h

index 8175652..f663b8e 100644 (file)
--- a/fs/zonefs/zonefs.h
+++ b/fs/zonefs/zonefs.h
@@ -279,6 +279,8 @@ extern const struct file_operations zonefs_dir_operations;
  extern const struct address_space_operations zonefs_file_aops;
  extern const struct file_operations zonefs_file_operations;
  int zonefs_file_truncate(struct inode *inode, loff_t isize);
+int zonefs_file_bioset_init(void);
+void zonefs_file_bioset_exit(void);
  
  /* In sysfs.c */
  int zonefs_sysfs_register(struct super_block *sb);
author	Damien Le Moal <dlemoal@kernel.org>
	Thu, 1 Jun 2023 08:15:41 +0000 (17:15 +0900)
committer	Damien Le Moal <dlemoal@kernel.org>
	Tue, 13 Jun 2023 23:51:05 +0000 (08:51 +0900)
fs/zonefs/file.c		patch \| blob \| history
fs/zonefs/super.c		patch \| blob \| history
fs/zonefs/zonefs.h		patch \| blob \| history