fs/zonefs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Simple file system for zoned block devices exposing zones as files.
   4  *
   5  * Copyright (C) 2022 Western Digital Corporation or its affiliates.
   6  */
   7 #include <linux/module.h>
   8 #include <linux/pagemap.h>
   9 #include <linux/iomap.h>
  10 #include <linux/init.h>
  11 #include <linux/slab.h>
  12 #include <linux/blkdev.h>
  13 #include <linux/statfs.h>
  14 #include <linux/writeback.h>
  15 #include <linux/quotaops.h>
  16 #include <linux/seq_file.h>
  17 #include <linux/parser.h>
  18 #include <linux/uio.h>
  19 #include <linux/mman.h>
  20 #include <linux/sched/mm.h>
  21 #include <linux/task_io_accounting_ops.h>
  22
  23 #include "zonefs.h"
  24
  25 #include "trace.h"
  26
  27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
  28                                    loff_t length, unsigned int flags,
  29                                    struct iomap *iomap, struct iomap *srcmap)
  30 {
  31         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  32         struct zonefs_zone *z = zonefs_inode_zone(inode);
  33         struct super_block *sb = inode->i_sb;
  34         loff_t isize;
  35
  36         /*
  37          * All blocks are always mapped below EOF. If reading past EOF,
  38          * act as if there is a hole up to the file maximum size.
  39          */
  40         mutex_lock(&zi->i_truncate_mutex);
  41         iomap->bdev = inode->i_sb->s_bdev;
  42         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  43         isize = i_size_read(inode);
  44         if (iomap->offset >= isize) {
  45                 iomap->type = IOMAP_HOLE;
  46                 iomap->addr = IOMAP_NULL_ADDR;
  47                 iomap->length = length;
  48         } else {
  49                 iomap->type = IOMAP_MAPPED;
  50                 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  51                 iomap->length = isize - iomap->offset;
  52         }
  53         mutex_unlock(&zi->i_truncate_mutex);
  54
  55         trace_zonefs_iomap_begin(inode, iomap);
  56
  57         return 0;
  58 }
  59
  60 static const struct iomap_ops zonefs_read_iomap_ops = {
  61         .iomap_begin    = zonefs_read_iomap_begin,
  62 };
  63
  64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
  65                                     loff_t length, unsigned int flags,
  66                                     struct iomap *iomap, struct iomap *srcmap)
  67 {
  68         struct zonefs_inode_info *zi = ZONEFS_I(inode);
  69         struct zonefs_zone *z = zonefs_inode_zone(inode);
  70         struct super_block *sb = inode->i_sb;
  71         loff_t isize;
  72
  73         /* All write I/Os should always be within the file maximum size */
  74         if (WARN_ON_ONCE(offset + length > z->z_capacity))
  75                 return -EIO;
  76
  77         /*
  78          * Sequential zones can only accept direct writes. This is already
  79          * checked when writes are issued, so warn if we see a page writeback
  80          * operation.
  81          */
  82         if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
  83                 return -EIO;
  84
  85         /*
  86          * For conventional zones, all blocks are always mapped. For sequential
  87          * zones, all blocks after always mapped below the inode size (zone
  88          * write pointer) and unwriten beyond.
  89          */
  90         mutex_lock(&zi->i_truncate_mutex);
  91         iomap->bdev = inode->i_sb->s_bdev;
  92         iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
  93         iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
  94         isize = i_size_read(inode);
  95         if (iomap->offset >= isize) {
  96                 iomap->type = IOMAP_UNWRITTEN;
  97                 iomap->length = z->z_capacity - iomap->offset;
  98         } else {
  99                 iomap->type = IOMAP_MAPPED;
 100                 iomap->length = isize - iomap->offset;
 101         }
 102         mutex_unlock(&zi->i_truncate_mutex);
 103
 104         trace_zonefs_iomap_begin(inode, iomap);
 105
 106         return 0;
 107 }
 108
 109 static const struct iomap_ops zonefs_write_iomap_ops = {
 110         .iomap_begin    = zonefs_write_iomap_begin,
 111 };
 112
 113 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 114 {
 115         return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 116 }
 117
 118 static void zonefs_readahead(struct readahead_control *rac)
 119 {
 120         iomap_readahead(rac, &zonefs_read_iomap_ops);
 121 }
 122
 123 /*
 124  * Map blocks for page writeback. This is used only on conventional zone files,
 125  * which implies that the page range can only be within the fixed inode size.
 126  */
 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
 128                                    struct inode *inode, loff_t offset)
 129 {
 130         struct zonefs_zone *z = zonefs_inode_zone(inode);
 131
 132         if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 133                 return -EIO;
 134         if (WARN_ON_ONCE(offset >= i_size_read(inode)))
 135                 return -EIO;
 136
 137         /* If the mapping is already OK, nothing needs to be done */
 138         if (offset >= wpc->iomap.offset &&
 139             offset < wpc->iomap.offset + wpc->iomap.length)
 140                 return 0;
 141
 142         return zonefs_write_iomap_begin(inode, offset,
 143                                         z->z_capacity - offset,
 144                                         IOMAP_WRITE, &wpc->iomap, NULL);
 145 }
 146
 147 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 148         .map_blocks             = zonefs_write_map_blocks,
 149 };
 150
 151 static int zonefs_writepages(struct address_space *mapping,
 152                              struct writeback_control *wbc)
 153 {
 154         struct iomap_writepage_ctx wpc = { };
 155
 156         return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
 157 }
 158
 159 static int zonefs_swap_activate(struct swap_info_struct *sis,
 160                                 struct file *swap_file, sector_t *span)
 161 {
 162         struct inode *inode = file_inode(swap_file);
 163
 164         if (zonefs_inode_is_seq(inode)) {
 165                 zonefs_err(inode->i_sb,
 166                            "swap file: not a conventional zone file\n");
 167                 return -EINVAL;
 168         }
 169
 170         return iomap_swapfile_activate(sis, swap_file, span,
 171                                        &zonefs_read_iomap_ops);
 172 }
 173
 174 const struct address_space_operations zonefs_file_aops = {
 175         .read_folio             = zonefs_read_folio,
 176         .readahead              = zonefs_readahead,
 177         .writepages             = zonefs_writepages,
 178         .dirty_folio            = iomap_dirty_folio,
 179         .release_folio          = iomap_release_folio,
 180         .invalidate_folio       = iomap_invalidate_folio,
 181         .migrate_folio          = filemap_migrate_folio,
 182         .is_partially_uptodate  = iomap_is_partially_uptodate,
 183         .error_remove_page      = generic_error_remove_page,
 184         .swap_activate          = zonefs_swap_activate,
 185 };
 186
 187 int zonefs_file_truncate(struct inode *inode, loff_t isize)
 188 {
 189         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 190         struct zonefs_zone *z = zonefs_inode_zone(inode);
 191         loff_t old_isize;
 192         enum req_op op;
 193         int ret = 0;
 194
 195         /*
 196          * Only sequential zone files can be truncated and truncation is allowed
 197          * only down to a 0 size, which is equivalent to a zone reset, and to
 198          * the maximum file size, which is equivalent to a zone finish.
 199          */
 200         if (!zonefs_zone_is_seq(z))
 201                 return -EPERM;
 202
 203         if (!isize)
 204                 op = REQ_OP_ZONE_RESET;
 205         else if (isize == z->z_capacity)
 206                 op = REQ_OP_ZONE_FINISH;
 207         else
 208                 return -EPERM;
 209
 210         inode_dio_wait(inode);
 211
 212         /* Serialize against page faults */
 213         filemap_invalidate_lock(inode->i_mapping);
 214
 215         /* Serialize against zonefs_iomap_begin() */
 216         mutex_lock(&zi->i_truncate_mutex);
 217
 218         old_isize = i_size_read(inode);
 219         if (isize == old_isize)
 220                 goto unlock;
 221
 222         ret = zonefs_inode_zone_mgmt(inode, op);
 223         if (ret)
 224                 goto unlock;
 225
 226         /*
 227          * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
 228          * take care of open zones.
 229          */
 230         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 231                 /*
 232                  * Truncating a zone to EMPTY or FULL is the equivalent of
 233                  * closing the zone. For a truncation to 0, we need to
 234                  * re-open the zone to ensure new writes can be processed.
 235                  * For a truncation to the maximum file size, the zone is
 236                  * closed and writes cannot be accepted anymore, so clear
 237                  * the open flag.
 238                  */
 239                 if (!isize)
 240                         ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
 241                 else
 242                         z->z_flags &= ~ZONEFS_ZONE_OPEN;
 243         }
 244
 245         zonefs_update_stats(inode, isize);
 246         truncate_setsize(inode, isize);
 247         z->z_wpoffset = isize;
 248         zonefs_inode_account_active(inode);
 249
 250 unlock:
 251         mutex_unlock(&zi->i_truncate_mutex);
 252         filemap_invalidate_unlock(inode->i_mapping);
 253
 254         return ret;
 255 }
 256
 257 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
 258                              int datasync)
 259 {
 260         struct inode *inode = file_inode(file);
 261         int ret = 0;
 262
 263         if (unlikely(IS_IMMUTABLE(inode)))
 264                 return -EPERM;
 265
 266         /*
 267          * Since only direct writes are allowed in sequential files, page cache
 268          * flush is needed only for conventional zone files.
 269          */
 270         if (zonefs_inode_is_cnv(inode))
 271                 ret = file_write_and_wait_range(file, start, end);
 272         if (!ret)
 273                 ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 274
 275         if (ret)
 276                 zonefs_io_error(inode, true);
 277
 278         return ret;
 279 }
 280
 281 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
 282 {
 283         struct inode *inode = file_inode(vmf->vma->vm_file);
 284         vm_fault_t ret;
 285
 286         if (unlikely(IS_IMMUTABLE(inode)))
 287                 return VM_FAULT_SIGBUS;
 288
 289         /*
 290          * Sanity check: only conventional zone files can have shared
 291          * writeable mappings.
 292          */
 293         if (zonefs_inode_is_seq(inode))
 294                 return VM_FAULT_NOPAGE;
 295
 296         sb_start_pagefault(inode->i_sb);
 297         file_update_time(vmf->vma->vm_file);
 298
 299         /* Serialize against truncates */
 300         filemap_invalidate_lock_shared(inode->i_mapping);
 301         ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 302         filemap_invalidate_unlock_shared(inode->i_mapping);
 303
 304         sb_end_pagefault(inode->i_sb);
 305         return ret;
 306 }
 307
 308 static const struct vm_operations_struct zonefs_file_vm_ops = {
 309         .fault          = filemap_fault,
 310         .map_pages      = filemap_map_pages,
 311         .page_mkwrite   = zonefs_filemap_page_mkwrite,
 312 };
 313
 314 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 315 {
 316         /*
 317          * Conventional zones accept random writes, so their files can support
 318          * shared writable mappings. For sequential zone files, only read
 319          * mappings are possible since there are no guarantees for write
 320          * ordering between msync() and page cache writeback.
 321          */
 322         if (zonefs_inode_is_seq(file_inode(file)) &&
 323             (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
 324                 return -EINVAL;
 325
 326         file_accessed(file);
 327         vma->vm_ops = &zonefs_file_vm_ops;
 328
 329         return 0;
 330 }
 331
 332 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
 333 {
 334         loff_t isize = i_size_read(file_inode(file));
 335
 336         /*
 337          * Seeks are limited to below the zone size for conventional zones
 338          * and below the zone write pointer for sequential zones. In both
 339          * cases, this limit is the inode size.
 340          */
 341         return generic_file_llseek_size(file, offset, whence, isize, isize);
 342 }
 343
 344 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
 345                                         int error, unsigned int flags)
 346 {
 347         struct inode *inode = file_inode(iocb->ki_filp);
 348         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 349
 350         if (error) {
 351                 zonefs_io_error(inode, true);
 352                 return error;
 353         }
 354
 355         if (size && zonefs_inode_is_seq(inode)) {
 356                 /*
 357                  * Note that we may be seeing completions out of order,
 358                  * but that is not a problem since a write completed
 359                  * successfully necessarily means that all preceding writes
 360                  * were also successful. So we can safely increase the inode
 361                  * size to the write end location.
 362                  */
 363                 mutex_lock(&zi->i_truncate_mutex);
 364                 if (i_size_read(inode) < iocb->ki_pos + size) {
 365                         zonefs_update_stats(inode, iocb->ki_pos + size);
 366                         zonefs_i_size_write(inode, iocb->ki_pos + size);
 367                 }
 368                 mutex_unlock(&zi->i_truncate_mutex);
 369         }
 370
 371         return 0;
 372 }
 373
 374 static const struct iomap_dio_ops zonefs_write_dio_ops = {
 375         .end_io         = zonefs_file_write_dio_end_io,
 376 };
 377
 378 /*
 379  * Do not exceed the LFS limits nor the file zone size. If pos is under the
 380  * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 381  */
 382 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
 383                                         loff_t count)
 384 {
 385         struct inode *inode = file_inode(file);
 386         struct zonefs_zone *z = zonefs_inode_zone(inode);
 387         loff_t limit = rlimit(RLIMIT_FSIZE);
 388         loff_t max_size = z->z_capacity;
 389
 390         if (limit != RLIM_INFINITY) {
 391                 if (pos >= limit) {
 392                         send_sig(SIGXFSZ, current, 0);
 393                         return -EFBIG;
 394                 }
 395                 count = min(count, limit - pos);
 396         }
 397
 398         if (!(file->f_flags & O_LARGEFILE))
 399                 max_size = min_t(loff_t, MAX_NON_LFS, max_size);
 400
 401         if (unlikely(pos >= max_size))
 402                 return -EFBIG;
 403
 404         return min(count, max_size - pos);
 405 }
 406
 407 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
 408 {
 409         struct file *file = iocb->ki_filp;
 410         struct inode *inode = file_inode(file);
 411         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 412         struct zonefs_zone *z = zonefs_inode_zone(inode);
 413         loff_t count;
 414
 415         if (IS_SWAPFILE(inode))
 416                 return -ETXTBSY;
 417
 418         if (!iov_iter_count(from))
 419                 return 0;
 420
 421         if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
 422                 return -EINVAL;
 423
 424         if (iocb->ki_flags & IOCB_APPEND) {
 425                 if (zonefs_zone_is_cnv(z))
 426                         return -EINVAL;
 427                 mutex_lock(&zi->i_truncate_mutex);
 428                 iocb->ki_pos = z->z_wpoffset;
 429                 mutex_unlock(&zi->i_truncate_mutex);
 430         }
 431
 432         count = zonefs_write_check_limits(file, iocb->ki_pos,
 433                                           iov_iter_count(from));
 434         if (count < 0)
 435                 return count;
 436
 437         iov_iter_truncate(from, count);
 438         return iov_iter_count(from);
 439 }
 440
 441 /*
 442  * Handle direct writes. For sequential zone files, this is the only possible
 443  * write path. For these files, check that the user is issuing writes
 444  * sequentially from the end of the file. This code assumes that the block layer
 445  * delivers write requests to the device in sequential order. This is always the
 446  * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 447  * elevator feature is being used (e.g. mq-deadline). The block layer always
 448  * automatically select such an elevator for zoned block devices during the
 449  * device initialization.
 450  */
 451 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
 452 {
 453         struct inode *inode = file_inode(iocb->ki_filp);
 454         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 455         struct zonefs_zone *z = zonefs_inode_zone(inode);
 456         struct super_block *sb = inode->i_sb;
 457         ssize_t ret, count;
 458
 459         /*
 460          * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
 461          * as this can cause write reordering (e.g. the first aio gets EAGAIN
 462          * on the inode lock but the second goes through but is now unaligned).
 463          */
 464         if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
 465             (iocb->ki_flags & IOCB_NOWAIT))
 466                 return -EOPNOTSUPP;
 467
 468         if (iocb->ki_flags & IOCB_NOWAIT) {
 469                 if (!inode_trylock(inode))
 470                         return -EAGAIN;
 471         } else {
 472                 inode_lock(inode);
 473         }
 474
 475         count = zonefs_write_checks(iocb, from);
 476         if (count <= 0) {
 477                 ret = count;
 478                 goto inode_unlock;
 479         }
 480
 481         if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 482                 ret = -EINVAL;
 483                 goto inode_unlock;
 484         }
 485
 486         /* Enforce sequential writes (append only) in sequential zones */
 487         if (zonefs_zone_is_seq(z)) {
 488                 mutex_lock(&zi->i_truncate_mutex);
 489                 if (iocb->ki_pos != z->z_wpoffset) {
 490                         mutex_unlock(&zi->i_truncate_mutex);
 491                         ret = -EINVAL;
 492                         goto inode_unlock;
 493                 }
 494                 mutex_unlock(&zi->i_truncate_mutex);
 495         }
 496
 497         /*
 498          * iomap_dio_rw() may return ENOTBLK if there was an issue with
 499          * page invalidation. Overwrite that error code with EBUSY so that
 500          * the user can make sense of the error.
 501          */
 502         ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 503                            &zonefs_write_dio_ops, 0, NULL, 0);
 504         if (ret == -ENOTBLK)
 505                 ret = -EBUSY;
 506
 507         if (zonefs_zone_is_seq(z) &&
 508             (ret > 0 || ret == -EIOCBQUEUED)) {
 509                 if (ret > 0)
 510                         count = ret;
 511
 512                 /*
 513                  * Update the zone write pointer offset assuming the write
 514                  * operation succeeded. If it did not, the error recovery path
 515                  * will correct it. Also do active seq file accounting.
 516                  */
 517                 mutex_lock(&zi->i_truncate_mutex);
 518                 z->z_wpoffset += count;
 519                 zonefs_inode_account_active(inode);
 520                 mutex_unlock(&zi->i_truncate_mutex);
 521         }
 522
 523 inode_unlock:
 524         inode_unlock(inode);
 525
 526         return ret;
 527 }
 528
 529 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 530                                           struct iov_iter *from)
 531 {
 532         struct inode *inode = file_inode(iocb->ki_filp);
 533         ssize_t ret;
 534
 535         /*
 536          * Direct IO writes are mandatory for sequential zone files so that the
 537          * write IO issuing order is preserved.
 538          */
 539         if (zonefs_inode_is_seq(inode))
 540                 return -EIO;
 541
 542         if (iocb->ki_flags & IOCB_NOWAIT) {
 543                 if (!inode_trylock(inode))
 544                         return -EAGAIN;
 545         } else {
 546                 inode_lock(inode);
 547         }
 548
 549         ret = zonefs_write_checks(iocb, from);
 550         if (ret <= 0)
 551                 goto inode_unlock;
 552
 553         ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 554         if (ret == -EIO)
 555                 zonefs_io_error(inode, true);
 556
 557 inode_unlock:
 558         inode_unlock(inode);
 559         if (ret > 0)
 560                 ret = generic_write_sync(iocb, ret);
 561
 562         return ret;
 563 }
 564
 565 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 566 {
 567         struct inode *inode = file_inode(iocb->ki_filp);
 568         struct zonefs_zone *z = zonefs_inode_zone(inode);
 569
 570         if (unlikely(IS_IMMUTABLE(inode)))
 571                 return -EPERM;
 572
 573         if (sb_rdonly(inode->i_sb))
 574                 return -EROFS;
 575
 576         /* Write operations beyond the zone capacity are not allowed */
 577         if (iocb->ki_pos >= z->z_capacity)
 578                 return -EFBIG;
 579
 580         if (iocb->ki_flags & IOCB_DIRECT) {
 581                 ssize_t ret = zonefs_file_dio_write(iocb, from);
 582
 583                 if (ret != -ENOTBLK)
 584                         return ret;
 585         }
 586
 587         return zonefs_file_buffered_write(iocb, from);
 588 }
 589
 590 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
 591                                        int error, unsigned int flags)
 592 {
 593         if (error) {
 594                 zonefs_io_error(file_inode(iocb->ki_filp), false);
 595                 return error;
 596         }
 597
 598         return 0;
 599 }
 600
 601 static const struct iomap_dio_ops zonefs_read_dio_ops = {
 602         .end_io                 = zonefs_file_read_dio_end_io,
 603 };
 604
 605 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 606 {
 607         struct inode *inode = file_inode(iocb->ki_filp);
 608         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 609         struct zonefs_zone *z = zonefs_inode_zone(inode);
 610         struct super_block *sb = inode->i_sb;
 611         loff_t isize;
 612         ssize_t ret;
 613
 614         /* Offline zones cannot be read */
 615         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 616                 return -EPERM;
 617
 618         if (iocb->ki_pos >= z->z_capacity)
 619                 return 0;
 620
 621         if (iocb->ki_flags & IOCB_NOWAIT) {
 622                 if (!inode_trylock_shared(inode))
 623                         return -EAGAIN;
 624         } else {
 625                 inode_lock_shared(inode);
 626         }
 627
 628         /* Limit read operations to written data */
 629         mutex_lock(&zi->i_truncate_mutex);
 630         isize = i_size_read(inode);
 631         if (iocb->ki_pos >= isize) {
 632                 mutex_unlock(&zi->i_truncate_mutex);
 633                 ret = 0;
 634                 goto inode_unlock;
 635         }
 636         iov_iter_truncate(to, isize - iocb->ki_pos);
 637         mutex_unlock(&zi->i_truncate_mutex);
 638
 639         if (iocb->ki_flags & IOCB_DIRECT) {
 640                 size_t count = iov_iter_count(to);
 641
 642                 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
 643                         ret = -EINVAL;
 644                         goto inode_unlock;
 645                 }
 646                 file_accessed(iocb->ki_filp);
 647                 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 648                                    &zonefs_read_dio_ops, 0, NULL, 0);
 649         } else {
 650                 ret = generic_file_read_iter(iocb, to);
 651                 if (ret == -EIO)
 652                         zonefs_io_error(inode, false);
 653         }
 654
 655 inode_unlock:
 656         inode_unlock_shared(inode);
 657
 658         return ret;
 659 }
 660
 661 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
 662                                        struct pipe_inode_info *pipe,
 663                                        size_t len, unsigned int flags)
 664 {
 665         struct inode *inode = file_inode(in);
 666         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 667         struct zonefs_zone *z = zonefs_inode_zone(inode);
 668         loff_t isize;
 669         ssize_t ret = 0;
 670
 671         /* Offline zones cannot be read */
 672         if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
 673                 return -EPERM;
 674
 675         if (*ppos >= z->z_capacity)
 676                 return 0;
 677
 678         inode_lock_shared(inode);
 679
 680         /* Limit read operations to written data */
 681         mutex_lock(&zi->i_truncate_mutex);
 682         isize = i_size_read(inode);
 683         if (*ppos >= isize)
 684                 len = 0;
 685         else
 686                 len = min_t(loff_t, len, isize - *ppos);
 687         mutex_unlock(&zi->i_truncate_mutex);
 688
 689         if (len > 0) {
 690                 ret = filemap_splice_read(in, ppos, pipe, len, flags);
 691                 if (ret == -EIO)
 692                         zonefs_io_error(inode, false);
 693         }
 694
 695         inode_unlock_shared(inode);
 696         return ret;
 697 }
 698
 699 /*
 700  * Write open accounting is done only for sequential files.
 701  */
 702 static inline bool zonefs_seq_file_need_wro(struct inode *inode,
 703                                             struct file *file)
 704 {
 705         if (zonefs_inode_is_cnv(inode))
 706                 return false;
 707
 708         if (!(file->f_mode & FMODE_WRITE))
 709                 return false;
 710
 711         return true;
 712 }
 713
 714 static int zonefs_seq_file_write_open(struct inode *inode)
 715 {
 716         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 717         struct zonefs_zone *z = zonefs_inode_zone(inode);
 718         int ret = 0;
 719
 720         mutex_lock(&zi->i_truncate_mutex);
 721
 722         if (!zi->i_wr_refcnt) {
 723                 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
 724                 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
 725
 726                 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
 727
 728                         if (sbi->s_max_wro_seq_files
 729                             && wro > sbi->s_max_wro_seq_files) {
 730                                 atomic_dec(&sbi->s_wro_seq_files);
 731                                 ret = -EBUSY;
 732                                 goto unlock;
 733                         }
 734
 735                         if (i_size_read(inode) < z->z_capacity) {
 736                                 ret = zonefs_inode_zone_mgmt(inode,
 737                                                              REQ_OP_ZONE_OPEN);
 738                                 if (ret) {
 739                                         atomic_dec(&sbi->s_wro_seq_files);
 740                                         goto unlock;
 741                                 }
 742                                 z->z_flags |= ZONEFS_ZONE_OPEN;
 743                                 zonefs_inode_account_active(inode);
 744                         }
 745                 }
 746         }
 747
 748         zi->i_wr_refcnt++;
 749
 750 unlock:
 751         mutex_unlock(&zi->i_truncate_mutex);
 752
 753         return ret;
 754 }
 755
 756 static int zonefs_file_open(struct inode *inode, struct file *file)
 757 {
 758         int ret;
 759
 760         file->f_mode |= FMODE_CAN_ODIRECT;
 761         ret = generic_file_open(inode, file);
 762         if (ret)
 763                 return ret;
 764
 765         if (zonefs_seq_file_need_wro(inode, file))
 766                 return zonefs_seq_file_write_open(inode);
 767
 768         return 0;
 769 }
 770
 771 static void zonefs_seq_file_write_close(struct inode *inode)
 772 {
 773         struct zonefs_inode_info *zi = ZONEFS_I(inode);
 774         struct zonefs_zone *z = zonefs_inode_zone(inode);
 775         struct super_block *sb = inode->i_sb;
 776         struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
 777         int ret = 0;
 778
 779         mutex_lock(&zi->i_truncate_mutex);
 780
 781         zi->i_wr_refcnt--;
 782         if (zi->i_wr_refcnt)
 783                 goto unlock;
 784
 785         /*
 786          * The file zone may not be open anymore (e.g. the file was truncated to
 787          * its maximum size or it was fully written). For this case, we only
 788          * need to decrement the write open count.
 789          */
 790         if (z->z_flags & ZONEFS_ZONE_OPEN) {
 791                 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
 792                 if (ret) {
 793                         __zonefs_io_error(inode, false);
 794                         /*
 795                          * Leaving zones explicitly open may lead to a state
 796                          * where most zones cannot be written (zone resources
 797                          * exhausted). So take preventive action by remounting
 798                          * read-only.
 799                          */
 800                         if (z->z_flags & ZONEFS_ZONE_OPEN &&
 801                             !(sb->s_flags & SB_RDONLY)) {
 802                                 zonefs_warn(sb,
 803                                         "closing zone at %llu failed %d\n",
 804                                         z->z_sector, ret);
 805                                 zonefs_warn(sb,
 806                                         "remounting filesystem read-only\n");
 807                                 sb->s_flags |= SB_RDONLY;
 808                         }
 809                         goto unlock;
 810                 }
 811
 812                 z->z_flags &= ~ZONEFS_ZONE_OPEN;
 813                 zonefs_inode_account_active(inode);
 814         }
 815
 816         atomic_dec(&sbi->s_wro_seq_files);
 817
 818 unlock:
 819         mutex_unlock(&zi->i_truncate_mutex);
 820 }
 821
 822 static int zonefs_file_release(struct inode *inode, struct file *file)
 823 {
 824         /*
 825          * If we explicitly open a zone we must close it again as well, but the
 826          * zone management operation can fail (either due to an IO error or as
 827          * the zone has gone offline or read-only). Make sure we don't fail the
 828          * close(2) for user-space.
 829          */
 830         if (zonefs_seq_file_need_wro(inode, file))
 831                 zonefs_seq_file_write_close(inode);
 832
 833         return 0;
 834 }
 835
 836 const struct file_operations zonefs_file_operations = {
 837         .open           = zonefs_file_open,
 838         .release        = zonefs_file_release,
 839         .fsync          = zonefs_file_fsync,
 840         .mmap           = zonefs_file_mmap,
 841         .llseek         = zonefs_file_llseek,
 842         .read_iter      = zonefs_file_read_iter,
 843         .write_iter     = zonefs_file_write_iter,
 844         .splice_read    = zonefs_file_splice_read,
 845         .splice_write   = iter_file_splice_write,
 846         .iopoll         = iocb_bio_iopoll,
 847 };