fs/btrfs/inode.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <crypto/hash.h>
   7 #include <linux/kernel.h>
   8 #include <linux/bio.h>
   9 #include <linux/blk-cgroup.h>
  10 #include <linux/file.h>
  11 #include <linux/fs.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/highmem.h>
  14 #include <linux/time.h>
  15 #include <linux/init.h>
  16 #include <linux/string.h>
  17 #include <linux/backing-dev.h>
  18 #include <linux/writeback.h>
  19 #include <linux/compat.h>
  20 #include <linux/xattr.h>
  21 #include <linux/posix_acl.h>
  22 #include <linux/falloc.h>
  23 #include <linux/slab.h>
  24 #include <linux/ratelimit.h>
  25 #include <linux/btrfs.h>
  26 #include <linux/blkdev.h>
  27 #include <linux/posix_acl_xattr.h>
  28 #include <linux/uio.h>
  29 #include <linux/magic.h>
  30 #include <linux/iversion.h>
  31 #include <linux/swap.h>
  32 #include <linux/migrate.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/iomap.h>
  35 #include <asm/unaligned.h>
  36 #include <linux/fsverity.h>
  37 #include "misc.h"
  38 #include "ctree.h"
  39 #include "disk-io.h"
  40 #include "transaction.h"
  41 #include "btrfs_inode.h"
  42 #include "print-tree.h"
  43 #include "ordered-data.h"
  44 #include "xattr.h"
  45 #include "tree-log.h"
  46 #include "bio.h"
  47 #include "compression.h"
  48 #include "locking.h"
  49 #include "free-space-cache.h"
  50 #include "props.h"
  51 #include "qgroup.h"
  52 #include "delalloc-space.h"
  53 #include "block-group.h"
  54 #include "space-info.h"
  55 #include "zoned.h"
  56 #include "subpage.h"
  57 #include "inode-item.h"
  58 #include "fs.h"
  59 #include "accessors.h"
  60 #include "extent-tree.h"
  61 #include "root-tree.h"
  62 #include "defrag.h"
  63 #include "dir-item.h"
  64 #include "file-item.h"
  65 #include "uuid-tree.h"
  66 #include "ioctl.h"
  67 #include "file.h"
  68 #include "acl.h"
  69 #include "relocation.h"
  70 #include "verity.h"
  71 #include "super.h"
  72 #include "orphan.h"
  73 #include "backref.h"
  74
  75 struct btrfs_iget_args {
  76         u64 ino;
  77         struct btrfs_root *root;
  78 };
  79
  80 struct btrfs_dio_data {
  81         ssize_t submitted;
  82         struct extent_changeset *data_reserved;
  83         struct btrfs_ordered_extent *ordered;
  84         bool data_space_reserved;
  85         bool nocow_done;
  86 };
  87
  88 struct btrfs_dio_private {
  89         /* Range of I/O */
  90         u64 file_offset;
  91         u32 bytes;
  92
  93         /* This must be last */
  94         struct btrfs_bio bbio;
  95 };
  96
  97 static struct bio_set btrfs_dio_bioset;
  98
  99 struct btrfs_rename_ctx {
 100         /* Output field. Stores the index number of the old directory entry. */
 101         u64 index;
 102 };
 103
 104 /*
 105  * Used by data_reloc_print_warning_inode() to pass needed info for filename
 106  * resolution and output of error message.
 107  */
 108 struct data_reloc_warn {
 109         struct btrfs_path path;
 110         struct btrfs_fs_info *fs_info;
 111         u64 extent_item_size;
 112         u64 logical;
 113         int mirror_num;
 114 };
 115
 116 static const struct inode_operations btrfs_dir_inode_operations;
 117 static const struct inode_operations btrfs_symlink_inode_operations;
 118 static const struct inode_operations btrfs_special_inode_operations;
 119 static const struct inode_operations btrfs_file_inode_operations;
 120 static const struct address_space_operations btrfs_aops;
 121 static const struct file_operations btrfs_dir_file_operations;
 122
 123 static struct kmem_cache *btrfs_inode_cachep;
 124
 125 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 126 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
 127
 128 static noinline int cow_file_range(struct btrfs_inode *inode,
 129                                    struct page *locked_page,
 130                                    u64 start, u64 end, u64 *done_offset,
 131                                    bool keep_locked, bool no_inline);
 132 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
 133                                        u64 len, u64 orig_start, u64 block_start,
 134                                        u64 block_len, u64 orig_block_len,
 135                                        u64 ram_bytes, int compress_type,
 136                                        int type);
 137
 138 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
 139                                           u64 root, void *warn_ctx)
 140 {
 141         struct data_reloc_warn *warn = warn_ctx;
 142         struct btrfs_fs_info *fs_info = warn->fs_info;
 143         struct extent_buffer *eb;
 144         struct btrfs_inode_item *inode_item;
 145         struct inode_fs_paths *ipath = NULL;
 146         struct btrfs_root *local_root;
 147         struct btrfs_key key;
 148         unsigned int nofs_flag;
 149         u32 nlink;
 150         int ret;
 151
 152         local_root = btrfs_get_fs_root(fs_info, root, true);
 153         if (IS_ERR(local_root)) {
 154                 ret = PTR_ERR(local_root);
 155                 goto err;
 156         }
 157
 158         /* This makes the path point to (inum INODE_ITEM ioff). */
 159         key.objectid = inum;
 160         key.type = BTRFS_INODE_ITEM_KEY;
 161         key.offset = 0;
 162
 163         ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
 164         if (ret) {
 165                 btrfs_put_root(local_root);
 166                 btrfs_release_path(&warn->path);
 167                 goto err;
 168         }
 169
 170         eb = warn->path.nodes[0];
 171         inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
 172         nlink = btrfs_inode_nlink(eb, inode_item);
 173         btrfs_release_path(&warn->path);
 174
 175         nofs_flag = memalloc_nofs_save();
 176         ipath = init_ipath(4096, local_root, &warn->path);
 177         memalloc_nofs_restore(nofs_flag);
 178         if (IS_ERR(ipath)) {
 179                 btrfs_put_root(local_root);
 180                 ret = PTR_ERR(ipath);
 181                 ipath = NULL;
 182                 /*
 183                  * -ENOMEM, not a critical error, just output an generic error
 184                  * without filename.
 185                  */
 186                 btrfs_warn(fs_info,
 187 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
 188                            warn->logical, warn->mirror_num, root, inum, offset);
 189                 return ret;
 190         }
 191         ret = paths_from_inode(inum, ipath);
 192         if (ret < 0)
 193                 goto err;
 194
 195         /*
 196          * We deliberately ignore the bit ipath might have been too small to
 197          * hold all of the paths here
 198          */
 199         for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
 200                 btrfs_warn(fs_info,
 201 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
 202                            warn->logical, warn->mirror_num, root, inum, offset,
 203                            fs_info->sectorsize, nlink,
 204                            (char *)(unsigned long)ipath->fspath->val[i]);
 205         }
 206
 207         btrfs_put_root(local_root);
 208         free_ipath(ipath);
 209         return 0;
 210
 211 err:
 212         btrfs_warn(fs_info,
 213 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
 214                    warn->logical, warn->mirror_num, root, inum, offset, ret);
 215
 216         free_ipath(ipath);
 217         return ret;
 218 }
 219
 220 /*
 221  * Do extra user-friendly error output (e.g. lookup all the affected files).
 222  *
 223  * Return true if we succeeded doing the backref lookup.
 224  * Return false if such lookup failed, and has to fallback to the old error message.
 225  */
 226 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
 227                                    const u8 *csum, const u8 *csum_expected,
 228                                    int mirror_num)
 229 {
 230         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 231         struct btrfs_path path = { 0 };
 232         struct btrfs_key found_key = { 0 };
 233         struct extent_buffer *eb;
 234         struct btrfs_extent_item *ei;
 235         const u32 csum_size = fs_info->csum_size;
 236         u64 logical;
 237         u64 flags;
 238         u32 item_size;
 239         int ret;
 240
 241         mutex_lock(&fs_info->reloc_mutex);
 242         logical = btrfs_get_reloc_bg_bytenr(fs_info);
 243         mutex_unlock(&fs_info->reloc_mutex);
 244
 245         if (logical == U64_MAX) {
 246                 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
 247                 btrfs_warn_rl(fs_info,
 248 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 249                         inode->root->root_key.objectid, btrfs_ino(inode), file_off,
 250                         CSUM_FMT_VALUE(csum_size, csum),
 251                         CSUM_FMT_VALUE(csum_size, csum_expected),
 252                         mirror_num);
 253                 return;
 254         }
 255
 256         logical += file_off;
 257         btrfs_warn_rl(fs_info,
 258 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 259                         inode->root->root_key.objectid,
 260                         btrfs_ino(inode), file_off, logical,
 261                         CSUM_FMT_VALUE(csum_size, csum),
 262                         CSUM_FMT_VALUE(csum_size, csum_expected),
 263                         mirror_num);
 264
 265         ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
 266         if (ret < 0) {
 267                 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
 268                              logical, ret);
 269                 return;
 270         }
 271         eb = path.nodes[0];
 272         ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
 273         item_size = btrfs_item_size(eb, path.slots[0]);
 274         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 275                 unsigned long ptr = 0;
 276                 u64 ref_root;
 277                 u8 ref_level;
 278
 279                 while (true) {
 280                         ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 281                                                       item_size, &ref_root,
 282                                                       &ref_level);
 283                         if (ret < 0) {
 284                                 btrfs_warn_rl(fs_info,
 285                                 "failed to resolve tree backref for logical %llu: %d",
 286                                               logical, ret);
 287                                 break;
 288                         }
 289                         if (ret > 0)
 290                                 break;
 291
 292                         btrfs_warn_rl(fs_info,
 293 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
 294                                 logical, mirror_num,
 295                                 (ref_level ? "node" : "leaf"),
 296                                 ref_level, ref_root);
 297                 }
 298                 btrfs_release_path(&path);
 299         } else {
 300                 struct btrfs_backref_walk_ctx ctx = { 0 };
 301                 struct data_reloc_warn reloc_warn = { 0 };
 302
 303                 btrfs_release_path(&path);
 304
 305                 ctx.bytenr = found_key.objectid;
 306                 ctx.extent_item_pos = logical - found_key.objectid;
 307                 ctx.fs_info = fs_info;
 308
 309                 reloc_warn.logical = logical;
 310                 reloc_warn.extent_item_size = found_key.offset;
 311                 reloc_warn.mirror_num = mirror_num;
 312                 reloc_warn.fs_info = fs_info;
 313
 314                 iterate_extent_inodes(&ctx, true,
 315                                       data_reloc_print_warning_inode, &reloc_warn);
 316         }
 317 }
 318
 319 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
 320                 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
 321 {
 322         struct btrfs_root *root = inode->root;
 323         const u32 csum_size = root->fs_info->csum_size;
 324
 325         /* For data reloc tree, it's better to do a backref lookup instead. */
 326         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 327                 return print_data_reloc_error(inode, logical_start, csum,
 328                                               csum_expected, mirror_num);
 329
 330         /* Output without objectid, which is more meaningful */
 331         if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
 332                 btrfs_warn_rl(root->fs_info,
 333 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 334                         root->root_key.objectid, btrfs_ino(inode),
 335                         logical_start,
 336                         CSUM_FMT_VALUE(csum_size, csum),
 337                         CSUM_FMT_VALUE(csum_size, csum_expected),
 338                         mirror_num);
 339         } else {
 340                 btrfs_warn_rl(root->fs_info,
 341 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 342                         root->root_key.objectid, btrfs_ino(inode),
 343                         logical_start,
 344                         CSUM_FMT_VALUE(csum_size, csum),
 345                         CSUM_FMT_VALUE(csum_size, csum_expected),
 346                         mirror_num);
 347         }
 348 }
 349
 350 /*
 351  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 352  *
 353  * ilock_flags can have the following bit set:
 354  *
 355  * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 356  * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 357  *                   return -EAGAIN
 358  * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 359  */
 360 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
 361 {
 362         if (ilock_flags & BTRFS_ILOCK_SHARED) {
 363                 if (ilock_flags & BTRFS_ILOCK_TRY) {
 364                         if (!inode_trylock_shared(&inode->vfs_inode))
 365                                 return -EAGAIN;
 366                         else
 367                                 return 0;
 368                 }
 369                 inode_lock_shared(&inode->vfs_inode);
 370         } else {
 371                 if (ilock_flags & BTRFS_ILOCK_TRY) {
 372                         if (!inode_trylock(&inode->vfs_inode))
 373                                 return -EAGAIN;
 374                         else
 375                                 return 0;
 376                 }
 377                 inode_lock(&inode->vfs_inode);
 378         }
 379         if (ilock_flags & BTRFS_ILOCK_MMAP)
 380                 down_write(&inode->i_mmap_lock);
 381         return 0;
 382 }
 383
 384 /*
 385  * btrfs_inode_unlock - unock inode i_rwsem
 386  *
 387  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 388  * to decide whether the lock acquired is shared or exclusive.
 389  */
 390 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
 391 {
 392         if (ilock_flags & BTRFS_ILOCK_MMAP)
 393                 up_write(&inode->i_mmap_lock);
 394         if (ilock_flags & BTRFS_ILOCK_SHARED)
 395                 inode_unlock_shared(&inode->vfs_inode);
 396         else
 397                 inode_unlock(&inode->vfs_inode);
 398 }
 399
 400 /*
 401  * Cleanup all submitted ordered extents in specified range to handle errors
 402  * from the btrfs_run_delalloc_range() callback.
 403  *
 404  * NOTE: caller must ensure that when an error happens, it can not call
 405  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 406  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 407  * to be released, which we want to happen only when finishing the ordered
 408  * extent (btrfs_finish_ordered_io()).
 409  */
 410 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 411                                                  struct page *locked_page,
 412                                                  u64 offset, u64 bytes)
 413 {
 414         unsigned long index = offset >> PAGE_SHIFT;
 415         unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
 416         u64 page_start = 0, page_end = 0;
 417         struct page *page;
 418
 419         if (locked_page) {
 420                 page_start = page_offset(locked_page);
 421                 page_end = page_start + PAGE_SIZE - 1;
 422         }
 423
 424         while (index <= end_index) {
 425                 /*
 426                  * For locked page, we will call btrfs_mark_ordered_io_finished
 427                  * through btrfs_mark_ordered_io_finished() on it
 428                  * in run_delalloc_range() for the error handling, which will
 429                  * clear page Ordered and run the ordered extent accounting.
 430                  *
 431                  * Here we can't just clear the Ordered bit, or
 432                  * btrfs_mark_ordered_io_finished() would skip the accounting
 433                  * for the page range, and the ordered extent will never finish.
 434                  */
 435                 if (locked_page && index == (page_start >> PAGE_SHIFT)) {
 436                         index++;
 437                         continue;
 438                 }
 439                 page = find_get_page(inode->vfs_inode.i_mapping, index);
 440                 index++;
 441                 if (!page)
 442                         continue;
 443
 444                 /*
 445                  * Here we just clear all Ordered bits for every page in the
 446                  * range, then btrfs_mark_ordered_io_finished() will handle
 447                  * the ordered extent accounting for the range.
 448                  */
 449                 btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
 450                                                offset, bytes);
 451                 put_page(page);
 452         }
 453
 454         if (locked_page) {
 455                 /* The locked page covers the full range, nothing needs to be done */
 456                 if (bytes + offset <= page_start + PAGE_SIZE)
 457                         return;
 458                 /*
 459                  * In case this page belongs to the delalloc range being
 460                  * instantiated then skip it, since the first page of a range is
 461                  * going to be properly cleaned up by the caller of
 462                  * run_delalloc_range
 463                  */
 464                 if (page_start >= offset && page_end <= (offset + bytes - 1)) {
 465                         bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
 466                         offset = page_offset(locked_page) + PAGE_SIZE;
 467                 }
 468         }
 469
 470         return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
 471 }
 472
 473 static int btrfs_dirty_inode(struct btrfs_inode *inode);
 474
 475 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 476                                      struct btrfs_new_inode_args *args)
 477 {
 478         int err;
 479
 480         if (args->default_acl) {
 481                 err = __btrfs_set_acl(trans, args->inode, args->default_acl,
 482                                       ACL_TYPE_DEFAULT);
 483                 if (err)
 484                         return err;
 485         }
 486         if (args->acl) {
 487                 err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
 488                 if (err)
 489                         return err;
 490         }
 491         if (!args->default_acl && !args->acl)
 492                 cache_no_acl(args->inode);
 493         return btrfs_xattr_security_init(trans, args->inode, args->dir,
 494                                          &args->dentry->d_name);
 495 }
 496
 497 /*
 498  * this does all the hard work for inserting an inline extent into
 499  * the btree.  The caller should have done a btrfs_drop_extents so that
 500  * no overlapping inline items exist in the btree
 501  */
 502 static int insert_inline_extent(struct btrfs_trans_handle *trans,
 503                                 struct btrfs_path *path,
 504                                 struct btrfs_inode *inode, bool extent_inserted,
 505                                 size_t size, size_t compressed_size,
 506                                 int compress_type,
 507                                 struct page **compressed_pages,
 508                                 bool update_i_size)
 509 {
 510         struct btrfs_root *root = inode->root;
 511         struct extent_buffer *leaf;
 512         struct page *page = NULL;
 513         char *kaddr;
 514         unsigned long ptr;
 515         struct btrfs_file_extent_item *ei;
 516         int ret;
 517         size_t cur_size = size;
 518         u64 i_size;
 519
 520         ASSERT((compressed_size > 0 && compressed_pages) ||
 521                (compressed_size == 0 && !compressed_pages));
 522
 523         if (compressed_size && compressed_pages)
 524                 cur_size = compressed_size;
 525
 526         if (!extent_inserted) {
 527                 struct btrfs_key key;
 528                 size_t datasize;
 529
 530                 key.objectid = btrfs_ino(inode);
 531                 key.offset = 0;
 532                 key.type = BTRFS_EXTENT_DATA_KEY;
 533
 534                 datasize = btrfs_file_extent_calc_inline_size(cur_size);
 535                 ret = btrfs_insert_empty_item(trans, root, path, &key,
 536                                               datasize);
 537                 if (ret)
 538                         goto fail;
 539         }
 540         leaf = path->nodes[0];
 541         ei = btrfs_item_ptr(leaf, path->slots[0],
 542                             struct btrfs_file_extent_item);
 543         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 544         btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 545         btrfs_set_file_extent_encryption(leaf, ei, 0);
 546         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
 547         btrfs_set_file_extent_ram_bytes(leaf, ei, size);
 548         ptr = btrfs_file_extent_inline_start(ei);
 549
 550         if (compress_type != BTRFS_COMPRESS_NONE) {
 551                 struct page *cpage;
 552                 int i = 0;
 553                 while (compressed_size > 0) {
 554                         cpage = compressed_pages[i];
 555                         cur_size = min_t(unsigned long, compressed_size,
 556                                        PAGE_SIZE);
 557
 558                         kaddr = kmap_local_page(cpage);
 559                         write_extent_buffer(leaf, kaddr, ptr, cur_size);
 560                         kunmap_local(kaddr);
 561
 562                         i++;
 563                         ptr += cur_size;
 564                         compressed_size -= cur_size;
 565                 }
 566                 btrfs_set_file_extent_compression(leaf, ei,
 567                                                   compress_type);
 568         } else {
 569                 page = find_get_page(inode->vfs_inode.i_mapping, 0);
 570                 btrfs_set_file_extent_compression(leaf, ei, 0);
 571                 kaddr = kmap_local_page(page);
 572                 write_extent_buffer(leaf, kaddr, ptr, size);
 573                 kunmap_local(kaddr);
 574                 put_page(page);
 575         }
 576         btrfs_mark_buffer_dirty(leaf);
 577         btrfs_release_path(path);
 578
 579         /*
 580          * We align size to sectorsize for inline extents just for simplicity
 581          * sake.
 582          */
 583         ret = btrfs_inode_set_file_extent_range(inode, 0,
 584                                         ALIGN(size, root->fs_info->sectorsize));
 585         if (ret)
 586                 goto fail;
 587
 588         /*
 589          * We're an inline extent, so nobody can extend the file past i_size
 590          * without locking a page we already have locked.
 591          *
 592          * We must do any i_size and inode updates before we unlock the pages.
 593          * Otherwise we could end up racing with unlink.
 594          */
 595         i_size = i_size_read(&inode->vfs_inode);
 596         if (update_i_size && size > i_size) {
 597                 i_size_write(&inode->vfs_inode, size);
 598                 i_size = size;
 599         }
 600         inode->disk_i_size = i_size;
 601
 602 fail:
 603         return ret;
 604 }
 605
 606
 607 /*
 608  * conditionally insert an inline extent into the file.  This
 609  * does the checks required to make sure the data is small enough
 610  * to fit as an inline extent.
 611  */
 612 static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
 613                                           size_t compressed_size,
 614                                           int compress_type,
 615                                           struct page **compressed_pages,
 616                                           bool update_i_size)
 617 {
 618         struct btrfs_drop_extents_args drop_args = { 0 };
 619         struct btrfs_root *root = inode->root;
 620         struct btrfs_fs_info *fs_info = root->fs_info;
 621         struct btrfs_trans_handle *trans;
 622         u64 data_len = (compressed_size ?: size);
 623         int ret;
 624         struct btrfs_path *path;
 625
 626         /*
 627          * We can create an inline extent if it ends at or beyond the current
 628          * i_size, is no larger than a sector (decompressed), and the (possibly
 629          * compressed) data fits in a leaf and the configured maximum inline
 630          * size.
 631          */
 632         if (size < i_size_read(&inode->vfs_inode) ||
 633             size > fs_info->sectorsize ||
 634             data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
 635             data_len > fs_info->max_inline)
 636                 return 1;
 637
 638         path = btrfs_alloc_path();
 639         if (!path)
 640                 return -ENOMEM;
 641
 642         trans = btrfs_join_transaction(root);
 643         if (IS_ERR(trans)) {
 644                 btrfs_free_path(path);
 645                 return PTR_ERR(trans);
 646         }
 647         trans->block_rsv = &inode->block_rsv;
 648
 649         drop_args.path = path;
 650         drop_args.start = 0;
 651         drop_args.end = fs_info->sectorsize;
 652         drop_args.drop_cache = true;
 653         drop_args.replace_extent = true;
 654         drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
 655         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
 656         if (ret) {
 657                 btrfs_abort_transaction(trans, ret);
 658                 goto out;
 659         }
 660
 661         ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
 662                                    size, compressed_size, compress_type,
 663                                    compressed_pages, update_i_size);
 664         if (ret && ret != -ENOSPC) {
 665                 btrfs_abort_transaction(trans, ret);
 666                 goto out;
 667         } else if (ret == -ENOSPC) {
 668                 ret = 1;
 669                 goto out;
 670         }
 671
 672         btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
 673         ret = btrfs_update_inode(trans, root, inode);
 674         if (ret && ret != -ENOSPC) {
 675                 btrfs_abort_transaction(trans, ret);
 676                 goto out;
 677         } else if (ret == -ENOSPC) {
 678                 ret = 1;
 679                 goto out;
 680         }
 681
 682         btrfs_set_inode_full_sync(inode);
 683 out:
 684         /*
 685          * Don't forget to free the reserved space, as for inlined extent
 686          * it won't count as data extent, free them directly here.
 687          * And at reserve time, it's always aligned to page size, so
 688          * just free one page here.
 689          */
 690         btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
 691         btrfs_free_path(path);
 692         btrfs_end_transaction(trans);
 693         return ret;
 694 }
 695
 696 struct async_extent {
 697         u64 start;
 698         u64 ram_size;
 699         u64 compressed_size;
 700         struct page **pages;
 701         unsigned long nr_pages;
 702         int compress_type;
 703         struct list_head list;
 704 };
 705
 706 struct async_chunk {
 707         struct btrfs_inode *inode;
 708         struct page *locked_page;
 709         u64 start;
 710         u64 end;
 711         blk_opf_t write_flags;
 712         struct list_head extents;
 713         struct cgroup_subsys_state *blkcg_css;
 714         struct btrfs_work work;
 715         struct async_cow *async_cow;
 716 };
 717
 718 struct async_cow {
 719         atomic_t num_chunks;
 720         struct async_chunk chunks[];
 721 };
 722
 723 static noinline int add_async_extent(struct async_chunk *cow,
 724                                      u64 start, u64 ram_size,
 725                                      u64 compressed_size,
 726                                      struct page **pages,
 727                                      unsigned long nr_pages,
 728                                      int compress_type)
 729 {
 730         struct async_extent *async_extent;
 731
 732         async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
 733         BUG_ON(!async_extent); /* -ENOMEM */
 734         async_extent->start = start;
 735         async_extent->ram_size = ram_size;
 736         async_extent->compressed_size = compressed_size;
 737         async_extent->pages = pages;
 738         async_extent->nr_pages = nr_pages;
 739         async_extent->compress_type = compress_type;
 740         list_add_tail(&async_extent->list, &cow->extents);
 741         return 0;
 742 }
 743
 744 /*
 745  * Check if the inode needs to be submitted to compression, based on mount
 746  * options, defragmentation, properties or heuristics.
 747  */
 748 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
 749                                       u64 end)
 750 {
 751         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 752
 753         if (!btrfs_inode_can_compress(inode)) {
 754                 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
 755                         KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
 756                         btrfs_ino(inode));
 757                 return 0;
 758         }
 759         /*
 760          * Special check for subpage.
 761          *
 762          * We lock the full page then run each delalloc range in the page, thus
 763          * for the following case, we will hit some subpage specific corner case:
 764          *
 765          * 0            32K             64K
 766          * |    |///////|       |///////|
 767          *              \- A            \- B
 768          *
 769          * In above case, both range A and range B will try to unlock the full
 770          * page [0, 64K), causing the one finished later will have page
 771          * unlocked already, triggering various page lock requirement BUG_ON()s.
 772          *
 773          * So here we add an artificial limit that subpage compression can only
 774          * if the range is fully page aligned.
 775          *
 776          * In theory we only need to ensure the first page is fully covered, but
 777          * the tailing partial page will be locked until the full compression
 778          * finishes, delaying the write of other range.
 779          *
 780          * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
 781          * first to prevent any submitted async extent to unlock the full page.
 782          * By this, we can ensure for subpage case that only the last async_cow
 783          * will unlock the full page.
 784          */
 785         if (fs_info->sectorsize < PAGE_SIZE) {
 786                 if (!PAGE_ALIGNED(start) ||
 787                     !PAGE_ALIGNED(end + 1))
 788                         return 0;
 789         }
 790
 791         /* force compress */
 792         if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 793                 return 1;
 794         /* defrag ioctl */
 795         if (inode->defrag_compress)
 796                 return 1;
 797         /* bad compression ratios */
 798         if (inode->flags & BTRFS_INODE_NOCOMPRESS)
 799                 return 0;
 800         if (btrfs_test_opt(fs_info, COMPRESS) ||
 801             inode->flags & BTRFS_INODE_COMPRESS ||
 802             inode->prop_compress)
 803                 return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
 804         return 0;
 805 }
 806
 807 static inline void inode_should_defrag(struct btrfs_inode *inode,
 808                 u64 start, u64 end, u64 num_bytes, u32 small_write)
 809 {
 810         /* If this is a small write inside eof, kick off a defrag */
 811         if (num_bytes < small_write &&
 812             (start > 0 || end + 1 < inode->disk_i_size))
 813                 btrfs_add_inode_defrag(NULL, inode, small_write);
 814 }
 815
 816 /*
 817  * Work queue call back to started compression on a file and pages.
 818  *
 819  * This is done inside an ordered work queue, and the compression is spread
 820  * across many cpus.  The actual IO submission is step two, and the ordered work
 821  * queue takes care of making sure that happens in the same order things were
 822  * put onto the queue by writepages and friends.
 823  *
 824  * If this code finds it can't get good compression, it puts an entry onto the
 825  * work queue to write the uncompressed bytes.  This makes sure that both
 826  * compressed inodes and uncompressed inodes are written in the same order that
 827  * the flusher thread sent them down.
 828  */
 829 static void compress_file_range(struct btrfs_work *work)
 830 {
 831         struct async_chunk *async_chunk =
 832                 container_of(work, struct async_chunk, work);
 833         struct btrfs_inode *inode = async_chunk->inode;
 834         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 835         struct address_space *mapping = inode->vfs_inode.i_mapping;
 836         u64 blocksize = fs_info->sectorsize;
 837         u64 start = async_chunk->start;
 838         u64 end = async_chunk->end;
 839         u64 actual_end;
 840         u64 i_size;
 841         int ret = 0;
 842         struct page **pages = NULL;
 843         unsigned long nr_pages;
 844         unsigned long total_compressed = 0;
 845         unsigned long total_in = 0;
 846         int i;
 847         int will_compress;
 848         int compress_type = fs_info->compress_type;
 849         int redirty = 0;
 850
 851         inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
 852
 853         /*
 854          * We need to save i_size before now because it could change in between
 855          * us evaluating the size and assigning it.  This is because we lock and
 856          * unlock the page in truncate and fallocate, and then modify the i_size
 857          * later on.
 858          *
 859          * The barriers are to emulate READ_ONCE, remove that once i_size_read
 860          * does that for us.
 861          */
 862         barrier();
 863         i_size = i_size_read(&inode->vfs_inode);
 864         barrier();
 865         actual_end = min_t(u64, i_size, end + 1);
 866 again:
 867         will_compress = 0;
 868         nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
 869         nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
 870
 871         /*
 872          * we don't want to send crud past the end of i_size through
 873          * compression, that's just a waste of CPU time.  So, if the
 874          * end of the file is before the start of our current
 875          * requested range of bytes, we bail out to the uncompressed
 876          * cleanup code that can deal with all of this.
 877          *
 878          * It isn't really the fastest way to fix things, but this is a
 879          * very uncommon corner.
 880          */
 881         if (actual_end <= start)
 882                 goto cleanup_and_bail_uncompressed;
 883
 884         total_compressed = actual_end - start;
 885
 886         /*
 887          * Skip compression for a small file range(<=blocksize) that
 888          * isn't an inline extent, since it doesn't save disk space at all.
 889          */
 890         if (total_compressed <= blocksize &&
 891            (start > 0 || end + 1 < inode->disk_i_size))
 892                 goto cleanup_and_bail_uncompressed;
 893
 894         /*
 895          * For subpage case, we require full page alignment for the sector
 896          * aligned range.
 897          * Thus we must also check against @actual_end, not just @end.
 898          */
 899         if (blocksize < PAGE_SIZE) {
 900                 if (!PAGE_ALIGNED(start) ||
 901                     !PAGE_ALIGNED(round_up(actual_end, blocksize)))
 902                         goto cleanup_and_bail_uncompressed;
 903         }
 904
 905         total_compressed = min_t(unsigned long, total_compressed,
 906                         BTRFS_MAX_UNCOMPRESSED);
 907         total_in = 0;
 908         ret = 0;
 909
 910         /*
 911          * we do compression for mount -o compress and when the
 912          * inode has not been flagged as nocompress.  This flag can
 913          * change at any time if we discover bad compression ratios.
 914          */
 915         if (inode_need_compress(inode, start, end)) {
 916                 WARN_ON(pages);
 917                 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 918                 if (!pages) {
 919                         /* just bail out to the uncompressed code */
 920                         nr_pages = 0;
 921                         goto cont;
 922                 }
 923
 924                 if (inode->defrag_compress)
 925                         compress_type = inode->defrag_compress;
 926                 else if (inode->prop_compress)
 927                         compress_type = inode->prop_compress;
 928
 929                 /*
 930                  * we need to call clear_page_dirty_for_io on each
 931                  * page in the range.  Otherwise applications with the file
 932                  * mmap'd can wander in and change the page contents while
 933                  * we are compressing them.
 934                  *
 935                  * If the compression fails for any reason, we set the pages
 936                  * dirty again later on.
 937                  *
 938                  * Note that the remaining part is redirtied, the start pointer
 939                  * has moved, the end is the original one.
 940                  */
 941                 if (!redirty) {
 942                         extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
 943                         redirty = 1;
 944                 }
 945
 946                 /* Compression level is applied here and only here */
 947                 ret = btrfs_compress_pages(
 948                         compress_type | (fs_info->compress_level << 4),
 949                                            mapping, start,
 950                                            pages,
 951                                            &nr_pages,
 952                                            &total_in,
 953                                            &total_compressed);
 954
 955                 if (!ret) {
 956                         unsigned long offset = offset_in_page(total_compressed);
 957                         struct page *page = pages[nr_pages - 1];
 958
 959                         /* zero the tail end of the last page, we might be
 960                          * sending it down to disk
 961                          */
 962                         if (offset)
 963                                 memzero_page(page, offset, PAGE_SIZE - offset);
 964                         will_compress = 1;
 965                 }
 966         }
 967 cont:
 968         /*
 969          * Check cow_file_range() for why we don't even try to create inline
 970          * extent for subpage case.
 971          */
 972         if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
 973                 /* lets try to make an inline extent */
 974                 if (ret || total_in < actual_end) {
 975                         /* we didn't compress the entire range, try
 976                          * to make an uncompressed inline extent.
 977                          */
 978                         ret = cow_file_range_inline(inode, actual_end,
 979                                                     0, BTRFS_COMPRESS_NONE,
 980                                                     NULL, false);
 981                 } else {
 982                         /* try making a compressed inline extent */
 983                         ret = cow_file_range_inline(inode, actual_end,
 984                                                     total_compressed,
 985                                                     compress_type, pages,
 986                                                     false);
 987                 }
 988                 if (ret <= 0) {
 989                         unsigned long clear_flags = EXTENT_DELALLOC |
 990                                 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
 991                                 EXTENT_DO_ACCOUNTING;
 992
 993                         if (ret < 0)
 994                                 mapping_set_error(mapping, -EIO);
 995
 996                         /*
 997                          * inline extent creation worked or returned error,
 998                          * we don't need to create any more async work items.
 999                          * Unlock and free up our temp pages.
1000                          *
1001                          * We use DO_ACCOUNTING here because we need the
1002                          * delalloc_release_metadata to be done _after_ we drop
1003                          * our outstanding extent for clearing delalloc for this
1004                          * range.
1005                          */
1006                         extent_clear_unlock_delalloc(inode, start, end,
1007                                                      NULL,
1008                                                      clear_flags,
1009                                                      PAGE_UNLOCK |
1010                                                      PAGE_START_WRITEBACK |
1011                                                      PAGE_END_WRITEBACK);
1012
1013                         /*
1014                          * Ensure we only free the compressed pages if we have
1015                          * them allocated, as we can still reach here with
1016                          * inode_need_compress() == false.
1017                          */
1018                         if (pages) {
1019                                 for (i = 0; i < nr_pages; i++) {
1020                                         WARN_ON(pages[i]->mapping);
1021                                         put_page(pages[i]);
1022                                 }
1023                                 kfree(pages);
1024                         }
1025                         return;
1026                 }
1027         }
1028
1029         if (will_compress) {
1030                 /*
1031                  * we aren't doing an inline extent round the compressed size
1032                  * up to a block size boundary so the allocator does sane
1033                  * things
1034                  */
1035                 total_compressed = ALIGN(total_compressed, blocksize);
1036
1037                 /*
1038                  * one last check to make sure the compression is really a
1039                  * win, compare the page count read with the blocks on disk,
1040                  * compression must free at least one sector size
1041                  */
1042                 total_in = round_up(total_in, fs_info->sectorsize);
1043                 if (total_compressed + blocksize <= total_in) {
1044                         /*
1045                          * The async work queues will take care of doing actual
1046                          * allocation on disk for these compressed pages, and
1047                          * will submit them to the elevator.
1048                          */
1049                         add_async_extent(async_chunk, start, total_in,
1050                                         total_compressed, pages, nr_pages,
1051                                         compress_type);
1052
1053                         if (start + total_in < end) {
1054                                 start += total_in;
1055                                 pages = NULL;
1056                                 cond_resched();
1057                                 goto again;
1058                         }
1059                         return;
1060                 }
1061         }
1062         if (pages) {
1063                 /*
1064                  * the compression code ran but failed to make things smaller,
1065                  * free any pages it allocated and our page pointer array
1066                  */
1067                 for (i = 0; i < nr_pages; i++) {
1068                         WARN_ON(pages[i]->mapping);
1069                         put_page(pages[i]);
1070                 }
1071                 kfree(pages);
1072                 pages = NULL;
1073                 total_compressed = 0;
1074                 nr_pages = 0;
1075
1076                 /* flag the file so we don't compress in the future */
1077                 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
1078                     !(inode->prop_compress)) {
1079                         inode->flags |= BTRFS_INODE_NOCOMPRESS;
1080                 }
1081         }
1082 cleanup_and_bail_uncompressed:
1083         /*
1084          * No compression, but we still need to write the pages in the file
1085          * we've been given so far.  redirty the locked page if it corresponds
1086          * to our extent and set things up for the async work queue to run
1087          * cow_file_range to do the normal delalloc dance.
1088          */
1089         if (async_chunk->locked_page &&
1090             (page_offset(async_chunk->locked_page) >= start &&
1091              page_offset(async_chunk->locked_page)) <= end) {
1092                 __set_page_dirty_nobuffers(async_chunk->locked_page);
1093                 /* unlocked later on in the async handlers */
1094         }
1095
1096         if (redirty)
1097                 extent_range_redirty_for_io(&inode->vfs_inode, start, end);
1098         add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1099                          BTRFS_COMPRESS_NONE);
1100 }
1101
1102 static void free_async_extent_pages(struct async_extent *async_extent)
1103 {
1104         int i;
1105
1106         if (!async_extent->pages)
1107                 return;
1108
1109         for (i = 0; i < async_extent->nr_pages; i++) {
1110                 WARN_ON(async_extent->pages[i]->mapping);
1111                 put_page(async_extent->pages[i]);
1112         }
1113         kfree(async_extent->pages);
1114         async_extent->nr_pages = 0;
1115         async_extent->pages = NULL;
1116 }
1117
1118 static void submit_uncompressed_range(struct btrfs_inode *inode,
1119                                       struct async_extent *async_extent,
1120                                       struct page *locked_page)
1121 {
1122         u64 start = async_extent->start;
1123         u64 end = async_extent->start + async_extent->ram_size - 1;
1124         int ret;
1125         struct writeback_control wbc = {
1126                 .sync_mode              = WB_SYNC_ALL,
1127                 .range_start            = start,
1128                 .range_end              = end,
1129                 .no_cgroup_owner        = 1,
1130         };
1131
1132         /*
1133          * Call cow_file_range() to run the delalloc range directly, since we
1134          * won't go to NOCOW or async path again.
1135          *
1136          * Also we call cow_file_range() with @unlock_page == 0, so that we
1137          * can directly submit them without interruption.
1138          */
1139         ret = cow_file_range(inode, locked_page, start, end, NULL, true, false);
1140         /* Inline extent inserted, page gets unlocked and everything is done */
1141         if (ret == 1)
1142                 return;
1143
1144         if (ret < 0) {
1145                 btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
1146                 if (locked_page) {
1147                         const u64 page_start = page_offset(locked_page);
1148
1149                         set_page_writeback(locked_page);
1150                         end_page_writeback(locked_page);
1151                         btrfs_mark_ordered_io_finished(inode, locked_page,
1152                                                        page_start, PAGE_SIZE,
1153                                                        !ret);
1154                         btrfs_page_clear_uptodate(inode->root->fs_info,
1155                                                   locked_page, page_start,
1156                                                   PAGE_SIZE);
1157                         mapping_set_error(locked_page->mapping, ret);
1158                         unlock_page(locked_page);
1159                 }
1160                 return;
1161         }
1162
1163         /* All pages will be unlocked, including @locked_page */
1164         wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1165         extent_write_locked_range(&inode->vfs_inode, start, end, &wbc);
1166         wbc_detach_inode(&wbc);
1167 }
1168
1169 static void submit_one_async_extent(struct async_chunk *async_chunk,
1170                                     struct async_extent *async_extent,
1171                                     u64 *alloc_hint)
1172 {
1173         struct btrfs_inode *inode = async_chunk->inode;
1174         struct extent_io_tree *io_tree = &inode->io_tree;
1175         struct btrfs_root *root = inode->root;
1176         struct btrfs_fs_info *fs_info = root->fs_info;
1177         struct btrfs_ordered_extent *ordered;
1178         struct btrfs_key ins;
1179         struct page *locked_page = NULL;
1180         struct extent_map *em;
1181         int ret = 0;
1182         u64 start = async_extent->start;
1183         u64 end = async_extent->start + async_extent->ram_size - 1;
1184
1185         if (async_chunk->blkcg_css)
1186                 kthread_associate_blkcg(async_chunk->blkcg_css);
1187
1188         /*
1189          * If async_chunk->locked_page is in the async_extent range, we need to
1190          * handle it.
1191          */
1192         if (async_chunk->locked_page) {
1193                 u64 locked_page_start = page_offset(async_chunk->locked_page);
1194                 u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
1195
1196                 if (!(start >= locked_page_end || end <= locked_page_start))
1197                         locked_page = async_chunk->locked_page;
1198         }
1199         lock_extent(io_tree, start, end, NULL);
1200
1201         if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1202                 submit_uncompressed_range(inode, async_extent, locked_page);
1203                 goto done;
1204         }
1205
1206         ret = btrfs_reserve_extent(root, async_extent->ram_size,
1207                                    async_extent->compressed_size,
1208                                    async_extent->compressed_size,
1209                                    0, *alloc_hint, &ins, 1, 1);
1210         if (ret) {
1211                 /*
1212                  * Here we used to try again by going back to non-compressed
1213                  * path for ENOSPC.  But we can't reserve space even for
1214                  * compressed size, how could it work for uncompressed size
1215                  * which requires larger size?  So here we directly go error
1216                  * path.
1217                  */
1218                 goto out_free;
1219         }
1220
1221         /* Here we're doing allocation and writeback of the compressed pages */
1222         em = create_io_em(inode, start,
1223                           async_extent->ram_size,       /* len */
1224                           start,                        /* orig_start */
1225                           ins.objectid,                 /* block_start */
1226                           ins.offset,                   /* block_len */
1227                           ins.offset,                   /* orig_block_len */
1228                           async_extent->ram_size,       /* ram_bytes */
1229                           async_extent->compress_type,
1230                           BTRFS_ORDERED_COMPRESSED);
1231         if (IS_ERR(em)) {
1232                 ret = PTR_ERR(em);
1233                 goto out_free_reserve;
1234         }
1235         free_extent_map(em);
1236
1237         ordered = btrfs_alloc_ordered_extent(inode, start,      /* file_offset */
1238                                        async_extent->ram_size,  /* num_bytes */
1239                                        async_extent->ram_size,  /* ram_bytes */
1240                                        ins.objectid,            /* disk_bytenr */
1241                                        ins.offset,              /* disk_num_bytes */
1242                                        0,                       /* offset */
1243                                        1 << BTRFS_ORDERED_COMPRESSED,
1244                                        async_extent->compress_type);
1245         if (IS_ERR(ordered)) {
1246                 btrfs_drop_extent_map_range(inode, start, end, false);
1247                 ret = PTR_ERR(ordered);
1248                 goto out_free_reserve;
1249         }
1250         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1251
1252         /* Clear dirty, set writeback and unlock the pages. */
1253         extent_clear_unlock_delalloc(inode, start, end,
1254                         NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
1255                         PAGE_UNLOCK | PAGE_START_WRITEBACK);
1256         btrfs_submit_compressed_write(ordered,
1257                             async_extent->pages,        /* compressed_pages */
1258                             async_extent->nr_pages,
1259                             async_chunk->write_flags, true);
1260         *alloc_hint = ins.objectid + ins.offset;
1261 done:
1262         if (async_chunk->blkcg_css)
1263                 kthread_associate_blkcg(NULL);
1264         kfree(async_extent);
1265         return;
1266
1267 out_free_reserve:
1268         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1269         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1270 out_free:
1271         mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1272         extent_clear_unlock_delalloc(inode, start, end,
1273                                      NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1274                                      EXTENT_DELALLOC_NEW |
1275                                      EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1276                                      PAGE_UNLOCK | PAGE_START_WRITEBACK |
1277                                      PAGE_END_WRITEBACK);
1278         free_async_extent_pages(async_extent);
1279         if (async_chunk->blkcg_css)
1280                 kthread_associate_blkcg(NULL);
1281         btrfs_debug(fs_info,
1282 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1283                     root->root_key.objectid, btrfs_ino(inode), start,
1284                     async_extent->ram_size, ret);
1285         kfree(async_extent);
1286 }
1287
1288 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1289                                       u64 num_bytes)
1290 {
1291         struct extent_map_tree *em_tree = &inode->extent_tree;
1292         struct extent_map *em;
1293         u64 alloc_hint = 0;
1294
1295         read_lock(&em_tree->lock);
1296         em = search_extent_mapping(em_tree, start, num_bytes);
1297         if (em) {
1298                 /*
1299                  * if block start isn't an actual block number then find the
1300                  * first block in this inode and use that as a hint.  If that
1301                  * block is also bogus then just don't worry about it.
1302                  */
1303                 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1304                         free_extent_map(em);
1305                         em = search_extent_mapping(em_tree, 0, 0);
1306                         if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1307                                 alloc_hint = em->block_start;
1308                         if (em)
1309                                 free_extent_map(em);
1310                 } else {
1311                         alloc_hint = em->block_start;
1312                         free_extent_map(em);
1313                 }
1314         }
1315         read_unlock(&em_tree->lock);
1316
1317         return alloc_hint;
1318 }
1319
1320 /*
1321  * when extent_io.c finds a delayed allocation range in the file,
1322  * the call backs end up in this code.  The basic idea is to
1323  * allocate extents on disk for the range, and create ordered data structs
1324  * in ram to track those extents.
1325  *
1326  * locked_page is the page that writepage had locked already.  We use
1327  * it to make sure we don't do extra locks or unlocks.
1328  *
1329  * When this function fails, it unlocks all pages except @locked_page.
1330  *
1331  * When this function successfully creates an inline extent, it returns 1 and
1332  * unlocks all pages including locked_page and starts I/O on them.
1333  * (In reality inline extents are limited to a single page, so locked_page is
1334  * the only page handled anyway).
1335  *
1336  * When this function succeed and creates a normal extent, the page locking
1337  * status depends on the passed in flags:
1338  *
1339  * - If @keep_locked is set, all pages are kept locked.
1340  * - Else all pages except for @locked_page are unlocked.
1341  *
1342  * When a failure happens in the second or later iteration of the
1343  * while-loop, the ordered extents created in previous iterations are kept
1344  * intact. So, the caller must clean them up by calling
1345  * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
1346  * example.
1347  */
1348 static noinline int cow_file_range(struct btrfs_inode *inode,
1349                                    struct page *locked_page, u64 start, u64 end,
1350                                    u64 *done_offset,
1351                                    bool keep_locked, bool no_inline)
1352 {
1353         struct btrfs_root *root = inode->root;
1354         struct btrfs_fs_info *fs_info = root->fs_info;
1355         u64 alloc_hint = 0;
1356         u64 orig_start = start;
1357         u64 num_bytes;
1358         unsigned long ram_size;
1359         u64 cur_alloc_size = 0;
1360         u64 min_alloc_size;
1361         u64 blocksize = fs_info->sectorsize;
1362         struct btrfs_key ins;
1363         struct extent_map *em;
1364         unsigned clear_bits;
1365         unsigned long page_ops;
1366         bool extent_reserved = false;
1367         int ret = 0;
1368
1369         if (btrfs_is_free_space_inode(inode)) {
1370                 ret = -EINVAL;
1371                 goto out_unlock;
1372         }
1373
1374         num_bytes = ALIGN(end - start + 1, blocksize);
1375         num_bytes = max(blocksize,  num_bytes);
1376         ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1377
1378         inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1379
1380         /*
1381          * Due to the page size limit, for subpage we can only trigger the
1382          * writeback for the dirty sectors of page, that means data writeback
1383          * is doing more writeback than what we want.
1384          *
1385          * This is especially unexpected for some call sites like fallocate,
1386          * where we only increase i_size after everything is done.
1387          * This means we can trigger inline extent even if we didn't want to.
1388          * So here we skip inline extent creation completely.
1389          */
1390         if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
1391                 u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
1392                                        end + 1);
1393
1394                 /* lets try to make an inline extent */
1395                 ret = cow_file_range_inline(inode, actual_end, 0,
1396                                             BTRFS_COMPRESS_NONE, NULL, false);
1397                 if (ret == 0) {
1398                         /*
1399                          * We use DO_ACCOUNTING here because we need the
1400                          * delalloc_release_metadata to be run _after_ we drop
1401                          * our outstanding extent for clearing delalloc for this
1402                          * range.
1403                          */
1404                         extent_clear_unlock_delalloc(inode, start, end,
1405                                      locked_page,
1406                                      EXTENT_LOCKED | EXTENT_DELALLOC |
1407                                      EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1408                                      EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1409                                      PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1410                         /*
1411                          * locked_page is locked by the caller of
1412                          * writepage_delalloc(), not locked by
1413                          * __process_pages_contig().
1414                          *
1415                          * We can't let __process_pages_contig() to unlock it,
1416                          * as it doesn't have any subpage::writers recorded.
1417                          *
1418                          * Here we manually unlock the page, since the caller
1419                          * can't determine if it's an inline extent or a
1420                          * compressed extent.
1421                          */
1422                         unlock_page(locked_page);
1423                         return 1;
1424                 } else if (ret < 0) {
1425                         goto out_unlock;
1426                 }
1427         }
1428
1429         alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1430
1431         /*
1432          * Relocation relies on the relocated extents to have exactly the same
1433          * size as the original extents. Normally writeback for relocation data
1434          * extents follows a NOCOW path because relocation preallocates the
1435          * extents. However, due to an operation such as scrub turning a block
1436          * group to RO mode, it may fallback to COW mode, so we must make sure
1437          * an extent allocated during COW has exactly the requested size and can
1438          * not be split into smaller extents, otherwise relocation breaks and
1439          * fails during the stage where it updates the bytenr of file extent
1440          * items.
1441          */
1442         if (btrfs_is_data_reloc_root(root))
1443                 min_alloc_size = num_bytes;
1444         else
1445                 min_alloc_size = fs_info->sectorsize;
1446
1447         while (num_bytes > 0) {
1448                 struct btrfs_ordered_extent *ordered;
1449
1450                 cur_alloc_size = num_bytes;
1451                 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1452                                            min_alloc_size, 0, alloc_hint,
1453                                            &ins, 1, 1);
1454                 if (ret < 0)
1455                         goto out_unlock;
1456                 cur_alloc_size = ins.offset;
1457                 extent_reserved = true;
1458
1459                 ram_size = ins.offset;
1460                 em = create_io_em(inode, start, ins.offset, /* len */
1461                                   start, /* orig_start */
1462                                   ins.objectid, /* block_start */
1463                                   ins.offset, /* block_len */
1464                                   ins.offset, /* orig_block_len */
1465                                   ram_size, /* ram_bytes */
1466                                   BTRFS_COMPRESS_NONE, /* compress_type */
1467                                   BTRFS_ORDERED_REGULAR /* type */);
1468                 if (IS_ERR(em)) {
1469                         ret = PTR_ERR(em);
1470                         goto out_reserve;
1471                 }
1472                 free_extent_map(em);
1473
1474                 ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
1475                                         ram_size, ins.objectid, cur_alloc_size,
1476                                         0, 1 << BTRFS_ORDERED_REGULAR,
1477                                         BTRFS_COMPRESS_NONE);
1478                 if (IS_ERR(ordered)) {
1479                         ret = PTR_ERR(ordered);
1480                         goto out_drop_extent_cache;
1481                 }
1482
1483                 if (btrfs_is_data_reloc_root(root)) {
1484                         ret = btrfs_reloc_clone_csums(ordered);
1485
1486                         /*
1487                          * Only drop cache here, and process as normal.
1488                          *
1489                          * We must not allow extent_clear_unlock_delalloc()
1490                          * at out_unlock label to free meta of this ordered
1491                          * extent, as its meta should be freed by
1492                          * btrfs_finish_ordered_io().
1493                          *
1494                          * So we must continue until @start is increased to
1495                          * skip current ordered extent.
1496                          */
1497                         if (ret)
1498                                 btrfs_drop_extent_map_range(inode, start,
1499                                                             start + ram_size - 1,
1500                                                             false);
1501                 }
1502                 btrfs_put_ordered_extent(ordered);
1503
1504                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1505
1506                 /*
1507                  * We're not doing compressed IO, don't unlock the first page
1508                  * (which the caller expects to stay locked), don't clear any
1509                  * dirty bits and don't set any writeback bits
1510                  *
1511                  * Do set the Ordered (Private2) bit so we know this page was
1512                  * properly setup for writepage.
1513                  */
1514                 page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1515                 page_ops |= PAGE_SET_ORDERED;
1516
1517                 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1518                                              locked_page,
1519                                              EXTENT_LOCKED | EXTENT_DELALLOC,
1520                                              page_ops);
1521                 if (num_bytes < cur_alloc_size)
1522                         num_bytes = 0;
1523                 else
1524                         num_bytes -= cur_alloc_size;
1525                 alloc_hint = ins.objectid + ins.offset;
1526                 start += cur_alloc_size;
1527                 extent_reserved = false;
1528
1529                 /*
1530                  * btrfs_reloc_clone_csums() error, since start is increased
1531                  * extent_clear_unlock_delalloc() at out_unlock label won't
1532                  * free metadata of current ordered extent, we're OK to exit.
1533                  */
1534                 if (ret)
1535                         goto out_unlock;
1536         }
1537         return ret;
1538
1539 out_drop_extent_cache:
1540         btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
1541 out_reserve:
1542         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1543         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1544 out_unlock:
1545         /*
1546          * If done_offset is non-NULL and ret == -EAGAIN, we expect the
1547          * caller to write out the successfully allocated region and retry.
1548          */
1549         if (done_offset && ret == -EAGAIN) {
1550                 if (orig_start < start)
1551                         *done_offset = start - 1;
1552                 else
1553                         *done_offset = start;
1554                 return ret;
1555         } else if (ret == -EAGAIN) {
1556                 /* Convert to -ENOSPC since the caller cannot retry. */
1557                 ret = -ENOSPC;
1558         }
1559
1560         /*
1561          * Now, we have three regions to clean up:
1562          *
1563          * |-------(1)----|---(2)---|-------------(3)----------|
1564          * `- orig_start  `- start  `- start + cur_alloc_size  `- end
1565          *
1566          * We process each region below.
1567          */
1568
1569         clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1570                 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1571         page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1572
1573         /*
1574          * For the range (1). We have already instantiated the ordered extents
1575          * for this region. They are cleaned up by
1576          * btrfs_cleanup_ordered_extents() in e.g,
1577          * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
1578          * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
1579          * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
1580          * function.
1581          *
1582          * However, in case of @keep_locked, we still need to unlock the pages
1583          * (except @locked_page) to ensure all the pages are unlocked.
1584          */
1585         if (keep_locked && orig_start < start) {
1586                 if (!locked_page)
1587                         mapping_set_error(inode->vfs_inode.i_mapping, ret);
1588                 extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1589                                              locked_page, 0, page_ops);
1590         }
1591
1592         /*
1593          * For the range (2). If we reserved an extent for our delalloc range
1594          * (or a subrange) and failed to create the respective ordered extent,
1595          * then it means that when we reserved the extent we decremented the
1596          * extent's size from the data space_info's bytes_may_use counter and
1597          * incremented the space_info's bytes_reserved counter by the same
1598          * amount. We must make sure extent_clear_unlock_delalloc() does not try
1599          * to decrement again the data space_info's bytes_may_use counter,
1600          * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1601          */
1602         if (extent_reserved) {
1603                 extent_clear_unlock_delalloc(inode, start,
1604                                              start + cur_alloc_size - 1,
1605                                              locked_page,
1606                                              clear_bits,
1607                                              page_ops);
1608                 start += cur_alloc_size;
1609         }
1610
1611         /*
1612          * For the range (3). We never touched the region. In addition to the
1613          * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1614          * space_info's bytes_may_use counter, reserved in
1615          * btrfs_check_data_free_space().
1616          */
1617         if (start < end) {
1618                 clear_bits |= EXTENT_CLEAR_DATA_RESV;
1619                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
1620                                              clear_bits, page_ops);
1621         }
1622         return ret;
1623 }
1624
1625 /*
1626  * Phase two of compressed writeback.  This is the ordered portion of the code,
1627  * which only gets called in the order the work was queued.  We walk all the
1628  * async extents created by compress_file_range and send them down to the disk.
1629  */
1630 static noinline void submit_compressed_extents(struct btrfs_work *work)
1631 {
1632         struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1633                                                      work);
1634         struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1635         struct async_extent *async_extent;
1636         unsigned long nr_pages;
1637         u64 alloc_hint = 0;
1638
1639         nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1640                 PAGE_SHIFT;
1641
1642         while (!list_empty(&async_chunk->extents)) {
1643                 async_extent = list_entry(async_chunk->extents.next,
1644                                           struct async_extent, list);
1645                 list_del(&async_extent->list);
1646                 submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1647         }
1648
1649         /* atomic_sub_return implies a barrier */
1650         if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1651             5 * SZ_1M)
1652                 cond_wake_up_nomb(&fs_info->async_submit_wait);
1653 }
1654
1655 static noinline void async_cow_free(struct btrfs_work *work)
1656 {
1657         struct async_chunk *async_chunk;
1658         struct async_cow *async_cow;
1659
1660         async_chunk = container_of(work, struct async_chunk, work);
1661         btrfs_add_delayed_iput(async_chunk->inode);
1662         if (async_chunk->blkcg_css)
1663                 css_put(async_chunk->blkcg_css);
1664
1665         async_cow = async_chunk->async_cow;
1666         if (atomic_dec_and_test(&async_cow->num_chunks))
1667                 kvfree(async_cow);
1668 }
1669
1670 static bool run_delalloc_compressed(struct btrfs_inode *inode,
1671                                     struct page *locked_page, u64 start,
1672                                     u64 end, struct writeback_control *wbc)
1673 {
1674         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1675         struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1676         struct async_cow *ctx;
1677         struct async_chunk *async_chunk;
1678         unsigned long nr_pages;
1679         u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1680         int i;
1681         unsigned nofs_flag;
1682         const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1683
1684         nofs_flag = memalloc_nofs_save();
1685         ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1686         memalloc_nofs_restore(nofs_flag);
1687         if (!ctx)
1688                 return false;
1689
1690         unlock_extent(&inode->io_tree, start, end, NULL);
1691         set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1692
1693         async_chunk = ctx->chunks;
1694         atomic_set(&ctx->num_chunks, num_chunks);
1695
1696         for (i = 0; i < num_chunks; i++) {
1697                 u64 cur_end = min(end, start + SZ_512K - 1);
1698
1699                 /*
1700                  * igrab is called higher up in the call chain, take only the
1701                  * lightweight reference for the callback lifetime
1702                  */
1703                 ihold(&inode->vfs_inode);
1704                 async_chunk[i].async_cow = ctx;
1705                 async_chunk[i].inode = inode;
1706                 async_chunk[i].start = start;
1707                 async_chunk[i].end = cur_end;
1708                 async_chunk[i].write_flags = write_flags;
1709                 INIT_LIST_HEAD(&async_chunk[i].extents);
1710
1711                 /*
1712                  * The locked_page comes all the way from writepage and its
1713                  * the original page we were actually given.  As we spread
1714                  * this large delalloc region across multiple async_chunk
1715                  * structs, only the first struct needs a pointer to locked_page
1716                  *
1717                  * This way we don't need racey decisions about who is supposed
1718                  * to unlock it.
1719                  */
1720                 if (locked_page) {
1721                         /*
1722                          * Depending on the compressibility, the pages might or
1723                          * might not go through async.  We want all of them to
1724                          * be accounted against wbc once.  Let's do it here
1725                          * before the paths diverge.  wbc accounting is used
1726                          * only for foreign writeback detection and doesn't
1727                          * need full accuracy.  Just account the whole thing
1728                          * against the first page.
1729                          */
1730                         wbc_account_cgroup_owner(wbc, locked_page,
1731                                                  cur_end - start);
1732                         async_chunk[i].locked_page = locked_page;
1733                         locked_page = NULL;
1734                 } else {
1735                         async_chunk[i].locked_page = NULL;
1736                 }
1737
1738                 if (blkcg_css != blkcg_root_css) {
1739                         css_get(blkcg_css);
1740                         async_chunk[i].blkcg_css = blkcg_css;
1741                         async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1742                 } else {
1743                         async_chunk[i].blkcg_css = NULL;
1744                 }
1745
1746                 btrfs_init_work(&async_chunk[i].work, compress_file_range,
1747                                 submit_compressed_extents, async_cow_free);
1748
1749                 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1750                 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1751
1752                 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1753
1754                 start = cur_end + 1;
1755         }
1756         return true;
1757 }
1758
1759 static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
1760                                        struct page *locked_page, u64 start,
1761                                        u64 end, struct writeback_control *wbc)
1762 {
1763         u64 done_offset = end;
1764         int ret;
1765         bool locked_page_done = false;
1766
1767         while (start <= end) {
1768                 ret = cow_file_range(inode, locked_page, start, end, &done_offset,
1769                                      true, false);
1770                 if (ret && ret != -EAGAIN)
1771                         return ret;
1772
1773                 if (ret == 0)
1774                         done_offset = end;
1775
1776                 if (done_offset == start) {
1777                         wait_on_bit_io(&inode->root->fs_info->flags,
1778                                        BTRFS_FS_NEED_ZONE_FINISH,
1779                                        TASK_UNINTERRUPTIBLE);
1780                         continue;
1781                 }
1782
1783                 if (!locked_page_done) {
1784                         __set_page_dirty_nobuffers(locked_page);
1785                         account_page_redirty(locked_page);
1786                 }
1787                 locked_page_done = true;
1788                 extent_write_locked_range(&inode->vfs_inode, start, done_offset,
1789                                           wbc);
1790                 start = done_offset + 1;
1791         }
1792
1793         return 1;
1794 }
1795
1796 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1797                                         u64 bytenr, u64 num_bytes, bool nowait)
1798 {
1799         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1800         struct btrfs_ordered_sum *sums;
1801         int ret;
1802         LIST_HEAD(list);
1803
1804         ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
1805                                       &list, 0, nowait);
1806         if (ret == 0 && list_empty(&list))
1807                 return 0;
1808
1809         while (!list_empty(&list)) {
1810                 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1811                 list_del(&sums->list);
1812                 kfree(sums);
1813         }
1814         if (ret < 0)
1815                 return ret;
1816         return 1;
1817 }
1818
1819 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1820                            const u64 start, const u64 end)
1821 {
1822         const bool is_space_ino = btrfs_is_free_space_inode(inode);
1823         const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1824         const u64 range_bytes = end + 1 - start;
1825         struct extent_io_tree *io_tree = &inode->io_tree;
1826         u64 range_start = start;
1827         u64 count;
1828         int ret;
1829
1830         /*
1831          * If EXTENT_NORESERVE is set it means that when the buffered write was
1832          * made we had not enough available data space and therefore we did not
1833          * reserve data space for it, since we though we could do NOCOW for the
1834          * respective file range (either there is prealloc extent or the inode
1835          * has the NOCOW bit set).
1836          *
1837          * However when we need to fallback to COW mode (because for example the
1838          * block group for the corresponding extent was turned to RO mode by a
1839          * scrub or relocation) we need to do the following:
1840          *
1841          * 1) We increment the bytes_may_use counter of the data space info.
1842          *    If COW succeeds, it allocates a new data extent and after doing
1843          *    that it decrements the space info's bytes_may_use counter and
1844          *    increments its bytes_reserved counter by the same amount (we do
1845          *    this at btrfs_add_reserved_bytes()). So we need to increment the
1846          *    bytes_may_use counter to compensate (when space is reserved at
1847          *    buffered write time, the bytes_may_use counter is incremented);
1848          *
1849          * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1850          *    that if the COW path fails for any reason, it decrements (through
1851          *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1852          *    data space info, which we incremented in the step above.
1853          *
1854          * If we need to fallback to cow and the inode corresponds to a free
1855          * space cache inode or an inode of the data relocation tree, we must
1856          * also increment bytes_may_use of the data space_info for the same
1857          * reason. Space caches and relocated data extents always get a prealloc
1858          * extent for them, however scrub or balance may have set the block
1859          * group that contains that extent to RO mode and therefore force COW
1860          * when starting writeback.
1861          */
1862         count = count_range_bits(io_tree, &range_start, end, range_bytes,
1863                                  EXTENT_NORESERVE, 0, NULL);
1864         if (count > 0 || is_space_ino || is_reloc_ino) {
1865                 u64 bytes = count;
1866                 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1867                 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1868
1869                 if (is_space_ino || is_reloc_ino)
1870                         bytes = range_bytes;
1871
1872                 spin_lock(&sinfo->lock);
1873                 btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1874                 spin_unlock(&sinfo->lock);
1875
1876                 if (count > 0)
1877                         clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1878                                          NULL);
1879         }
1880
1881         /*
1882          * Don't try to create inline extents, as a mix of inline extent that
1883          * is written out and unlocked directly and a normal NOCOW extent
1884          * doesn't work.
1885          */
1886         ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
1887         ASSERT(ret != 1);
1888         return ret;
1889 }
1890
1891 struct can_nocow_file_extent_args {
1892         /* Input fields. */
1893
1894         /* Start file offset of the range we want to NOCOW. */
1895         u64 start;
1896         /* End file offset (inclusive) of the range we want to NOCOW. */
1897         u64 end;
1898         bool writeback_path;
1899         bool strict;
1900         /*
1901          * Free the path passed to can_nocow_file_extent() once it's not needed
1902          * anymore.
1903          */
1904         bool free_path;
1905
1906         /* Output fields. Only set when can_nocow_file_extent() returns 1. */
1907
1908         u64 disk_bytenr;
1909         u64 disk_num_bytes;
1910         u64 extent_offset;
1911         /* Number of bytes that can be written to in NOCOW mode. */
1912         u64 num_bytes;
1913 };
1914
1915 /*
1916  * Check if we can NOCOW the file extent that the path points to.
1917  * This function may return with the path released, so the caller should check
1918  * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1919  *
1920  * Returns: < 0 on error
1921  *            0 if we can not NOCOW
1922  *            1 if we can NOCOW
1923  */
1924 static int can_nocow_file_extent(struct btrfs_path *path,
1925                                  struct btrfs_key *key,
1926                                  struct btrfs_inode *inode,
1927                                  struct can_nocow_file_extent_args *args)
1928 {
1929         const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1930         struct extent_buffer *leaf = path->nodes[0];
1931         struct btrfs_root *root = inode->root;
1932         struct btrfs_file_extent_item *fi;
1933         u64 extent_end;
1934         u8 extent_type;
1935         int can_nocow = 0;
1936         int ret = 0;
1937         bool nowait = path->nowait;
1938
1939         fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1940         extent_type = btrfs_file_extent_type(leaf, fi);
1941
1942         if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1943                 goto out;
1944
1945         /* Can't access these fields unless we know it's not an inline extent. */
1946         args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1947         args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1948         args->extent_offset = btrfs_file_extent_offset(leaf, fi);
1949
1950         if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1951             extent_type == BTRFS_FILE_EXTENT_REG)
1952                 goto out;
1953
1954         /*
1955          * If the extent was created before the generation where the last snapshot
1956          * for its subvolume was created, then this implies the extent is shared,
1957          * hence we must COW.
1958          */
1959         if (!args->strict &&
1960             btrfs_file_extent_generation(leaf, fi) <=
1961             btrfs_root_last_snapshot(&root->root_item))
1962                 goto out;
1963
1964         /* An explicit hole, must COW. */
1965         if (args->disk_bytenr == 0)
1966                 goto out;
1967
1968         /* Compressed/encrypted/encoded extents must be COWed. */
1969         if (btrfs_file_extent_compression(leaf, fi) ||
1970             btrfs_file_extent_encryption(leaf, fi) ||
1971             btrfs_file_extent_other_encoding(leaf, fi))
1972                 goto out;
1973
1974         extent_end = btrfs_file_extent_end(path);
1975
1976         /*
1977          * The following checks can be expensive, as they need to take other
1978          * locks and do btree or rbtree searches, so release the path to avoid
1979          * blocking other tasks for too long.
1980          */
1981         btrfs_release_path(path);
1982
1983         ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
1984                                     key->offset - args->extent_offset,
1985                                     args->disk_bytenr, args->strict, path);
1986         WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1987         if (ret != 0)
1988                 goto out;
1989
1990         if (args->free_path) {
1991                 /*
1992                  * We don't need the path anymore, plus through the
1993                  * csum_exist_in_range() call below we will end up allocating
1994                  * another path. So free the path to avoid unnecessary extra
1995                  * memory usage.
1996                  */
1997                 btrfs_free_path(path);
1998                 path = NULL;
1999         }
2000
2001         /* If there are pending snapshots for this root, we must COW. */
2002         if (args->writeback_path && !is_freespace_inode &&
2003             atomic_read(&root->snapshot_force_cow))
2004                 goto out;
2005
2006         args->disk_bytenr += args->extent_offset;
2007         args->disk_bytenr += args->start - key->offset;
2008         args->num_bytes = min(args->end + 1, extent_end) - args->start;
2009
2010         /*
2011          * Force COW if csums exist in the range. This ensures that csums for a
2012          * given extent are either valid or do not exist.
2013          */
2014         ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
2015                                   nowait);
2016         WARN_ON_ONCE(ret > 0 && is_freespace_inode);
2017         if (ret != 0)
2018                 goto out;
2019
2020         can_nocow = 1;
2021  out:
2022         if (args->free_path && path)
2023                 btrfs_free_path(path);
2024
2025         return ret < 0 ? ret : can_nocow;
2026 }
2027
2028 /*
2029  * when nowcow writeback call back.  This checks for snapshots or COW copies
2030  * of the extents that exist in the file, and COWs the file as required.
2031  *
2032  * If no cow copies or snapshots exist, we write directly to the existing
2033  * blocks on disk
2034  */
2035 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
2036                                        struct page *locked_page,
2037                                        const u64 start, const u64 end)
2038 {
2039         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2040         struct btrfs_root *root = inode->root;
2041         struct btrfs_path *path;
2042         u64 cow_start = (u64)-1;
2043         u64 cur_offset = start;
2044         int ret;
2045         bool check_prev = true;
2046         u64 ino = btrfs_ino(inode);
2047         struct btrfs_block_group *bg;
2048         bool nocow = false;
2049         struct can_nocow_file_extent_args nocow_args = { 0 };
2050
2051         path = btrfs_alloc_path();
2052         if (!path) {
2053                 extent_clear_unlock_delalloc(inode, start, end, locked_page,
2054                                              EXTENT_LOCKED | EXTENT_DELALLOC |
2055                                              EXTENT_DO_ACCOUNTING |
2056                                              EXTENT_DEFRAG, PAGE_UNLOCK |
2057                                              PAGE_START_WRITEBACK |
2058                                              PAGE_END_WRITEBACK);
2059                 return -ENOMEM;
2060         }
2061
2062         nocow_args.end = end;
2063         nocow_args.writeback_path = true;
2064
2065         while (1) {
2066                 struct btrfs_ordered_extent *ordered;
2067                 struct btrfs_key found_key;
2068                 struct btrfs_file_extent_item *fi;
2069                 struct extent_buffer *leaf;
2070                 u64 extent_end;
2071                 u64 ram_bytes;
2072                 u64 nocow_end;
2073                 int extent_type;
2074                 bool is_prealloc;
2075
2076                 nocow = false;
2077
2078                 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2079                                                cur_offset, 0);
2080                 if (ret < 0)
2081                         goto error;
2082
2083                 /*
2084                  * If there is no extent for our range when doing the initial
2085                  * search, then go back to the previous slot as it will be the
2086                  * one containing the search offset
2087                  */
2088                 if (ret > 0 && path->slots[0] > 0 && check_prev) {
2089                         leaf = path->nodes[0];
2090                         btrfs_item_key_to_cpu(leaf, &found_key,
2091                                               path->slots[0] - 1);
2092                         if (found_key.objectid == ino &&
2093                             found_key.type == BTRFS_EXTENT_DATA_KEY)
2094                                 path->slots[0]--;
2095                 }
2096                 check_prev = false;
2097 next_slot:
2098                 /* Go to next leaf if we have exhausted the current one */
2099                 leaf = path->nodes[0];
2100                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2101                         ret = btrfs_next_leaf(root, path);
2102                         if (ret < 0) {
2103                                 if (cow_start != (u64)-1)
2104                                         cur_offset = cow_start;
2105                                 goto error;
2106                         }
2107                         if (ret > 0)
2108                                 break;
2109                         leaf = path->nodes[0];
2110                 }
2111
2112                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2113
2114                 /* Didn't find anything for our INO */
2115                 if (found_key.objectid > ino)
2116                         break;
2117                 /*
2118                  * Keep searching until we find an EXTENT_ITEM or there are no
2119                  * more extents for this inode
2120                  */
2121                 if (WARN_ON_ONCE(found_key.objectid < ino) ||
2122                     found_key.type < BTRFS_EXTENT_DATA_KEY) {
2123                         path->slots[0]++;
2124                         goto next_slot;
2125                 }
2126
2127                 /* Found key is not EXTENT_DATA_KEY or starts after req range */
2128                 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2129                     found_key.offset > end)
2130                         break;
2131
2132                 /*
2133                  * If the found extent starts after requested offset, then
2134                  * adjust extent_end to be right before this extent begins
2135                  */
2136                 if (found_key.offset > cur_offset) {
2137                         extent_end = found_key.offset;
2138                         extent_type = 0;
2139                         goto out_check;
2140                 }
2141
2142                 /*
2143                  * Found extent which begins before our range and potentially
2144                  * intersect it
2145                  */
2146                 fi = btrfs_item_ptr(leaf, path->slots[0],
2147                                     struct btrfs_file_extent_item);
2148                 extent_type = btrfs_file_extent_type(leaf, fi);
2149                 /* If this is triggered then we have a memory corruption. */
2150                 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2151                 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2152                         ret = -EUCLEAN;
2153                         goto error;
2154                 }
2155                 ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
2156                 extent_end = btrfs_file_extent_end(path);
2157
2158                 /*
2159                  * If the extent we got ends before our current offset, skip to
2160                  * the next extent.
2161                  */
2162                 if (extent_end <= cur_offset) {
2163                         path->slots[0]++;
2164                         goto next_slot;
2165                 }
2166
2167                 nocow_args.start = cur_offset;
2168                 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2169                 if (ret < 0) {
2170                         if (cow_start != (u64)-1)
2171                                 cur_offset = cow_start;
2172                         goto error;
2173                 } else if (ret == 0) {
2174                         goto out_check;
2175                 }
2176
2177                 ret = 0;
2178                 bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
2179                 if (bg)
2180                         nocow = true;
2181 out_check:
2182                 /*
2183                  * If nocow is false then record the beginning of the range
2184                  * that needs to be COWed
2185                  */
2186                 if (!nocow) {
2187                         if (cow_start == (u64)-1)
2188                                 cow_start = cur_offset;
2189                         cur_offset = extent_end;
2190                         if (cur_offset > end)
2191                                 break;
2192                         if (!path->nodes[0])
2193                                 continue;
2194                         path->slots[0]++;
2195                         goto next_slot;
2196                 }
2197
2198                 /*
2199                  * COW range from cow_start to found_key.offset - 1. As the key
2200                  * will contain the beginning of the first extent that can be
2201                  * NOCOW, following one which needs to be COW'ed
2202                  */
2203                 if (cow_start != (u64)-1) {
2204                         ret = fallback_to_cow(inode, locked_page,
2205                                               cow_start, found_key.offset - 1);
2206                         if (ret)
2207                                 goto error;
2208                         cow_start = (u64)-1;
2209                 }
2210
2211                 nocow_end = cur_offset + nocow_args.num_bytes - 1;
2212                 is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
2213                 if (is_prealloc) {
2214                         u64 orig_start = found_key.offset - nocow_args.extent_offset;
2215                         struct extent_map *em;
2216
2217                         em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
2218                                           orig_start,
2219                                           nocow_args.disk_bytenr, /* block_start */
2220                                           nocow_args.num_bytes, /* block_len */
2221                                           nocow_args.disk_num_bytes, /* orig_block_len */
2222                                           ram_bytes, BTRFS_COMPRESS_NONE,
2223                                           BTRFS_ORDERED_PREALLOC);
2224                         if (IS_ERR(em)) {
2225                                 ret = PTR_ERR(em);
2226                                 goto error;
2227                         }
2228                         free_extent_map(em);
2229                 }
2230
2231                 ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
2232                                 nocow_args.num_bytes, nocow_args.num_bytes,
2233                                 nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
2234                                 is_prealloc
2235                                 ? (1 << BTRFS_ORDERED_PREALLOC)
2236                                 : (1 << BTRFS_ORDERED_NOCOW),
2237                                 BTRFS_COMPRESS_NONE);
2238                 if (IS_ERR(ordered)) {
2239                         if (is_prealloc) {
2240                                 btrfs_drop_extent_map_range(inode, cur_offset,
2241                                                             nocow_end, false);
2242                         }
2243                         ret = PTR_ERR(ordered);
2244                         goto error;
2245                 }
2246
2247                 if (nocow) {
2248                         btrfs_dec_nocow_writers(bg);
2249                         nocow = false;
2250                 }
2251
2252                 if (btrfs_is_data_reloc_root(root))
2253                         /*
2254                          * Error handled later, as we must prevent
2255                          * extent_clear_unlock_delalloc() in error handler
2256                          * from freeing metadata of created ordered extent.
2257                          */
2258                         ret = btrfs_reloc_clone_csums(ordered);
2259                 btrfs_put_ordered_extent(ordered);
2260
2261                 extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
2262                                              locked_page, EXTENT_LOCKED |
2263                                              EXTENT_DELALLOC |
2264                                              EXTENT_CLEAR_DATA_RESV,
2265                                              PAGE_UNLOCK | PAGE_SET_ORDERED);
2266
2267                 cur_offset = extent_end;
2268
2269                 /*
2270                  * btrfs_reloc_clone_csums() error, now we're OK to call error
2271                  * handler, as metadata for created ordered extent will only
2272                  * be freed by btrfs_finish_ordered_io().
2273                  */
2274                 if (ret)
2275                         goto error;
2276                 if (cur_offset > end)
2277                         break;
2278         }
2279         btrfs_release_path(path);
2280
2281         if (cur_offset <= end && cow_start == (u64)-1)
2282                 cow_start = cur_offset;
2283
2284         if (cow_start != (u64)-1) {
2285                 cur_offset = end;
2286                 ret = fallback_to_cow(inode, locked_page, cow_start, end);
2287                 if (ret)
2288                         goto error;
2289         }
2290
2291 error:
2292         if (nocow)
2293                 btrfs_dec_nocow_writers(bg);
2294
2295         if (ret && cur_offset < end)
2296                 extent_clear_unlock_delalloc(inode, cur_offset, end,
2297                                              locked_page, EXTENT_LOCKED |
2298                                              EXTENT_DELALLOC | EXTENT_DEFRAG |
2299                                              EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2300                                              PAGE_START_WRITEBACK |
2301                                              PAGE_END_WRITEBACK);
2302         btrfs_free_path(path);
2303         return ret;
2304 }
2305
2306 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2307 {
2308         if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2309                 if (inode->defrag_bytes &&
2310                     test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
2311                                    0, NULL))
2312                         return false;
2313                 return true;
2314         }
2315         return false;
2316 }
2317
2318 /*
2319  * Function to process delayed allocation (create CoW) for ranges which are
2320  * being touched for the first time.
2321  */
2322 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
2323                              u64 start, u64 end, struct writeback_control *wbc)
2324 {
2325         const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2326         int ret;
2327
2328         /*
2329          * The range must cover part of the @locked_page, or a return of 1
2330          * can confuse the caller.
2331          */
2332         ASSERT(!(end <= page_offset(locked_page) ||
2333                  start >= page_offset(locked_page) + PAGE_SIZE));
2334
2335         if (should_nocow(inode, start, end)) {
2336                 /*
2337                  * Normally on a zoned device we're only doing COW writes, but
2338                  * in case of relocation on a zoned filesystem we have taken
2339                  * precaution, that we're only writing sequentially. It's safe
2340                  * to use run_delalloc_nocow() here, like for  regular
2341                  * preallocated inodes.
2342                  */
2343                 ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
2344                 ret = run_delalloc_nocow(inode, locked_page, start, end);
2345                 goto out;
2346         }
2347
2348         if (btrfs_inode_can_compress(inode) &&
2349             inode_need_compress(inode, start, end) &&
2350             run_delalloc_compressed(inode, locked_page, start, end, wbc))
2351                 return 1;
2352
2353         if (zoned)
2354                 ret = run_delalloc_zoned(inode, locked_page, start, end, wbc);
2355         else
2356                 ret = cow_file_range(inode, locked_page, start, end, NULL,
2357                                      false, false);
2358
2359 out:
2360         if (ret < 0)
2361                 btrfs_cleanup_ordered_extents(inode, locked_page, start,
2362                                               end - start + 1);
2363         return ret;
2364 }
2365
2366 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2367                                  struct extent_state *orig, u64 split)
2368 {
2369         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2370         u64 size;
2371
2372         /* not delalloc, ignore it */
2373         if (!(orig->state & EXTENT_DELALLOC))
2374                 return;
2375
2376         size = orig->end - orig->start + 1;
2377         if (size > fs_info->max_extent_size) {
2378                 u32 num_extents;
2379                 u64 new_size;
2380
2381                 /*
2382                  * See the explanation in btrfs_merge_delalloc_extent, the same
2383                  * applies here, just in reverse.
2384                  */
2385                 new_size = orig->end - split + 1;
2386                 num_extents = count_max_extents(fs_info, new_size);
2387                 new_size = split - orig->start;
2388                 num_extents += count_max_extents(fs_info, new_size);
2389                 if (count_max_extents(fs_info, size) >= num_extents)
2390                         return;
2391         }
2392
2393         spin_lock(&inode->lock);
2394         btrfs_mod_outstanding_extents(inode, 1);
2395         spin_unlock(&inode->lock);
2396 }
2397
2398 /*
2399  * Handle merged delayed allocation extents so we can keep track of new extents
2400  * that are just merged onto old extents, such as when we are doing sequential
2401  * writes, so we can properly account for the metadata space we'll need.
2402  */
2403 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2404                                  struct extent_state *other)
2405 {
2406         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2407         u64 new_size, old_size;
2408         u32 num_extents;
2409
2410         /* not delalloc, ignore it */
2411         if (!(other->state & EXTENT_DELALLOC))
2412                 return;
2413
2414         if (new->start > other->start)
2415                 new_size = new->end - other->start + 1;
2416         else
2417                 new_size = other->end - new->start + 1;
2418
2419         /* we're not bigger than the max, unreserve the space and go */
2420         if (new_size <= fs_info->max_extent_size) {
2421                 spin_lock(&inode->lock);
2422                 btrfs_mod_outstanding_extents(inode, -1);
2423                 spin_unlock(&inode->lock);
2424                 return;
2425         }
2426
2427         /*
2428          * We have to add up either side to figure out how many extents were
2429          * accounted for before we merged into one big extent.  If the number of
2430          * extents we accounted for is <= the amount we need for the new range
2431          * then we can return, otherwise drop.  Think of it like this
2432          *
2433          * [ 4k][MAX_SIZE]
2434          *
2435          * So we've grown the extent by a MAX_SIZE extent, this would mean we
2436          * need 2 outstanding extents, on one side we have 1 and the other side
2437          * we have 1 so they are == and we can return.  But in this case
2438          *
2439          * [MAX_SIZE+4k][MAX_SIZE+4k]
2440          *
2441          * Each range on their own accounts for 2 extents, but merged together
2442          * they are only 3 extents worth of accounting, so we need to drop in
2443          * this case.
2444          */
2445         old_size = other->end - other->start + 1;
2446         num_extents = count_max_extents(fs_info, old_size);
2447         old_size = new->end - new->start + 1;
2448         num_extents += count_max_extents(fs_info, old_size);
2449         if (count_max_extents(fs_info, new_size) >= num_extents)
2450                 return;
2451
2452         spin_lock(&inode->lock);
2453         btrfs_mod_outstanding_extents(inode, -1);
2454         spin_unlock(&inode->lock);
2455 }
2456
2457 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2458                                       struct btrfs_inode *inode)
2459 {
2460         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2461
2462         spin_lock(&root->delalloc_lock);
2463         if (list_empty(&inode->delalloc_inodes)) {
2464                 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2465                 set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
2466                 root->nr_delalloc_inodes++;
2467                 if (root->nr_delalloc_inodes == 1) {
2468                         spin_lock(&fs_info->delalloc_root_lock);
2469                         BUG_ON(!list_empty(&root->delalloc_root));
2470                         list_add_tail(&root->delalloc_root,
2471                                       &fs_info->delalloc_roots);
2472                         spin_unlock(&fs_info->delalloc_root_lock);
2473                 }
2474         }
2475         spin_unlock(&root->delalloc_lock);
2476 }
2477
2478 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2479                                 struct btrfs_inode *inode)
2480 {
2481         struct btrfs_fs_info *fs_info = root->fs_info;
2482
2483         if (!list_empty(&inode->delalloc_inodes)) {
2484                 list_del_init(&inode->delalloc_inodes);
2485                 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2486                           &inode->runtime_flags);
2487                 root->nr_delalloc_inodes--;
2488                 if (!root->nr_delalloc_inodes) {
2489                         ASSERT(list_empty(&root->delalloc_inodes));
2490                         spin_lock(&fs_info->delalloc_root_lock);
2491                         BUG_ON(list_empty(&root->delalloc_root));
2492                         list_del_init(&root->delalloc_root);
2493                         spin_unlock(&fs_info->delalloc_root_lock);
2494                 }
2495         }
2496 }
2497
2498 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2499                                      struct btrfs_inode *inode)
2500 {
2501         spin_lock(&root->delalloc_lock);
2502         __btrfs_del_delalloc_inode(root, inode);
2503         spin_unlock(&root->delalloc_lock);
2504 }
2505
2506 /*
2507  * Properly track delayed allocation bytes in the inode and to maintain the
2508  * list of inodes that have pending delalloc work to be done.
2509  */
2510 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2511                                u32 bits)
2512 {
2513         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2514
2515         if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2516                 WARN_ON(1);
2517         /*
2518          * set_bit and clear bit hooks normally require _irqsave/restore
2519          * but in this case, we are only testing for the DELALLOC
2520          * bit, which is only set or cleared with irqs on
2521          */
2522         if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2523                 struct btrfs_root *root = inode->root;
2524                 u64 len = state->end + 1 - state->start;
2525                 u32 num_extents = count_max_extents(fs_info, len);
2526                 bool do_list = !btrfs_is_free_space_inode(inode);
2527
2528                 spin_lock(&inode->lock);
2529                 btrfs_mod_outstanding_extents(inode, num_extents);
2530                 spin_unlock(&inode->lock);
2531
2532                 /* For sanity tests */
2533                 if (btrfs_is_testing(fs_info))
2534                         return;
2535
2536                 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2537                                          fs_info->delalloc_batch);
2538                 spin_lock(&inode->lock);
2539                 inode->delalloc_bytes += len;
2540                 if (bits & EXTENT_DEFRAG)
2541                         inode->defrag_bytes += len;
2542                 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2543                                          &inode->runtime_flags))
2544                         btrfs_add_delalloc_inodes(root, inode);
2545                 spin_unlock(&inode->lock);
2546         }
2547
2548         if (!(state->state & EXTENT_DELALLOC_NEW) &&
2549             (bits & EXTENT_DELALLOC_NEW)) {
2550                 spin_lock(&inode->lock);
2551                 inode->new_delalloc_bytes += state->end + 1 - state->start;
2552                 spin_unlock(&inode->lock);
2553         }
2554 }
2555
2556 /*
2557  * Once a range is no longer delalloc this function ensures that proper
2558  * accounting happens.
2559  */
2560 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2561                                  struct extent_state *state, u32 bits)
2562 {
2563         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2564         u64 len = state->end + 1 - state->start;
2565         u32 num_extents = count_max_extents(fs_info, len);
2566
2567         if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2568                 spin_lock(&inode->lock);
2569                 inode->defrag_bytes -= len;
2570                 spin_unlock(&inode->lock);
2571         }
2572
2573         /*
2574          * set_bit and clear bit hooks normally require _irqsave/restore
2575          * but in this case, we are only testing for the DELALLOC
2576          * bit, which is only set or cleared with irqs on
2577          */
2578         if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2579                 struct btrfs_root *root = inode->root;
2580                 bool do_list = !btrfs_is_free_space_inode(inode);
2581
2582                 spin_lock(&inode->lock);
2583                 btrfs_mod_outstanding_extents(inode, -num_extents);
2584                 spin_unlock(&inode->lock);
2585
2586                 /*
2587                  * We don't reserve metadata space for space cache inodes so we
2588                  * don't need to call delalloc_release_metadata if there is an
2589                  * error.
2590                  */
2591                 if (bits & EXTENT_CLEAR_META_RESV &&
2592                     root != fs_info->tree_root)
2593                         btrfs_delalloc_release_metadata(inode, len, false);
2594
2595                 /* For sanity tests. */
2596                 if (btrfs_is_testing(fs_info))
2597                         return;
2598
2599                 if (!btrfs_is_data_reloc_root(root) &&
2600                     do_list && !(state->state & EXTENT_NORESERVE) &&
2601                     (bits & EXTENT_CLEAR_DATA_RESV))
2602                         btrfs_free_reserved_data_space_noquota(fs_info, len);
2603
2604                 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2605                                          fs_info->delalloc_batch);
2606                 spin_lock(&inode->lock);
2607                 inode->delalloc_bytes -= len;
2608                 if (do_list && inode->delalloc_bytes == 0 &&
2609                     test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2610                                         &inode->runtime_flags))
2611                         btrfs_del_delalloc_inode(root, inode);
2612                 spin_unlock(&inode->lock);
2613         }
2614
2615         if ((state->state & EXTENT_DELALLOC_NEW) &&
2616             (bits & EXTENT_DELALLOC_NEW)) {
2617                 spin_lock(&inode->lock);
2618                 ASSERT(inode->new_delalloc_bytes >= len);
2619                 inode->new_delalloc_bytes -= len;
2620                 if (bits & EXTENT_ADD_INODE_BYTES)
2621                         inode_add_bytes(&inode->vfs_inode, len);
2622                 spin_unlock(&inode->lock);
2623         }
2624 }
2625
2626 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
2627                                         struct btrfs_ordered_extent *ordered)
2628 {
2629         u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
2630         u64 len = bbio->bio.bi_iter.bi_size;
2631         struct btrfs_ordered_extent *new;
2632         int ret;
2633
2634         /* Must always be called for the beginning of an ordered extent. */
2635         if (WARN_ON_ONCE(start != ordered->disk_bytenr))
2636                 return -EINVAL;
2637
2638         /* No need to split if the ordered extent covers the entire bio. */
2639         if (ordered->disk_num_bytes == len) {
2640                 refcount_inc(&ordered->refs);
2641                 bbio->ordered = ordered;
2642                 return 0;
2643         }
2644
2645         /*
2646          * Don't split the extent_map for NOCOW extents, as we're writing into
2647          * a pre-existing one.
2648          */
2649         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
2650                 ret = split_extent_map(bbio->inode, bbio->file_offset,
2651                                        ordered->num_bytes, len,
2652                                        ordered->disk_bytenr);
2653                 if (ret)
2654                         return ret;
2655         }
2656
2657         new = btrfs_split_ordered_extent(ordered, len);
2658         if (IS_ERR(new))
2659                 return PTR_ERR(new);
2660         bbio->ordered = new;
2661         return 0;
2662 }
2663
2664 /*
2665  * given a list of ordered sums record them in the inode.  This happens
2666  * at IO completion time based on sums calculated at bio submission time.
2667  */
2668 static int add_pending_csums(struct btrfs_trans_handle *trans,
2669                              struct list_head *list)
2670 {
2671         struct btrfs_ordered_sum *sum;
2672         struct btrfs_root *csum_root = NULL;
2673         int ret;
2674
2675         list_for_each_entry(sum, list, list) {
2676                 trans->adding_csums = true;
2677                 if (!csum_root)
2678                         csum_root = btrfs_csum_root(trans->fs_info,
2679                                                     sum->logical);
2680                 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2681                 trans->adding_csums = false;
2682                 if (ret)
2683                         return ret;
2684         }
2685         return 0;
2686 }
2687
2688 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2689                                          const u64 start,
2690                                          const u64 len,
2691                                          struct extent_state **cached_state)
2692 {
2693         u64 search_start = start;
2694         const u64 end = start + len - 1;
2695
2696         while (search_start < end) {
2697                 const u64 search_len = end - search_start + 1;
2698                 struct extent_map *em;
2699                 u64 em_len;
2700                 int ret = 0;
2701
2702                 em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2703                 if (IS_ERR(em))
2704                         return PTR_ERR(em);
2705
2706                 if (em->block_start != EXTENT_MAP_HOLE)
2707                         goto next;
2708
2709                 em_len = em->len;
2710                 if (em->start < search_start)
2711                         em_len -= search_start - em->start;
2712                 if (em_len > search_len)
2713                         em_len = search_len;
2714
2715                 ret = set_extent_bit(&inode->io_tree, search_start,
2716                                      search_start + em_len - 1,
2717                                      EXTENT_DELALLOC_NEW, cached_state);
2718 next:
2719                 search_start = extent_map_end(em);
2720                 free_extent_map(em);
2721                 if (ret)
2722                         return ret;
2723         }
2724         return 0;
2725 }
2726
2727 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2728                               unsigned int extra_bits,
2729                               struct extent_state **cached_state)
2730 {
2731         WARN_ON(PAGE_ALIGNED(end));
2732
2733         if (start >= i_size_read(&inode->vfs_inode) &&
2734             !(inode->flags & BTRFS_INODE_PREALLOC)) {
2735                 /*
2736                  * There can't be any extents following eof in this case so just
2737                  * set the delalloc new bit for the range directly.
2738                  */
2739                 extra_bits |= EXTENT_DELALLOC_NEW;
2740         } else {
2741                 int ret;
2742
2743                 ret = btrfs_find_new_delalloc_bytes(inode, start,
2744                                                     end + 1 - start,
2745                                                     cached_state);
2746                 if (ret)
2747                         return ret;
2748         }
2749
2750         return set_extent_bit(&inode->io_tree, start, end,
2751                               EXTENT_DELALLOC | extra_bits, cached_state);
2752 }
2753
2754 /* see btrfs_writepage_start_hook for details on why this is required */
2755 struct btrfs_writepage_fixup {
2756         struct page *page;
2757         struct btrfs_inode *inode;
2758         struct btrfs_work work;
2759 };
2760
2761 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2762 {
2763         struct btrfs_writepage_fixup *fixup =
2764                 container_of(work, struct btrfs_writepage_fixup, work);
2765         struct btrfs_ordered_extent *ordered;
2766         struct extent_state *cached_state = NULL;
2767         struct extent_changeset *data_reserved = NULL;
2768         struct page *page = fixup->page;
2769         struct btrfs_inode *inode = fixup->inode;
2770         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2771         u64 page_start = page_offset(page);
2772         u64 page_end = page_offset(page) + PAGE_SIZE - 1;
2773         int ret = 0;
2774         bool free_delalloc_space = true;
2775
2776         /*
2777          * This is similar to page_mkwrite, we need to reserve the space before
2778          * we take the page lock.
2779          */
2780         ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2781                                            PAGE_SIZE);
2782 again:
2783         lock_page(page);
2784
2785         /*
2786          * Before we queued this fixup, we took a reference on the page.
2787          * page->mapping may go NULL, but it shouldn't be moved to a different
2788          * address space.
2789          */
2790         if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2791                 /*
2792                  * Unfortunately this is a little tricky, either
2793                  *
2794                  * 1) We got here and our page had already been dealt with and
2795                  *    we reserved our space, thus ret == 0, so we need to just
2796                  *    drop our space reservation and bail.  This can happen the
2797                  *    first time we come into the fixup worker, or could happen
2798                  *    while waiting for the ordered extent.
2799                  * 2) Our page was already dealt with, but we happened to get an
2800                  *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2801                  *    this case we obviously don't have anything to release, but
2802                  *    because the page was already dealt with we don't want to
2803                  *    mark the page with an error, so make sure we're resetting
2804                  *    ret to 0.  This is why we have this check _before_ the ret
2805                  *    check, because we do not want to have a surprise ENOSPC
2806                  *    when the page was already properly dealt with.
2807                  */
2808                 if (!ret) {
2809                         btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2810                         btrfs_delalloc_release_space(inode, data_reserved,
2811                                                      page_start, PAGE_SIZE,
2812                                                      true);
2813                 }
2814                 ret = 0;
2815                 goto out_page;
2816         }
2817
2818         /*
2819          * We can't mess with the page state unless it is locked, so now that
2820          * it is locked bail if we failed to make our space reservation.
2821          */
2822         if (ret)
2823                 goto out_page;
2824
2825         lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2826
2827         /* already ordered? We're done */
2828         if (PageOrdered(page))
2829                 goto out_reserved;
2830
2831         ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2832         if (ordered) {
2833                 unlock_extent(&inode->io_tree, page_start, page_end,
2834                               &cached_state);
2835                 unlock_page(page);
2836                 btrfs_start_ordered_extent(ordered);
2837                 btrfs_put_ordered_extent(ordered);
2838                 goto again;
2839         }
2840
2841         ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2842                                         &cached_state);
2843         if (ret)
2844                 goto out_reserved;
2845
2846         /*
2847          * Everything went as planned, we're now the owner of a dirty page with
2848          * delayed allocation bits set and space reserved for our COW
2849          * destination.
2850          *
2851          * The page was dirty when we started, nothing should have cleaned it.
2852          */
2853         BUG_ON(!PageDirty(page));
2854         free_delalloc_space = false;
2855 out_reserved:
2856         btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2857         if (free_delalloc_space)
2858                 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2859                                              PAGE_SIZE, true);
2860         unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2861 out_page:
2862         if (ret) {
2863                 /*
2864                  * We hit ENOSPC or other errors.  Update the mapping and page
2865                  * to reflect the errors and clean the page.
2866                  */
2867                 mapping_set_error(page->mapping, ret);
2868                 btrfs_mark_ordered_io_finished(inode, page, page_start,
2869                                                PAGE_SIZE, !ret);
2870                 btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE);
2871                 clear_page_dirty_for_io(page);
2872         }
2873         btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
2874         unlock_page(page);
2875         put_page(page);
2876         kfree(fixup);
2877         extent_changeset_free(data_reserved);
2878         /*
2879          * As a precaution, do a delayed iput in case it would be the last iput
2880          * that could need flushing space. Recursing back to fixup worker would
2881          * deadlock.
2882          */
2883         btrfs_add_delayed_iput(inode);
2884 }
2885
2886 /*
2887  * There are a few paths in the higher layers of the kernel that directly
2888  * set the page dirty bit without asking the filesystem if it is a
2889  * good idea.  This causes problems because we want to make sure COW
2890  * properly happens and the data=ordered rules are followed.
2891  *
2892  * In our case any range that doesn't have the ORDERED bit set
2893  * hasn't been properly setup for IO.  We kick off an async process
2894  * to fix it up.  The async helper will wait for ordered extents, set
2895  * the delalloc bit and make it safe to write the page.
2896  */
2897 int btrfs_writepage_cow_fixup(struct page *page)
2898 {
2899         struct inode *inode = page->mapping->host;
2900         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2901         struct btrfs_writepage_fixup *fixup;
2902
2903         /* This page has ordered extent covering it already */
2904         if (PageOrdered(page))
2905                 return 0;
2906
2907         /*
2908          * PageChecked is set below when we create a fixup worker for this page,
2909          * don't try to create another one if we're already PageChecked()
2910          *
2911          * The extent_io writepage code will redirty the page if we send back
2912          * EAGAIN.
2913          */
2914         if (PageChecked(page))
2915                 return -EAGAIN;
2916
2917         fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2918         if (!fixup)
2919                 return -EAGAIN;
2920
2921         /*
2922          * We are already holding a reference to this inode from
2923          * write_cache_pages.  We need to hold it because the space reservation
2924          * takes place outside of the page lock, and we can't trust
2925          * page->mapping outside of the page lock.
2926          */
2927         ihold(inode);
2928         btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2929         get_page(page);
2930         btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2931         fixup->page = page;
2932         fixup->inode = BTRFS_I(inode);
2933         btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2934
2935         return -EAGAIN;
2936 }
2937
2938 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2939                                        struct btrfs_inode *inode, u64 file_pos,
2940                                        struct btrfs_file_extent_item *stack_fi,
2941                                        const bool update_inode_bytes,
2942                                        u64 qgroup_reserved)
2943 {
2944         struct btrfs_root *root = inode->root;
2945         const u64 sectorsize = root->fs_info->sectorsize;
2946         struct btrfs_path *path;
2947         struct extent_buffer *leaf;
2948         struct btrfs_key ins;
2949         u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2950         u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2951         u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2952         u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2953         u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2954         struct btrfs_drop_extents_args drop_args = { 0 };
2955         int ret;
2956
2957         path = btrfs_alloc_path();
2958         if (!path)
2959                 return -ENOMEM;
2960
2961         /*
2962          * we may be replacing one extent in the tree with another.
2963          * The new extent is pinned in the extent map, and we don't want
2964          * to drop it from the cache until it is completely in the btree.
2965          *
2966          * So, tell btrfs_drop_extents to leave this extent in the cache.
2967          * the caller is expected to unpin it and allow it to be merged
2968          * with the others.
2969          */
2970         drop_args.path = path;
2971         drop_args.start = file_pos;
2972         drop_args.end = file_pos + num_bytes;
2973         drop_args.replace_extent = true;
2974         drop_args.extent_item_size = sizeof(*stack_fi);
2975         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2976         if (ret)
2977                 goto out;
2978
2979         if (!drop_args.extent_inserted) {
2980                 ins.objectid = btrfs_ino(inode);
2981                 ins.offset = file_pos;
2982                 ins.type = BTRFS_EXTENT_DATA_KEY;
2983
2984                 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2985                                               sizeof(*stack_fi));
2986                 if (ret)
2987                         goto out;
2988         }
2989         leaf = path->nodes[0];
2990         btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2991         write_extent_buffer(leaf, stack_fi,
2992                         btrfs_item_ptr_offset(leaf, path->slots[0]),
2993                         sizeof(struct btrfs_file_extent_item));
2994
2995         btrfs_mark_buffer_dirty(leaf);
2996         btrfs_release_path(path);
2997
2998         /*
2999          * If we dropped an inline extent here, we know the range where it is
3000          * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
3001          * number of bytes only for that range containing the inline extent.
3002          * The remaining of the range will be processed when clearning the
3003          * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
3004          */
3005         if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
3006                 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
3007
3008                 inline_size = drop_args.bytes_found - inline_size;
3009                 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
3010                 drop_args.bytes_found -= inline_size;
3011                 num_bytes -= sectorsize;
3012         }
3013
3014         if (update_inode_bytes)
3015                 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
3016
3017         ins.objectid = disk_bytenr;
3018         ins.offset = disk_num_bytes;
3019         ins.type = BTRFS_EXTENT_ITEM_KEY;
3020
3021         ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
3022         if (ret)
3023                 goto out;
3024
3025         ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
3026                                                file_pos - offset,
3027                                                qgroup_reserved, &ins);
3028 out:
3029         btrfs_free_path(path);
3030
3031         return ret;
3032 }
3033
3034 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
3035                                          u64 start, u64 len)
3036 {
3037         struct btrfs_block_group *cache;
3038
3039         cache = btrfs_lookup_block_group(fs_info, start);
3040         ASSERT(cache);
3041
3042         spin_lock(&cache->lock);
3043         cache->delalloc_bytes -= len;
3044         spin_unlock(&cache->lock);
3045
3046         btrfs_put_block_group(cache);
3047 }
3048
3049 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
3050                                              struct btrfs_ordered_extent *oe)
3051 {
3052         struct btrfs_file_extent_item stack_fi;
3053         bool update_inode_bytes;
3054         u64 num_bytes = oe->num_bytes;
3055         u64 ram_bytes = oe->ram_bytes;
3056
3057         memset(&stack_fi, 0, sizeof(stack_fi));
3058         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
3059         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
3060         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
3061                                                    oe->disk_num_bytes);
3062         btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
3063         if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
3064                 num_bytes = oe->truncated_len;
3065                 ram_bytes = num_bytes;
3066         }
3067         btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
3068         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
3069         btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
3070         /* Encryption and other encoding is reserved and all 0 */
3071
3072         /*
3073          * For delalloc, when completing an ordered extent we update the inode's
3074          * bytes when clearing the range in the inode's io tree, so pass false
3075          * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3076          * except if the ordered extent was truncated.
3077          */
3078         update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3079                              test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3080                              test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3081
3082         return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3083                                            oe->file_offset, &stack_fi,
3084                                            update_inode_bytes, oe->qgroup_rsv);
3085 }
3086
3087 /*
3088  * As ordered data IO finishes, this gets called so we can finish
3089  * an ordered extent if the range of bytes in the file it covers are
3090  * fully written.
3091  */
3092 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3093 {
3094         struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3095         struct btrfs_root *root = inode->root;
3096         struct btrfs_fs_info *fs_info = root->fs_info;
3097         struct btrfs_trans_handle *trans = NULL;
3098         struct extent_io_tree *io_tree = &inode->io_tree;
3099         struct extent_state *cached_state = NULL;
3100         u64 start, end;
3101         int compress_type = 0;
3102         int ret = 0;
3103         u64 logical_len = ordered_extent->num_bytes;
3104         bool freespace_inode;
3105         bool truncated = false;
3106         bool clear_reserved_extent = true;
3107         unsigned int clear_bits = EXTENT_DEFRAG;
3108
3109         start = ordered_extent->file_offset;
3110         end = start + ordered_extent->num_bytes - 1;
3111
3112         if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3113             !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3114             !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3115             !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3116                 clear_bits |= EXTENT_DELALLOC_NEW;
3117
3118         freespace_inode = btrfs_is_free_space_inode(inode);
3119         if (!freespace_inode)
3120                 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3121
3122         if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3123                 ret = -EIO;
3124                 goto out;
3125         }
3126
3127         if (btrfs_is_zoned(fs_info))
3128                 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3129                                         ordered_extent->disk_num_bytes);
3130
3131         if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3132                 truncated = true;
3133                 logical_len = ordered_extent->truncated_len;
3134                 /* Truncated the entire extent, don't bother adding */
3135                 if (!logical_len)
3136                         goto out;
3137         }
3138
3139         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3140                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3141
3142                 btrfs_inode_safe_disk_i_size_write(inode, 0);
3143                 if (freespace_inode)
3144                         trans = btrfs_join_transaction_spacecache(root);
3145                 else
3146                         trans = btrfs_join_transaction(root);
3147                 if (IS_ERR(trans)) {
3148                         ret = PTR_ERR(trans);
3149                         trans = NULL;
3150                         goto out;
3151                 }
3152                 trans->block_rsv = &inode->block_rsv;
3153                 ret = btrfs_update_inode_fallback(trans, root, inode);
3154                 if (ret) /* -ENOMEM or corruption */
3155                         btrfs_abort_transaction(trans, ret);
3156                 goto out;
3157         }
3158
3159         clear_bits |= EXTENT_LOCKED;
3160         lock_extent(io_tree, start, end, &cached_state);
3161
3162         if (freespace_inode)
3163                 trans = btrfs_join_transaction_spacecache(root);
3164         else
3165                 trans = btrfs_join_transaction(root);
3166         if (IS_ERR(trans)) {
3167                 ret = PTR_ERR(trans);
3168                 trans = NULL;
3169                 goto out;
3170         }
3171
3172         trans->block_rsv = &inode->block_rsv;
3173
3174         if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3175                 compress_type = ordered_extent->compress_type;
3176         if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3177                 BUG_ON(compress_type);
3178                 ret = btrfs_mark_extent_written(trans, inode,
3179                                                 ordered_extent->file_offset,
3180                                                 ordered_extent->file_offset +
3181                                                 logical_len);
3182                 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3183                                                   ordered_extent->disk_num_bytes);
3184         } else {
3185                 BUG_ON(root == fs_info->tree_root);
3186                 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3187                 if (!ret) {
3188                         clear_reserved_extent = false;
3189                         btrfs_release_delalloc_bytes(fs_info,
3190                                                 ordered_extent->disk_bytenr,
3191                                                 ordered_extent->disk_num_bytes);
3192                 }
3193         }
3194         unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3195                            ordered_extent->num_bytes, trans->transid);
3196         if (ret < 0) {
3197                 btrfs_abort_transaction(trans, ret);
3198                 goto out;
3199         }
3200
3201         ret = add_pending_csums(trans, &ordered_extent->list);
3202         if (ret) {
3203                 btrfs_abort_transaction(trans, ret);
3204                 goto out;
3205         }
3206
3207         /*
3208          * If this is a new delalloc range, clear its new delalloc flag to
3209          * update the inode's number of bytes. This needs to be done first
3210          * before updating the inode item.
3211          */
3212         if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3213             !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3214                 clear_extent_bit(&inode->io_tree, start, end,
3215                                  EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3216                                  &cached_state);
3217
3218         btrfs_inode_safe_disk_i_size_write(inode, 0);
3219         ret = btrfs_update_inode_fallback(trans, root, inode);
3220         if (ret) { /* -ENOMEM or corruption */
3221                 btrfs_abort_transaction(trans, ret);
3222                 goto out;
3223         }
3224         ret = 0;
3225 out:
3226         clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3227                          &cached_state);
3228
3229         if (trans)
3230                 btrfs_end_transaction(trans);
3231
3232         if (ret || truncated) {
3233                 u64 unwritten_start = start;
3234
3235                 /*
3236                  * If we failed to finish this ordered extent for any reason we
3237                  * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3238                  * extent, and mark the inode with the error if it wasn't
3239                  * already set.  Any error during writeback would have already
3240                  * set the mapping error, so we need to set it if we're the ones
3241                  * marking this ordered extent as failed.
3242                  */
3243                 if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3244                                              &ordered_extent->flags))
3245                         mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3246
3247                 if (truncated)
3248                         unwritten_start += logical_len;
3249                 clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3250
3251                 /* Drop extent maps for the part of the extent we didn't write. */
3252                 btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
3253
3254                 /*
3255                  * If the ordered extent had an IOERR or something else went
3256                  * wrong we need to return the space for this ordered extent
3257                  * back to the allocator.  We only free the extent in the
3258                  * truncated case if we didn't write out the extent at all.
3259                  *
3260                  * If we made it past insert_reserved_file_extent before we
3261                  * errored out then we don't need to do this as the accounting
3262                  * has already been done.
3263                  */
3264                 if ((ret || !logical_len) &&
3265                     clear_reserved_extent &&
3266                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3267                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3268                         /*
3269                          * Discard the range before returning it back to the
3270                          * free space pool
3271                          */
3272                         if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3273                                 btrfs_discard_extent(fs_info,
3274                                                 ordered_extent->disk_bytenr,
3275                                                 ordered_extent->disk_num_bytes,
3276                                                 NULL);
3277                         btrfs_free_reserved_extent(fs_info,
3278                                         ordered_extent->disk_bytenr,
3279                                         ordered_extent->disk_num_bytes, 1);
3280                         /*
3281                          * Actually free the qgroup rsv which was released when
3282                          * the ordered extent was created.
3283                          */
3284                         btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
3285                                                   ordered_extent->qgroup_rsv,
3286                                                   BTRFS_QGROUP_RSV_DATA);
3287                 }
3288         }
3289
3290         /*
3291          * This needs to be done to make sure anybody waiting knows we are done
3292          * updating everything for this ordered extent.
3293          */
3294         btrfs_remove_ordered_extent(inode, ordered_extent);
3295
3296         /* once for us */
3297         btrfs_put_ordered_extent(ordered_extent);
3298         /* once for the tree */
3299         btrfs_put_ordered_extent(ordered_extent);
3300
3301         return ret;
3302 }
3303
3304 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3305 {
3306         if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
3307             !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
3308                 btrfs_finish_ordered_zoned(ordered);
3309         return btrfs_finish_one_ordered(ordered);
3310 }
3311
3312 /*
3313  * Verify the checksum for a single sector without any extra action that depend
3314  * on the type of I/O.
3315  */
3316 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3317                             u32 pgoff, u8 *csum, const u8 * const csum_expected)
3318 {
3319         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3320         char *kaddr;
3321
3322         ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3323
3324         shash->tfm = fs_info->csum_shash;
3325
3326         kaddr = kmap_local_page(page) + pgoff;
3327         crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3328         kunmap_local(kaddr);
3329
3330         if (memcmp(csum, csum_expected, fs_info->csum_size))
3331                 return -EIO;
3332         return 0;
3333 }
3334
3335 /*
3336  * Verify the checksum of a single data sector.
3337  *
3338  * @bbio:       btrfs_io_bio which contains the csum
3339  * @dev:        device the sector is on
3340  * @bio_offset: offset to the beginning of the bio (in bytes)
3341  * @bv:         bio_vec to check
3342  *
3343  * Check if the checksum on a data block is valid.  When a checksum mismatch is
3344  * detected, report the error and fill the corrupted range with zero.
3345  *
3346  * Return %true if the sector is ok or had no checksum to start with, else %false.
3347  */
3348 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3349                         u32 bio_offset, struct bio_vec *bv)
3350 {
3351         struct btrfs_inode *inode = bbio->inode;
3352         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3353         u64 file_offset = bbio->file_offset + bio_offset;
3354         u64 end = file_offset + bv->bv_len - 1;
3355         u8 *csum_expected;
3356         u8 csum[BTRFS_CSUM_SIZE];
3357
3358         ASSERT(bv->bv_len == fs_info->sectorsize);
3359
3360         if (!bbio->csum)
3361                 return true;
3362
3363         if (btrfs_is_data_reloc_root(inode->root) &&
3364             test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3365                            1, NULL)) {
3366                 /* Skip the range without csum for data reloc inode */
3367                 clear_extent_bits(&inode->io_tree, file_offset, end,
3368                                   EXTENT_NODATASUM);
3369                 return true;
3370         }
3371
3372         csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3373                                 fs_info->csum_size;
3374         if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3375                                     csum_expected))
3376                 goto zeroit;
3377         return true;
3378
3379 zeroit:
3380         btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3381                                     bbio->mirror_num);
3382         if (dev)
3383                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3384         memzero_bvec(bv);
3385         return false;
3386 }
3387
3388 /*
3389  * btrfs_add_delayed_iput - perform a delayed iput on @inode
3390  *
3391  * @inode: The inode we want to perform iput on
3392  *
3393  * This function uses the generic vfs_inode::i_count to track whether we should
3394  * just decrement it (in case it's > 1) or if this is the last iput then link
3395  * the inode to the delayed iput machinery. Delayed iputs are processed at
3396  * transaction commit time/superblock commit/cleaner kthread.
3397  */
3398 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3399 {
3400         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3401         unsigned long flags;
3402
3403         if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3404                 return;
3405
3406         atomic_inc(&fs_info->nr_delayed_iputs);
3407         /*
3408          * Need to be irq safe here because we can be called from either an irq
3409          * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3410          * context.
3411          */
3412         spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3413         ASSERT(list_empty(&inode->delayed_iput));
3414         list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3415         spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3416         if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3417                 wake_up_process(fs_info->cleaner_kthread);
3418 }
3419
3420 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3421                                     struct btrfs_inode *inode)
3422 {
3423         list_del_init(&inode->delayed_iput);
3424         spin_unlock_irq(&fs_info->delayed_iput_lock);
3425         iput(&inode->vfs_inode);
3426         if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3427                 wake_up(&fs_info->delayed_iputs_wait);
3428         spin_lock_irq(&fs_info->delayed_iput_lock);
3429 }
3430
3431 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3432                                    struct btrfs_inode *inode)
3433 {
3434         if (!list_empty(&inode->delayed_iput)) {
3435                 spin_lock_irq(&fs_info->delayed_iput_lock);
3436                 if (!list_empty(&inode->delayed_iput))
3437                         run_delayed_iput_locked(fs_info, inode);
3438                 spin_unlock_irq(&fs_info->delayed_iput_lock);
3439         }
3440 }
3441
3442 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3443 {
3444         /*
3445          * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3446          * calls btrfs_add_delayed_iput() and that needs to lock
3447          * fs_info->delayed_iput_lock. So we need to disable irqs here to
3448          * prevent a deadlock.
3449          */
3450         spin_lock_irq(&fs_info->delayed_iput_lock);
3451         while (!list_empty(&fs_info->delayed_iputs)) {
3452                 struct btrfs_inode *inode;
3453
3454                 inode = list_first_entry(&fs_info->delayed_iputs,
3455                                 struct btrfs_inode, delayed_iput);
3456                 run_delayed_iput_locked(fs_info, inode);
3457                 if (need_resched()) {
3458                         spin_unlock_irq(&fs_info->delayed_iput_lock);
3459                         cond_resched();
3460                         spin_lock_irq(&fs_info->delayed_iput_lock);
3461                 }
3462         }
3463         spin_unlock_irq(&fs_info->delayed_iput_lock);
3464 }
3465
3466 /*
3467  * Wait for flushing all delayed iputs
3468  *
3469  * @fs_info:  the filesystem
3470  *
3471  * This will wait on any delayed iputs that are currently running with KILLABLE
3472  * set.  Once they are all done running we will return, unless we are killed in
3473  * which case we return EINTR. This helps in user operations like fallocate etc
3474  * that might get blocked on the iputs.
3475  *
3476  * Return EINTR if we were killed, 0 if nothing's pending
3477  */
3478 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3479 {
3480         int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3481                         atomic_read(&fs_info->nr_delayed_iputs) == 0);
3482         if (ret)
3483                 return -EINTR;
3484         return 0;
3485 }
3486
3487 /*
3488  * This creates an orphan entry for the given inode in case something goes wrong
3489  * in the middle of an unlink.
3490  */
3491 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3492                      struct btrfs_inode *inode)
3493 {
3494         int ret;
3495
3496         ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3497         if (ret && ret != -EEXIST) {
3498                 btrfs_abort_transaction(trans, ret);
3499                 return ret;
3500         }
3501
3502         return 0;
3503 }
3504
3505 /*
3506  * We have done the delete so we can go ahead and remove the orphan item for
3507  * this particular inode.
3508  */
3509 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3510                             struct btrfs_inode *inode)
3511 {
3512         return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3513 }
3514
3515 /*
3516  * this cleans up any orphans that may be left on the list from the last use
3517  * of this root.
3518  */
3519 int btrfs_orphan_cleanup(struct btrfs_root *root)
3520 {
3521         struct btrfs_fs_info *fs_info = root->fs_info;
3522         struct btrfs_path *path;
3523         struct extent_buffer *leaf;
3524         struct btrfs_key key, found_key;
3525         struct btrfs_trans_handle *trans;
3526         struct inode *inode;
3527         u64 last_objectid = 0;
3528         int ret = 0, nr_unlink = 0;
3529
3530         if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3531                 return 0;
3532
3533         path = btrfs_alloc_path();
3534         if (!path) {
3535                 ret = -ENOMEM;
3536                 goto out;
3537         }
3538         path->reada = READA_BACK;
3539
3540         key.objectid = BTRFS_ORPHAN_OBJECTID;
3541         key.type = BTRFS_ORPHAN_ITEM_KEY;
3542         key.offset = (u64)-1;
3543
3544         while (1) {
3545                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3546                 if (ret < 0)
3547                         goto out;
3548
3549                 /*
3550                  * if ret == 0 means we found what we were searching for, which
3551                  * is weird, but possible, so only screw with path if we didn't
3552                  * find the key and see if we have stuff that matches
3553                  */
3554                 if (ret > 0) {
3555                         ret = 0;
3556                         if (path->slots[0] == 0)
3557                                 break;
3558                         path->slots[0]--;
3559                 }
3560
3561                 /* pull out the item */
3562                 leaf = path->nodes[0];
3563                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3564
3565                 /* make sure the item matches what we want */
3566                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3567                         break;
3568                 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3569                         break;
3570
3571                 /* release the path since we're done with it */
3572                 btrfs_release_path(path);
3573
3574                 /*
3575                  * this is where we are basically btrfs_lookup, without the
3576                  * crossing root thing.  we store the inode number in the
3577                  * offset of the orphan item.
3578                  */
3579
3580                 if (found_key.offset == last_objectid) {
3581                         btrfs_err(fs_info,
3582                                   "Error removing orphan entry, stopping orphan cleanup");
3583                         ret = -EINVAL;
3584                         goto out;
3585                 }
3586
3587                 last_objectid = found_key.offset;
3588
3589                 found_key.objectid = found_key.offset;
3590                 found_key.type = BTRFS_INODE_ITEM_KEY;
3591                 found_key.offset = 0;
3592                 inode = btrfs_iget(fs_info->sb, last_objectid, root);
3593                 if (IS_ERR(inode)) {
3594                         ret = PTR_ERR(inode);
3595                         inode = NULL;
3596                         if (ret != -ENOENT)
3597                                 goto out;
3598                 }
3599
3600                 if (!inode && root == fs_info->tree_root) {
3601                         struct btrfs_root *dead_root;
3602                         int is_dead_root = 0;
3603
3604                         /*
3605                          * This is an orphan in the tree root. Currently these
3606                          * could come from 2 sources:
3607                          *  a) a root (snapshot/subvolume) deletion in progress
3608                          *  b) a free space cache inode
3609                          * We need to distinguish those two, as the orphan item
3610                          * for a root must not get deleted before the deletion
3611                          * of the snapshot/subvolume's tree completes.
3612                          *
3613                          * btrfs_find_orphan_roots() ran before us, which has
3614                          * found all deleted roots and loaded them into
3615                          * fs_info->fs_roots_radix. So here we can find if an
3616                          * orphan item corresponds to a deleted root by looking
3617                          * up the root from that radix tree.
3618                          */
3619
3620                         spin_lock(&fs_info->fs_roots_radix_lock);
3621                         dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3622                                                          (unsigned long)found_key.objectid);
3623                         if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3624                                 is_dead_root = 1;
3625                         spin_unlock(&fs_info->fs_roots_radix_lock);
3626
3627                         if (is_dead_root) {
3628                                 /* prevent this orphan from being found again */
3629                                 key.offset = found_key.objectid - 1;
3630                                 continue;
3631                         }
3632
3633                 }
3634
3635                 /*
3636                  * If we have an inode with links, there are a couple of
3637                  * possibilities:
3638                  *
3639                  * 1. We were halfway through creating fsverity metadata for the
3640                  * file. In that case, the orphan item represents incomplete
3641                  * fsverity metadata which must be cleaned up with
3642                  * btrfs_drop_verity_items and deleting the orphan item.
3643
3644                  * 2. Old kernels (before v3.12) used to create an
3645                  * orphan item for truncate indicating that there were possibly
3646                  * extent items past i_size that needed to be deleted. In v3.12,
3647                  * truncate was changed to update i_size in sync with the extent
3648                  * items, but the (useless) orphan item was still created. Since
3649                  * v4.18, we don't create the orphan item for truncate at all.
3650                  *
3651                  * So, this item could mean that we need to do a truncate, but
3652                  * only if this filesystem was last used on a pre-v3.12 kernel
3653                  * and was not cleanly unmounted. The odds of that are quite
3654                  * slim, and it's a pain to do the truncate now, so just delete
3655                  * the orphan item.
3656                  *
3657                  * It's also possible that this orphan item was supposed to be
3658                  * deleted but wasn't. The inode number may have been reused,
3659                  * but either way, we can delete the orphan item.
3660                  */
3661                 if (!inode || inode->i_nlink) {
3662                         if (inode) {
3663                                 ret = btrfs_drop_verity_items(BTRFS_I(inode));
3664                                 iput(inode);
3665                                 inode = NULL;
3666                                 if (ret)
3667                                         goto out;
3668                         }
3669                         trans = btrfs_start_transaction(root, 1);
3670                         if (IS_ERR(trans)) {
3671                                 ret = PTR_ERR(trans);
3672                                 goto out;
3673                         }
3674                         btrfs_debug(fs_info, "auto deleting %Lu",
3675                                     found_key.objectid);
3676                         ret = btrfs_del_orphan_item(trans, root,
3677                                                     found_key.objectid);
3678                         btrfs_end_transaction(trans);
3679                         if (ret)
3680                                 goto out;
3681                         continue;
3682                 }
3683
3684                 nr_unlink++;
3685
3686                 /* this will do delete_inode and everything for us */
3687                 iput(inode);
3688         }
3689         /* release the path since we're done with it */
3690         btrfs_release_path(path);
3691
3692         if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3693                 trans = btrfs_join_transaction(root);
3694                 if (!IS_ERR(trans))
3695                         btrfs_end_transaction(trans);
3696         }
3697
3698         if (nr_unlink)
3699                 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3700
3701 out:
3702         if (ret)
3703                 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3704         btrfs_free_path(path);
3705         return ret;
3706 }
3707
3708 /*
3709  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3710  * don't find any xattrs, we know there can't be any acls.
3711  *
3712  * slot is the slot the inode is in, objectid is the objectid of the inode
3713  */
3714 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3715                                           int slot, u64 objectid,
3716                                           int *first_xattr_slot)
3717 {
3718         u32 nritems = btrfs_header_nritems(leaf);
3719         struct btrfs_key found_key;
3720         static u64 xattr_access = 0;
3721         static u64 xattr_default = 0;
3722         int scanned = 0;
3723
3724         if (!xattr_access) {
3725                 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3726                                         strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3727                 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3728                                         strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3729         }
3730
3731         slot++;
3732         *first_xattr_slot = -1;
3733         while (slot < nritems) {
3734                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3735
3736                 /* we found a different objectid, there must not be acls */
3737                 if (found_key.objectid != objectid)
3738                         return 0;
3739
3740                 /* we found an xattr, assume we've got an acl */
3741                 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3742                         if (*first_xattr_slot == -1)
3743                                 *first_xattr_slot = slot;
3744                         if (found_key.offset == xattr_access ||
3745                             found_key.offset == xattr_default)
3746                                 return 1;
3747                 }
3748
3749                 /*
3750                  * we found a key greater than an xattr key, there can't
3751                  * be any acls later on
3752                  */
3753                 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3754                         return 0;
3755
3756                 slot++;
3757                 scanned++;
3758
3759                 /*
3760                  * it goes inode, inode backrefs, xattrs, extents,
3761                  * so if there are a ton of hard links to an inode there can
3762                  * be a lot of backrefs.  Don't waste time searching too hard,
3763                  * this is just an optimization
3764                  */
3765                 if (scanned >= 8)
3766                         break;
3767         }
3768         /* we hit the end of the leaf before we found an xattr or
3769          * something larger than an xattr.  We have to assume the inode
3770          * has acls
3771          */
3772         if (*first_xattr_slot == -1)
3773                 *first_xattr_slot = slot;
3774         return 1;
3775 }
3776
3777 /*
3778  * read an inode from the btree into the in-memory inode
3779  */
3780 static int btrfs_read_locked_inode(struct inode *inode,
3781                                    struct btrfs_path *in_path)
3782 {
3783         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3784         struct btrfs_path *path = in_path;
3785         struct extent_buffer *leaf;
3786         struct btrfs_inode_item *inode_item;
3787         struct btrfs_root *root = BTRFS_I(inode)->root;
3788         struct btrfs_key location;
3789         unsigned long ptr;
3790         int maybe_acls;
3791         u32 rdev;
3792         int ret;
3793         bool filled = false;
3794         int first_xattr_slot;
3795
3796         ret = btrfs_fill_inode(inode, &rdev);
3797         if (!ret)
3798                 filled = true;
3799
3800         if (!path) {
3801                 path = btrfs_alloc_path();
3802                 if (!path)
3803                         return -ENOMEM;
3804         }
3805
3806         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3807
3808         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3809         if (ret) {
3810                 if (path != in_path)
3811                         btrfs_free_path(path);
3812                 return ret;
3813         }
3814
3815         leaf = path->nodes[0];
3816
3817         if (filled)
3818                 goto cache_index;
3819
3820         inode_item = btrfs_item_ptr(leaf, path->slots[0],
3821                                     struct btrfs_inode_item);
3822         inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3823         set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3824         i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3825         i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3826         btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3827         btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3828                         round_up(i_size_read(inode), fs_info->sectorsize));
3829
3830         inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3831         inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3832
3833         inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3834         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3835
3836         inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3837         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3838
3839         BTRFS_I(inode)->i_otime.tv_sec =
3840                 btrfs_timespec_sec(leaf, &inode_item->otime);
3841         BTRFS_I(inode)->i_otime.tv_nsec =
3842                 btrfs_timespec_nsec(leaf, &inode_item->otime);
3843
3844         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3845         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3846         BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3847
3848         inode_set_iversion_queried(inode,
3849                                    btrfs_inode_sequence(leaf, inode_item));
3850         inode->i_generation = BTRFS_I(inode)->generation;
3851         inode->i_rdev = 0;
3852         rdev = btrfs_inode_rdev(leaf, inode_item);
3853
3854         BTRFS_I(inode)->index_cnt = (u64)-1;
3855         btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3856                                 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3857
3858 cache_index:
3859         /*
3860          * If we were modified in the current generation and evicted from memory
3861          * and then re-read we need to do a full sync since we don't have any
3862          * idea about which extents were modified before we were evicted from
3863          * cache.
3864          *
3865          * This is required for both inode re-read from disk and delayed inode
3866          * in delayed_nodes_tree.
3867          */
3868         if (BTRFS_I(inode)->last_trans == fs_info->generation)
3869                 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3870                         &BTRFS_I(inode)->runtime_flags);
3871
3872         /*
3873          * We don't persist the id of the transaction where an unlink operation
3874          * against the inode was last made. So here we assume the inode might
3875          * have been evicted, and therefore the exact value of last_unlink_trans
3876          * lost, and set it to last_trans to avoid metadata inconsistencies
3877          * between the inode and its parent if the inode is fsync'ed and the log
3878          * replayed. For example, in the scenario:
3879          *
3880          * touch mydir/foo
3881          * ln mydir/foo mydir/bar
3882          * sync
3883          * unlink mydir/bar
3884          * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3885          * xfs_io -c fsync mydir/foo
3886          * <power failure>
3887          * mount fs, triggers fsync log replay
3888          *
3889          * We must make sure that when we fsync our inode foo we also log its
3890          * parent inode, otherwise after log replay the parent still has the
3891          * dentry with the "bar" name but our inode foo has a link count of 1
3892          * and doesn't have an inode ref with the name "bar" anymore.
3893          *
3894          * Setting last_unlink_trans to last_trans is a pessimistic approach,
3895          * but it guarantees correctness at the expense of occasional full
3896          * transaction commits on fsync if our inode is a directory, or if our
3897          * inode is not a directory, logging its parent unnecessarily.
3898          */
3899         BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3900
3901         /*
3902          * Same logic as for last_unlink_trans. We don't persist the generation
3903          * of the last transaction where this inode was used for a reflink
3904          * operation, so after eviction and reloading the inode we must be
3905          * pessimistic and assume the last transaction that modified the inode.
3906          */
3907         BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3908
3909         path->slots[0]++;
3910         if (inode->i_nlink != 1 ||
3911             path->slots[0] >= btrfs_header_nritems(leaf))
3912                 goto cache_acl;
3913
3914         btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3915         if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3916                 goto cache_acl;
3917
3918         ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3919         if (location.type == BTRFS_INODE_REF_KEY) {
3920                 struct btrfs_inode_ref *ref;
3921
3922                 ref = (struct btrfs_inode_ref *)ptr;
3923                 BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3924         } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3925                 struct btrfs_inode_extref *extref;
3926
3927                 extref = (struct btrfs_inode_extref *)ptr;
3928                 BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3929                                                                      extref);
3930         }
3931 cache_acl:
3932         /*
3933          * try to precache a NULL acl entry for files that don't have
3934          * any xattrs or acls
3935          */
3936         maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3937                         btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3938         if (first_xattr_slot != -1) {
3939                 path->slots[0] = first_xattr_slot;
3940                 ret = btrfs_load_inode_props(inode, path);
3941                 if (ret)
3942                         btrfs_err(fs_info,
3943                                   "error loading props for ino %llu (root %llu): %d",
3944                                   btrfs_ino(BTRFS_I(inode)),
3945                                   root->root_key.objectid, ret);
3946         }
3947         if (path != in_path)
3948                 btrfs_free_path(path);
3949
3950         if (!maybe_acls)
3951                 cache_no_acl(inode);
3952
3953         switch (inode->i_mode & S_IFMT) {
3954         case S_IFREG:
3955                 inode->i_mapping->a_ops = &btrfs_aops;
3956                 inode->i_fop = &btrfs_file_operations;
3957                 inode->i_op = &btrfs_file_inode_operations;
3958                 break;
3959         case S_IFDIR:
3960                 inode->i_fop = &btrfs_dir_file_operations;
3961                 inode->i_op = &btrfs_dir_inode_operations;
3962                 break;
3963         case S_IFLNK:
3964                 inode->i_op = &btrfs_symlink_inode_operations;
3965                 inode_nohighmem(inode);
3966                 inode->i_mapping->a_ops = &btrfs_aops;
3967                 break;
3968         default:
3969                 inode->i_op = &btrfs_special_inode_operations;
3970                 init_special_inode(inode, inode->i_mode, rdev);
3971                 break;
3972         }
3973
3974         btrfs_sync_inode_flags_to_i_flags(inode);
3975         return 0;
3976 }
3977
3978 /*
3979  * given a leaf and an inode, copy the inode fields into the leaf
3980  */
3981 static void fill_inode_item(struct btrfs_trans_handle *trans,
3982                             struct extent_buffer *leaf,
3983                             struct btrfs_inode_item *item,
3984                             struct inode *inode)
3985 {
3986         struct btrfs_map_token token;
3987         u64 flags;
3988
3989         btrfs_init_map_token(&token, leaf);
3990
3991         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3992         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3993         btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3994         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3995         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3996
3997         btrfs_set_token_timespec_sec(&token, &item->atime,
3998                                      inode->i_atime.tv_sec);
3999         btrfs_set_token_timespec_nsec(&token, &item->atime,
4000                                       inode->i_atime.tv_nsec);
4001
4002         btrfs_set_token_timespec_sec(&token, &item->mtime,
4003                                      inode->i_mtime.tv_sec);
4004         btrfs_set_token_timespec_nsec(&token, &item->mtime,
4005                                       inode->i_mtime.tv_nsec);
4006
4007         btrfs_set_token_timespec_sec(&token, &item->ctime,
4008                                      inode->i_ctime.tv_sec);
4009         btrfs_set_token_timespec_nsec(&token, &item->ctime,
4010                                       inode->i_ctime.tv_nsec);
4011
4012         btrfs_set_token_timespec_sec(&token, &item->otime,
4013                                      BTRFS_I(inode)->i_otime.tv_sec);
4014         btrfs_set_token_timespec_nsec(&token, &item->otime,
4015                                       BTRFS_I(inode)->i_otime.tv_nsec);
4016
4017         btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
4018         btrfs_set_token_inode_generation(&token, item,
4019                                          BTRFS_I(inode)->generation);
4020         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4021         btrfs_set_token_inode_transid(&token, item, trans->transid);
4022         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4023         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4024                                           BTRFS_I(inode)->ro_flags);
4025         btrfs_set_token_inode_flags(&token, item, flags);
4026         btrfs_set_token_inode_block_group(&token, item, 0);
4027 }
4028
4029 /*
4030  * copy everything in the in-memory inode into the btree.
4031  */
4032 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4033                                 struct btrfs_root *root,
4034                                 struct btrfs_inode *inode)
4035 {
4036         struct btrfs_inode_item *inode_item;
4037         struct btrfs_path *path;
4038         struct extent_buffer *leaf;
4039         int ret;
4040
4041         path = btrfs_alloc_path();
4042         if (!path)
4043                 return -ENOMEM;
4044
4045         ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
4046         if (ret) {
4047                 if (ret > 0)
4048                         ret = -ENOENT;
4049                 goto failed;
4050         }
4051
4052         leaf = path->nodes[0];
4053         inode_item = btrfs_item_ptr(leaf, path->slots[0],
4054                                     struct btrfs_inode_item);
4055
4056         fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4057         btrfs_mark_buffer_dirty(leaf);
4058         btrfs_set_inode_last_trans(trans, inode);
4059         ret = 0;
4060 failed:
4061         btrfs_free_path(path);
4062         return ret;
4063 }
4064
4065 /*
4066  * copy everything in the in-memory inode into the btree.
4067  */
4068 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4069                                 struct btrfs_root *root,
4070                                 struct btrfs_inode *inode)
4071 {
4072         struct btrfs_fs_info *fs_info = root->fs_info;
4073         int ret;
4074
4075         /*
4076          * If the inode is a free space inode, we can deadlock during commit
4077          * if we put it into the delayed code.
4078          *
4079          * The data relocation inode should also be directly updated
4080          * without delay
4081          */
4082         if (!btrfs_is_free_space_inode(inode)
4083             && !btrfs_is_data_reloc_root(root)
4084             && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4085                 btrfs_update_root_times(trans, root);
4086
4087                 ret = btrfs_delayed_update_inode(trans, root, inode);
4088                 if (!ret)
4089                         btrfs_set_inode_last_trans(trans, inode);
4090                 return ret;
4091         }
4092
4093         return btrfs_update_inode_item(trans, root, inode);
4094 }
4095
4096 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4097                                 struct btrfs_root *root, struct btrfs_inode *inode)
4098 {
4099         int ret;
4100
4101         ret = btrfs_update_inode(trans, root, inode);
4102         if (ret == -ENOSPC)
4103                 return btrfs_update_inode_item(trans, root, inode);
4104         return ret;
4105 }
4106
4107 /*
4108  * unlink helper that gets used here in inode.c and in the tree logging
4109  * recovery code.  It remove a link in a directory with a given name, and
4110  * also drops the back refs in the inode to the directory
4111  */
4112 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4113                                 struct btrfs_inode *dir,
4114                                 struct btrfs_inode *inode,
4115                                 const struct fscrypt_str *name,
4116                                 struct btrfs_rename_ctx *rename_ctx)
4117 {
4118         struct btrfs_root *root = dir->root;
4119         struct btrfs_fs_info *fs_info = root->fs_info;
4120         struct btrfs_path *path;
4121         int ret = 0;
4122         struct btrfs_dir_item *di;
4123         u64 index;
4124         u64 ino = btrfs_ino(inode);
4125         u64 dir_ino = btrfs_ino(dir);
4126
4127         path = btrfs_alloc_path();
4128         if (!path) {
4129                 ret = -ENOMEM;
4130                 goto out;
4131         }
4132
4133         di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4134         if (IS_ERR_OR_NULL(di)) {
4135                 ret = di ? PTR_ERR(di) : -ENOENT;
4136                 goto err;
4137         }
4138         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4139         if (ret)
4140                 goto err;
4141         btrfs_release_path(path);
4142
4143         /*
4144          * If we don't have dir index, we have to get it by looking up
4145          * the inode ref, since we get the inode ref, remove it directly,
4146          * it is unnecessary to do delayed deletion.
4147          *
4148          * But if we have dir index, needn't search inode ref to get it.
4149          * Since the inode ref is close to the inode item, it is better
4150          * that we delay to delete it, and just do this deletion when
4151          * we update the inode item.
4152          */
4153         if (inode->dir_index) {
4154                 ret = btrfs_delayed_delete_inode_ref(inode);
4155                 if (!ret) {
4156                         index = inode->dir_index;
4157                         goto skip_backref;
4158                 }
4159         }
4160
4161         ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4162         if (ret) {
4163                 btrfs_info(fs_info,
4164                         "failed to delete reference to %.*s, inode %llu parent %llu",
4165                         name->len, name->name, ino, dir_ino);
4166                 btrfs_abort_transaction(trans, ret);
4167                 goto err;
4168         }
4169 skip_backref:
4170         if (rename_ctx)
4171                 rename_ctx->index = index;
4172
4173         ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4174         if (ret) {
4175                 btrfs_abort_transaction(trans, ret);
4176                 goto err;
4177         }
4178
4179         /*
4180          * If we are in a rename context, we don't need to update anything in the
4181          * log. That will be done later during the rename by btrfs_log_new_name().
4182          * Besides that, doing it here would only cause extra unnecessary btree
4183          * operations on the log tree, increasing latency for applications.
4184          */
4185         if (!rename_ctx) {
4186                 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4187                 btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4188         }
4189
4190         /*
4191          * If we have a pending delayed iput we could end up with the final iput
4192          * being run in btrfs-cleaner context.  If we have enough of these built
4193          * up we can end up burning a lot of time in btrfs-cleaner without any
4194          * way to throttle the unlinks.  Since we're currently holding a ref on
4195          * the inode we can run the delayed iput here without any issues as the
4196          * final iput won't be done until after we drop the ref we're currently
4197          * holding.
4198          */
4199         btrfs_run_delayed_iput(fs_info, inode);
4200 err:
4201         btrfs_free_path(path);
4202         if (ret)
4203                 goto out;
4204
4205         btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4206         inode_inc_iversion(&inode->vfs_inode);
4207         inode_inc_iversion(&dir->vfs_inode);
4208         inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4209         dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
4210         dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
4211         ret = btrfs_update_inode(trans, root, dir);
4212 out:
4213         return ret;
4214 }
4215
4216 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4217                        struct btrfs_inode *dir, struct btrfs_inode *inode,
4218                        const struct fscrypt_str *name)
4219 {
4220         int ret;
4221
4222         ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4223         if (!ret) {
4224                 drop_nlink(&inode->vfs_inode);
4225                 ret = btrfs_update_inode(trans, inode->root, inode);
4226         }
4227         return ret;
4228 }
4229
4230 /*
4231  * helper to start transaction for unlink and rmdir.
4232  *
4233  * unlink and rmdir are special in btrfs, they do not always free space, so
4234  * if we cannot make our reservations the normal way try and see if there is
4235  * plenty of slack room in the global reserve to migrate, otherwise we cannot
4236  * allow the unlink to occur.
4237  */
4238 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4239 {
4240         struct btrfs_root *root = dir->root;
4241
4242         return btrfs_start_transaction_fallback_global_rsv(root,
4243                                                    BTRFS_UNLINK_METADATA_UNITS);
4244 }
4245
4246 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4247 {
4248         struct btrfs_trans_handle *trans;
4249         struct inode *inode = d_inode(dentry);
4250         int ret;
4251         struct fscrypt_name fname;
4252
4253         ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4254         if (ret)
4255                 return ret;
4256
4257         /* This needs to handle no-key deletions later on */
4258
4259         trans = __unlink_start_trans(BTRFS_I(dir));
4260         if (IS_ERR(trans)) {
4261                 ret = PTR_ERR(trans);
4262                 goto fscrypt_free;
4263         }
4264
4265         btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4266                                 false);
4267
4268         ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4269                                  &fname.disk_name);
4270         if (ret)
4271                 goto end_trans;
4272
4273         if (inode->i_nlink == 0) {
4274                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4275                 if (ret)
4276                         goto end_trans;
4277         }
4278
4279 end_trans:
4280         btrfs_end_transaction(trans);
4281         btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4282 fscrypt_free:
4283         fscrypt_free_filename(&fname);
4284         return ret;
4285 }
4286
4287 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4288                                struct btrfs_inode *dir, struct dentry *dentry)
4289 {
4290         struct btrfs_root *root = dir->root;
4291         struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4292         struct btrfs_path *path;
4293         struct extent_buffer *leaf;
4294         struct btrfs_dir_item *di;
4295         struct btrfs_key key;
4296         u64 index;
4297         int ret;
4298         u64 objectid;
4299         u64 dir_ino = btrfs_ino(dir);
4300         struct fscrypt_name fname;
4301
4302         ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4303         if (ret)
4304                 return ret;
4305
4306         /* This needs to handle no-key deletions later on */
4307
4308         if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4309                 objectid = inode->root->root_key.objectid;
4310         } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4311                 objectid = inode->location.objectid;
4312         } else {
4313                 WARN_ON(1);
4314                 fscrypt_free_filename(&fname);
4315                 return -EINVAL;
4316         }
4317
4318         path = btrfs_alloc_path();
4319         if (!path) {
4320                 ret = -ENOMEM;
4321                 goto out;
4322         }
4323
4324         di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4325                                    &fname.disk_name, -1);
4326         if (IS_ERR_OR_NULL(di)) {
4327                 ret = di ? PTR_ERR(di) : -ENOENT;
4328                 goto out;
4329         }
4330
4331         leaf = path->nodes[0];
4332         btrfs_dir_item_key_to_cpu(leaf, di, &key);
4333         WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4334         ret = btrfs_delete_one_dir_name(trans, root, path, di);
4335         if (ret) {
4336                 btrfs_abort_transaction(trans, ret);
4337                 goto out;
4338         }
4339         btrfs_release_path(path);
4340
4341         /*
4342          * This is a placeholder inode for a subvolume we didn't have a
4343          * reference to at the time of the snapshot creation.  In the meantime
4344          * we could have renamed the real subvol link into our snapshot, so
4345          * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4346          * Instead simply lookup the dir_index_item for this entry so we can
4347          * remove it.  Otherwise we know we have a ref to the root and we can
4348          * call btrfs_del_root_ref, and it _shouldn't_ fail.
4349          */
4350         if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4351                 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4352                 if (IS_ERR_OR_NULL(di)) {
4353                         if (!di)
4354                                 ret = -ENOENT;
4355                         else
4356                                 ret = PTR_ERR(di);
4357                         btrfs_abort_transaction(trans, ret);
4358                         goto out;
4359                 }
4360
4361                 leaf = path->nodes[0];
4362                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4363                 index = key.offset;
4364                 btrfs_release_path(path);
4365         } else {
4366                 ret = btrfs_del_root_ref(trans, objectid,
4367                                          root->root_key.objectid, dir_ino,
4368                                          &index, &fname.disk_name);
4369                 if (ret) {
4370                         btrfs_abort_transaction(trans, ret);
4371                         goto out;
4372                 }
4373         }
4374
4375         ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4376         if (ret) {
4377                 btrfs_abort_transaction(trans, ret);
4378                 goto out;
4379         }
4380
4381         btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4382         inode_inc_iversion(&dir->vfs_inode);
4383         dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);
4384         dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime;
4385         ret = btrfs_update_inode_fallback(trans, root, dir);
4386         if (ret)
4387                 btrfs_abort_transaction(trans, ret);
4388 out:
4389         btrfs_free_path(path);
4390         fscrypt_free_filename(&fname);
4391         return ret;
4392 }
4393
4394 /*
4395  * Helper to check if the subvolume references other subvolumes or if it's
4396  * default.
4397  */
4398 static noinline int may_destroy_subvol(struct btrfs_root *root)
4399 {
4400         struct btrfs_fs_info *fs_info = root->fs_info;
4401         struct btrfs_path *path;
4402         struct btrfs_dir_item *di;
4403         struct btrfs_key key;
4404         struct fscrypt_str name = FSTR_INIT("default", 7);
4405         u64 dir_id;
4406         int ret;
4407
4408         path = btrfs_alloc_path();
4409         if (!path)
4410                 return -ENOMEM;
4411
4412         /* Make sure this root isn't set as the default subvol */
4413         dir_id = btrfs_super_root_dir(fs_info->super_copy);
4414         di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4415                                    dir_id, &name, 0);
4416         if (di && !IS_ERR(di)) {
4417                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4418                 if (key.objectid == root->root_key.objectid) {
4419                         ret = -EPERM;
4420                         btrfs_err(fs_info,
4421                                   "deleting default subvolume %llu is not allowed",
4422                                   key.objectid);
4423                         goto out;
4424                 }
4425                 btrfs_release_path(path);
4426         }
4427
4428         key.objectid = root->root_key.objectid;
4429         key.type = BTRFS_ROOT_REF_KEY;
4430         key.offset = (u64)-1;
4431
4432         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4433         if (ret < 0)
4434                 goto out;
4435         BUG_ON(ret == 0);
4436
4437         ret = 0;
4438         if (path->slots[0] > 0) {
4439                 path->slots[0]--;
4440                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4441                 if (key.objectid == root->root_key.objectid &&
4442                     key.type == BTRFS_ROOT_REF_KEY)
4443                         ret = -ENOTEMPTY;
4444         }
4445 out:
4446         btrfs_free_path(path);
4447         return ret;
4448 }
4449
4450 /* Delete all dentries for inodes belonging to the root */
4451 static void btrfs_prune_dentries(struct btrfs_root *root)
4452 {
4453         struct btrfs_fs_info *fs_info = root->fs_info;
4454         struct rb_node *node;
4455         struct rb_node *prev;
4456         struct btrfs_inode *entry;
4457         struct inode *inode;
4458         u64 objectid = 0;
4459
4460         if (!BTRFS_FS_ERROR(fs_info))
4461                 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4462
4463         spin_lock(&root->inode_lock);
4464 again:
4465         node = root->inode_tree.rb_node;
4466         prev = NULL;
4467         while (node) {
4468                 prev = node;
4469                 entry = rb_entry(node, struct btrfs_inode, rb_node);
4470
4471                 if (objectid < btrfs_ino(entry))
4472                         node = node->rb_left;
4473                 else if (objectid > btrfs_ino(entry))
4474                         node = node->rb_right;
4475                 else
4476                         break;
4477         }
4478         if (!node) {
4479                 while (prev) {
4480                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
4481                         if (objectid <= btrfs_ino(entry)) {
4482                                 node = prev;
4483                                 break;
4484                         }
4485                         prev = rb_next(prev);
4486                 }
4487         }
4488         while (node) {
4489                 entry = rb_entry(node, struct btrfs_inode, rb_node);
4490                 objectid = btrfs_ino(entry) + 1;
4491                 inode = igrab(&entry->vfs_inode);
4492                 if (inode) {
4493                         spin_unlock(&root->inode_lock);
4494                         if (atomic_read(&inode->i_count) > 1)
4495                                 d_prune_aliases(inode);
4496                         /*
4497                          * btrfs_drop_inode will have it removed from the inode
4498                          * cache when its usage count hits zero.
4499                          */
4500                         iput(inode);
4501                         cond_resched();
4502                         spin_lock(&root->inode_lock);
4503                         goto again;
4504                 }
4505
4506                 if (cond_resched_lock(&root->inode_lock))
4507                         goto again;
4508
4509                 node = rb_next(node);
4510         }
4511         spin_unlock(&root->inode_lock);
4512 }
4513
4514 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4515 {
4516         struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4517         struct btrfs_root *root = dir->root;
4518         struct inode *inode = d_inode(dentry);
4519         struct btrfs_root *dest = BTRFS_I(inode)->root;
4520         struct btrfs_trans_handle *trans;
4521         struct btrfs_block_rsv block_rsv;
4522         u64 root_flags;
4523         int ret;
4524
4525         /*
4526          * Don't allow to delete a subvolume with send in progress. This is
4527          * inside the inode lock so the error handling that has to drop the bit
4528          * again is not run concurrently.
4529          */
4530         spin_lock(&dest->root_item_lock);
4531         if (dest->send_in_progress) {
4532                 spin_unlock(&dest->root_item_lock);
4533                 btrfs_warn(fs_info,
4534                            "attempt to delete subvolume %llu during send",
4535                            dest->root_key.objectid);
4536                 return -EPERM;
4537         }
4538         if (atomic_read(&dest->nr_swapfiles)) {
4539                 spin_unlock(&dest->root_item_lock);
4540                 btrfs_warn(fs_info,
4541                            "attempt to delete subvolume %llu with active swapfile",
4542                            root->root_key.objectid);
4543                 return -EPERM;
4544         }
4545         root_flags = btrfs_root_flags(&dest->root_item);
4546         btrfs_set_root_flags(&dest->root_item,
4547                              root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4548         spin_unlock(&dest->root_item_lock);
4549
4550         down_write(&fs_info->subvol_sem);
4551
4552         ret = may_destroy_subvol(dest);
4553         if (ret)
4554                 goto out_up_write;
4555
4556         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4557         /*
4558          * One for dir inode,
4559          * two for dir entries,
4560          * two for root ref/backref.
4561          */
4562         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4563         if (ret)
4564                 goto out_up_write;
4565
4566         trans = btrfs_start_transaction(root, 0);
4567         if (IS_ERR(trans)) {
4568                 ret = PTR_ERR(trans);
4569                 goto out_release;
4570         }
4571         trans->block_rsv = &block_rsv;
4572         trans->bytes_reserved = block_rsv.size;
4573
4574         btrfs_record_snapshot_destroy(trans, dir);
4575
4576         ret = btrfs_unlink_subvol(trans, dir, dentry);
4577         if (ret) {
4578                 btrfs_abort_transaction(trans, ret);
4579                 goto out_end_trans;
4580         }
4581
4582         ret = btrfs_record_root_in_trans(trans, dest);
4583         if (ret) {
4584                 btrfs_abort_transaction(trans, ret);
4585                 goto out_end_trans;
4586         }
4587
4588         memset(&dest->root_item.drop_progress, 0,
4589                 sizeof(dest->root_item.drop_progress));
4590         btrfs_set_root_drop_level(&dest->root_item, 0);
4591         btrfs_set_root_refs(&dest->root_item, 0);
4592
4593         if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4594                 ret = btrfs_insert_orphan_item(trans,
4595                                         fs_info->tree_root,
4596                                         dest->root_key.objectid);
4597                 if (ret) {
4598                         btrfs_abort_transaction(trans, ret);
4599                         goto out_end_trans;
4600                 }
4601         }
4602
4603         ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4604                                   BTRFS_UUID_KEY_SUBVOL,
4605                                   dest->root_key.objectid);
4606         if (ret && ret != -ENOENT) {
4607                 btrfs_abort_transaction(trans, ret);
4608                 goto out_end_trans;
4609         }
4610         if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4611                 ret = btrfs_uuid_tree_remove(trans,
4612                                           dest->root_item.received_uuid,
4613                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4614                                           dest->root_key.objectid);
4615                 if (ret && ret != -ENOENT) {
4616                         btrfs_abort_transaction(trans, ret);
4617                         goto out_end_trans;
4618                 }
4619         }
4620
4621         free_anon_bdev(dest->anon_dev);
4622         dest->anon_dev = 0;
4623 out_end_trans:
4624         trans->block_rsv = NULL;
4625         trans->bytes_reserved = 0;
4626         ret = btrfs_end_transaction(trans);
4627         inode->i_flags |= S_DEAD;
4628 out_release:
4629         btrfs_subvolume_release_metadata(root, &block_rsv);
4630 out_up_write:
4631         up_write(&fs_info->subvol_sem);
4632         if (ret) {
4633                 spin_lock(&dest->root_item_lock);
4634                 root_flags = btrfs_root_flags(&dest->root_item);
4635                 btrfs_set_root_flags(&dest->root_item,
4636                                 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4637                 spin_unlock(&dest->root_item_lock);
4638         } else {
4639                 d_invalidate(dentry);
4640                 btrfs_prune_dentries(dest);
4641                 ASSERT(dest->send_in_progress == 0);
4642         }
4643
4644         return ret;
4645 }
4646
4647 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4648 {
4649         struct inode *inode = d_inode(dentry);
4650         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4651         int err = 0;
4652         struct btrfs_trans_handle *trans;
4653         u64 last_unlink_trans;
4654         struct fscrypt_name fname;
4655
4656         if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4657                 return -ENOTEMPTY;
4658         if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4659                 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4660                         btrfs_err(fs_info,
4661                         "extent tree v2 doesn't support snapshot deletion yet");
4662                         return -EOPNOTSUPP;
4663                 }
4664                 return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4665         }
4666
4667         err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4668         if (err)
4669                 return err;
4670
4671         /* This needs to handle no-key deletions later on */
4672
4673         trans = __unlink_start_trans(BTRFS_I(dir));
4674         if (IS_ERR(trans)) {
4675                 err = PTR_ERR(trans);
4676                 goto out_notrans;
4677         }
4678
4679         if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4680                 err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4681                 goto out;
4682         }
4683
4684         err = btrfs_orphan_add(trans, BTRFS_I(inode));
4685         if (err)
4686                 goto out;
4687
4688         last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4689
4690         /* now the directory is empty */
4691         err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4692                                  &fname.disk_name);
4693         if (!err) {
4694                 btrfs_i_size_write(BTRFS_I(inode), 0);
4695                 /*
4696                  * Propagate the last_unlink_trans value of the deleted dir to
4697                  * its parent directory. This is to prevent an unrecoverable
4698                  * log tree in the case we do something like this:
4699                  * 1) create dir foo
4700                  * 2) create snapshot under dir foo
4701                  * 3) delete the snapshot
4702                  * 4) rmdir foo
4703                  * 5) mkdir foo
4704                  * 6) fsync foo or some file inside foo
4705                  */
4706                 if (last_unlink_trans >= trans->transid)
4707                         BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4708         }
4709 out:
4710         btrfs_end_transaction(trans);
4711 out_notrans:
4712         btrfs_btree_balance_dirty(fs_info);
4713         fscrypt_free_filename(&fname);
4714
4715         return err;
4716 }
4717
4718 /*
4719  * btrfs_truncate_block - read, zero a chunk and write a block
4720  * @inode - inode that we're zeroing
4721  * @from - the offset to start zeroing
4722  * @len - the length to zero, 0 to zero the entire range respective to the
4723  *      offset
4724  * @front - zero up to the offset instead of from the offset on
4725  *
4726  * This will find the block for the "from" offset and cow the block and zero the
4727  * part we want to zero.  This is used with truncate and hole punching.
4728  */
4729 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4730                          int front)
4731 {
4732         struct btrfs_fs_info *fs_info = inode->root->fs_info;
4733         struct address_space *mapping = inode->vfs_inode.i_mapping;
4734         struct extent_io_tree *io_tree = &inode->io_tree;
4735         struct btrfs_ordered_extent *ordered;
4736         struct extent_state *cached_state = NULL;
4737         struct extent_changeset *data_reserved = NULL;
4738         bool only_release_metadata = false;
4739         u32 blocksize = fs_info->sectorsize;
4740         pgoff_t index = from >> PAGE_SHIFT;
4741         unsigned offset = from & (blocksize - 1);
4742         struct page *page;
4743         gfp_t mask = btrfs_alloc_write_mask(mapping);
4744         size_t write_bytes = blocksize;
4745         int ret = 0;
4746         u64 block_start;
4747         u64 block_end;
4748
4749         if (IS_ALIGNED(offset, blocksize) &&
4750             (!len || IS_ALIGNED(len, blocksize)))
4751                 goto out;
4752
4753         block_start = round_down(from, blocksize);
4754         block_end = block_start + blocksize - 1;
4755
4756         ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4757                                           blocksize, false);
4758         if (ret < 0) {
4759                 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4760                         /* For nocow case, no need to reserve data space */
4761                         only_release_metadata = true;
4762                 } else {
4763                         goto out;
4764                 }
4765         }
4766         ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4767         if (ret < 0) {
4768                 if (!only_release_metadata)
4769                         btrfs_free_reserved_data_space(inode, data_reserved,
4770                                                        block_start, blocksize);
4771                 goto out;
4772         }
4773 again:
4774         page = find_or_create_page(mapping, index, mask);
4775         if (!page) {
4776                 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4777                                              blocksize, true);
4778                 btrfs_delalloc_release_extents(inode, blocksize);
4779                 ret = -ENOMEM;
4780                 goto out;
4781         }
4782
4783         if (!PageUptodate(page)) {
4784                 ret = btrfs_read_folio(NULL, page_folio(page));
4785                 lock_page(page);
4786                 if (page->mapping != mapping) {
4787                         unlock_page(page);
4788                         put_page(page);
4789                         goto again;
4790                 }
4791                 if (!PageUptodate(page)) {
4792                         ret = -EIO;
4793                         goto out_unlock;
4794                 }
4795         }
4796
4797         /*
4798          * We unlock the page after the io is completed and then re-lock it
4799          * above.  release_folio() could have come in between that and cleared
4800          * PagePrivate(), but left the page in the mapping.  Set the page mapped
4801          * here to make sure it's properly set for the subpage stuff.
4802          */
4803         ret = set_page_extent_mapped(page);
4804         if (ret < 0)
4805                 goto out_unlock;
4806
4807         wait_on_page_writeback(page);
4808
4809         lock_extent(io_tree, block_start, block_end, &cached_state);
4810
4811         ordered = btrfs_lookup_ordered_extent(inode, block_start);
4812         if (ordered) {
4813                 unlock_extent(io_tree, block_start, block_end, &cached_state);
4814                 unlock_page(page);
4815                 put_page(page);
4816                 btrfs_start_ordered_extent(ordered);
4817                 btrfs_put_ordered_extent(ordered);
4818                 goto again;
4819         }
4820
4821         clear_extent_bit(&inode->io_tree, block_start, block_end,
4822                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4823                          &cached_state);
4824
4825         ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4826                                         &cached_state);
4827         if (ret) {
4828                 unlock_extent(io_tree, block_start, block_end, &cached_state);
4829                 goto out_unlock;
4830         }
4831
4832         if (offset != blocksize) {
4833                 if (!len)
4834                         len = blocksize - offset;
4835                 if (front)
4836                         memzero_page(page, (block_start - page_offset(page)),
4837                                      offset);
4838                 else
4839                         memzero_page(page, (block_start - page_offset(page)) + offset,
4840                                      len);
4841         }
4842         btrfs_page_clear_checked(fs_info, page, block_start,
4843                                  block_end + 1 - block_start);
4844         btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4845         unlock_extent(io_tree, block_start, block_end, &cached_state);
4846
4847         if (only_release_metadata)
4848                 set_extent_bit(&inode->io_tree, block_start, block_end,
4849                                EXTENT_NORESERVE, NULL);
4850
4851 out_unlock:
4852         if (ret) {
4853                 if (only_release_metadata)
4854                         btrfs_delalloc_release_metadata(inode, blocksize, true);
4855                 else
4856                         btrfs_delalloc_release_space(inode, data_reserved,
4857                                         block_start, blocksize, true);
4858         }
4859         btrfs_delalloc_release_extents(inode, blocksize);
4860         unlock_page(page);
4861         put_page(page);
4862 out:
4863         if (only_release_metadata)
4864                 btrfs_check_nocow_unlock(inode);
4865         extent_changeset_free(data_reserved);
4866         return ret;
4867 }
4868
4869 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4870                              u64 offset, u64 len)
4871 {
4872         struct btrfs_fs_info *fs_info = root->fs_info;
4873         struct btrfs_trans_handle *trans;
4874         struct btrfs_drop_extents_args drop_args = { 0 };
4875         int ret;
4876
4877         /*
4878          * If NO_HOLES is enabled, we don't need to do anything.
4879          * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4880          * or btrfs_update_inode() will be called, which guarantee that the next
4881          * fsync will know this inode was changed and needs to be logged.
4882          */
4883         if (btrfs_fs_incompat(fs_info, NO_HOLES))
4884                 return 0;
4885
4886         /*
4887          * 1 - for the one we're dropping
4888          * 1 - for the one we're adding
4889          * 1 - for updating the inode.
4890          */
4891         trans = btrfs_start_transaction(root, 3);
4892         if (IS_ERR(trans))
4893                 return PTR_ERR(trans);
4894
4895         drop_args.start = offset;
4896         drop_args.end = offset + len;
4897         drop_args.drop_cache = true;
4898
4899         ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4900         if (ret) {
4901                 btrfs_abort_transaction(trans, ret);
4902                 btrfs_end_transaction(trans);
4903                 return ret;
4904         }
4905
4906         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4907         if (ret) {
4908                 btrfs_abort_transaction(trans, ret);
4909         } else {
4910                 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4911                 btrfs_update_inode(trans, root, inode);
4912         }
4913         btrfs_end_transaction(trans);
4914         return ret;
4915 }
4916
4917 /*
4918  * This function puts in dummy file extents for the area we're creating a hole
4919  * for.  So if we are truncating this file to a larger size we need to insert
4920  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4921  * the range between oldsize and size
4922  */
4923 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4924 {
4925         struct btrfs_root *root = inode->root;
4926         struct btrfs_fs_info *fs_info = root->fs_info;
4927         struct extent_io_tree *io_tree = &inode->io_tree;
4928         struct extent_map *em = NULL;
4929         struct extent_state *cached_state = NULL;
4930         u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4931         u64 block_end = ALIGN(size, fs_info->sectorsize);
4932         u64 last_byte;
4933         u64 cur_offset;
4934         u64 hole_size;
4935         int err = 0;
4936
4937         /*
4938          * If our size started in the middle of a block we need to zero out the
4939          * rest of the block before we expand the i_size, otherwise we could
4940          * expose stale data.
4941          */
4942         err = btrfs_truncate_block(inode, oldsize, 0, 0);
4943         if (err)
4944                 return err;
4945
4946         if (size <= hole_start)
4947                 return 0;
4948
4949         btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4950                                            &cached_state);
4951         cur_offset = hole_start;
4952         while (1) {
4953                 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4954                                       block_end - cur_offset);
4955                 if (IS_ERR(em)) {
4956                         err = PTR_ERR(em);
4957                         em = NULL;
4958                         break;
4959                 }
4960                 last_byte = min(extent_map_end(em), block_end);
4961                 last_byte = ALIGN(last_byte, fs_info->sectorsize);
4962                 hole_size = last_byte - cur_offset;
4963
4964                 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4965                         struct extent_map *hole_em;
4966
4967                         err = maybe_insert_hole(root, inode, cur_offset,
4968                                                 hole_size);
4969                         if (err)
4970                                 break;
4971
4972                         err = btrfs_inode_set_file_extent_range(inode,
4973                                                         cur_offset, hole_size);
4974                         if (err)
4975                                 break;
4976
4977                         hole_em = alloc_extent_map();
4978                         if (!hole_em) {
4979                                 btrfs_drop_extent_map_range(inode, cur_offset,
4980                                                     cur_offset + hole_size - 1,
4981                                                     false);
4982                                 btrfs_set_inode_full_sync(inode);
4983                                 goto next;
4984                         }
4985                         hole_em->start = cur_offset;
4986                         hole_em->len = hole_size;
4987                         hole_em->orig_start = cur_offset;
4988
4989                         hole_em->block_start = EXTENT_MAP_HOLE;
4990                         hole_em->block_len = 0;
4991                         hole_em->orig_block_len = 0;
4992                         hole_em->ram_bytes = hole_size;
4993                         hole_em->compress_type = BTRFS_COMPRESS_NONE;
4994                         hole_em->generation = fs_info->generation;
4995
4996                         err = btrfs_replace_extent_map_range(inode, hole_em, true);
4997                         free_extent_map(hole_em);
4998                 } else {
4999                         err = btrfs_inode_set_file_extent_range(inode,
5000                                                         cur_offset, hole_size);
5001                         if (err)
5002                                 break;
5003                 }
5004 next:
5005                 free_extent_map(em);
5006                 em = NULL;
5007                 cur_offset = last_byte;
5008                 if (cur_offset >= block_end)
5009                         break;
5010         }
5011         free_extent_map(em);
5012         unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5013         return err;
5014 }
5015
5016 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5017 {
5018         struct btrfs_root *root = BTRFS_I(inode)->root;
5019         struct btrfs_trans_handle *trans;
5020         loff_t oldsize = i_size_read(inode);
5021         loff_t newsize = attr->ia_size;
5022         int mask = attr->ia_valid;
5023         int ret;
5024
5025         /*
5026          * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5027          * special case where we need to update the times despite not having
5028          * these flags set.  For all other operations the VFS set these flags
5029          * explicitly if it wants a timestamp update.
5030          */
5031         if (newsize != oldsize) {
5032                 inode_inc_iversion(inode);
5033                 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5034                         inode->i_mtime = current_time(inode);
5035                         inode->i_ctime = inode->i_mtime;
5036                 }
5037         }
5038
5039         if (newsize > oldsize) {
5040                 /*
5041                  * Don't do an expanding truncate while snapshotting is ongoing.
5042                  * This is to ensure the snapshot captures a fully consistent
5043                  * state of this file - if the snapshot captures this expanding
5044                  * truncation, it must capture all writes that happened before
5045                  * this truncation.
5046                  */
5047                 btrfs_drew_write_lock(&root->snapshot_lock);
5048                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5049                 if (ret) {
5050                         btrfs_drew_write_unlock(&root->snapshot_lock);
5051                         return ret;
5052                 }
5053
5054                 trans = btrfs_start_transaction(root, 1);
5055                 if (IS_ERR(trans)) {
5056                         btrfs_drew_write_unlock(&root->snapshot_lock);
5057                         return PTR_ERR(trans);
5058                 }
5059
5060                 i_size_write(inode, newsize);
5061                 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5062                 pagecache_isize_extended(inode, oldsize, newsize);
5063                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5064                 btrfs_drew_write_unlock(&root->snapshot_lock);
5065                 btrfs_end_transaction(trans);
5066         } else {
5067                 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5068
5069                 if (btrfs_is_zoned(fs_info)) {
5070                         ret = btrfs_wait_ordered_range(inode,
5071                                         ALIGN(newsize, fs_info->sectorsize),
5072                                         (u64)-1);
5073                         if (ret)
5074                                 return ret;
5075                 }
5076
5077                 /*
5078                  * We're truncating a file that used to have good data down to
5079                  * zero. Make sure any new writes to the file get on disk
5080                  * on close.
5081                  */
5082                 if (newsize == 0)
5083                         set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5084                                 &BTRFS_I(inode)->runtime_flags);
5085
5086                 truncate_setsize(inode, newsize);
5087
5088                 inode_dio_wait(inode);
5089
5090                 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5091                 if (ret && inode->i_nlink) {
5092                         int err;
5093
5094                         /*
5095                          * Truncate failed, so fix up the in-memory size. We
5096                          * adjusted disk_i_size down as we removed extents, so
5097                          * wait for disk_i_size to be stable and then update the
5098                          * in-memory size to match.
5099                          */
5100                         err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5101                         if (err)
5102                                 return err;
5103                         i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5104                 }
5105         }
5106
5107         return ret;
5108 }
5109
5110 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5111                          struct iattr *attr)
5112 {
5113         struct inode *inode = d_inode(dentry);
5114         struct btrfs_root *root = BTRFS_I(inode)->root;
5115         int err;
5116
5117         if (btrfs_root_readonly(root))
5118                 return -EROFS;
5119
5120         err = setattr_prepare(idmap, dentry, attr);
5121         if (err)
5122                 return err;
5123
5124         if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5125                 err = btrfs_setsize(inode, attr);
5126                 if (err)
5127                         return err;
5128         }
5129
5130         if (attr->ia_valid) {
5131                 setattr_copy(idmap, inode, attr);
5132                 inode_inc_iversion(inode);
5133                 err = btrfs_dirty_inode(BTRFS_I(inode));
5134
5135                 if (!err && attr->ia_valid & ATTR_MODE)
5136                         err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5137         }
5138
5139         return err;
5140 }
5141
5142 /*
5143  * While truncating the inode pages during eviction, we get the VFS
5144  * calling btrfs_invalidate_folio() against each folio of the inode. This
5145  * is slow because the calls to btrfs_invalidate_folio() result in a
5146  * huge amount of calls to lock_extent() and clear_extent_bit(),
5147  * which keep merging and splitting extent_state structures over and over,
5148  * wasting lots of time.
5149  *
5150  * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5151  * skip all those expensive operations on a per folio basis and do only
5152  * the ordered io finishing, while we release here the extent_map and
5153  * extent_state structures, without the excessive merging and splitting.
5154  */
5155 static void evict_inode_truncate_pages(struct inode *inode)
5156 {
5157         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5158         struct rb_node *node;
5159
5160         ASSERT(inode->i_state & I_FREEING);
5161         truncate_inode_pages_final(&inode->i_data);
5162
5163         btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5164
5165         /*
5166          * Keep looping until we have no more ranges in the io tree.
5167          * We can have ongoing bios started by readahead that have
5168          * their endio callback (extent_io.c:end_bio_extent_readpage)
5169          * still in progress (unlocked the pages in the bio but did not yet
5170          * unlocked the ranges in the io tree). Therefore this means some
5171          * ranges can still be locked and eviction started because before
5172          * submitting those bios, which are executed by a separate task (work
5173          * queue kthread), inode references (inode->i_count) were not taken
5174          * (which would be dropped in the end io callback of each bio).
5175          * Therefore here we effectively end up waiting for those bios and
5176          * anyone else holding locked ranges without having bumped the inode's
5177          * reference count - if we don't do it, when they access the inode's
5178          * io_tree to unlock a range it may be too late, leading to an
5179          * use-after-free issue.
5180          */
5181         spin_lock(&io_tree->lock);
5182         while (!RB_EMPTY_ROOT(&io_tree->state)) {
5183                 struct extent_state *state;
5184                 struct extent_state *cached_state = NULL;
5185                 u64 start;
5186                 u64 end;
5187                 unsigned state_flags;
5188
5189                 node = rb_first(&io_tree->state);
5190                 state = rb_entry(node, struct extent_state, rb_node);
5191                 start = state->start;
5192                 end = state->end;
5193                 state_flags = state->state;
5194                 spin_unlock(&io_tree->lock);
5195
5196                 lock_extent(io_tree, start, end, &cached_state);
5197
5198                 /*
5199                  * If still has DELALLOC flag, the extent didn't reach disk,
5200                  * and its reserved space won't be freed by delayed_ref.
5201                  * So we need to free its reserved space here.
5202                  * (Refer to comment in btrfs_invalidate_folio, case 2)
5203                  *
5204                  * Note, end is the bytenr of last byte, so we need + 1 here.
5205                  */
5206                 if (state_flags & EXTENT_DELALLOC)
5207                         btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5208                                                end - start + 1);
5209
5210                 clear_extent_bit(io_tree, start, end,
5211                                  EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5212                                  &cached_state);
5213
5214                 cond_resched();
5215                 spin_lock(&io_tree->lock);
5216         }
5217         spin_unlock(&io_tree->lock);
5218 }
5219
5220 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5221                                                         struct btrfs_block_rsv *rsv)
5222 {
5223         struct btrfs_fs_info *fs_info = root->fs_info;
5224         struct btrfs_trans_handle *trans;
5225         u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5226         int ret;
5227
5228         /*
5229          * Eviction should be taking place at some place safe because of our
5230          * delayed iputs.  However the normal flushing code will run delayed
5231          * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5232          *
5233          * We reserve the delayed_refs_extra here again because we can't use
5234          * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5235          * above.  We reserve our extra bit here because we generate a ton of
5236          * delayed refs activity by truncating.
5237          *
5238          * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5239          * if we fail to make this reservation we can re-try without the
5240          * delayed_refs_extra so we can make some forward progress.
5241          */
5242         ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5243                                      BTRFS_RESERVE_FLUSH_EVICT);
5244         if (ret) {
5245                 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5246                                              BTRFS_RESERVE_FLUSH_EVICT);
5247                 if (ret) {
5248                         btrfs_warn(fs_info,
5249                                    "could not allocate space for delete; will truncate on mount");
5250                         return ERR_PTR(-ENOSPC);
5251                 }
5252                 delayed_refs_extra = 0;
5253         }
5254
5255         trans = btrfs_join_transaction(root);
5256         if (IS_ERR(trans))
5257                 return trans;
5258
5259         if (delayed_refs_extra) {
5260                 trans->block_rsv = &fs_info->trans_block_rsv;
5261                 trans->bytes_reserved = delayed_refs_extra;
5262                 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5263                                         delayed_refs_extra, true);
5264         }
5265         return trans;
5266 }
5267
5268 void btrfs_evict_inode(struct inode *inode)
5269 {
5270         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5271         struct btrfs_trans_handle *trans;
5272         struct btrfs_root *root = BTRFS_I(inode)->root;
5273         struct btrfs_block_rsv *rsv = NULL;
5274         int ret;
5275
5276         trace_btrfs_inode_evict(inode);
5277
5278         if (!root) {
5279                 fsverity_cleanup_inode(inode);
5280                 clear_inode(inode);
5281                 return;
5282         }
5283
5284         evict_inode_truncate_pages(inode);
5285
5286         if (inode->i_nlink &&
5287             ((btrfs_root_refs(&root->root_item) != 0 &&
5288               root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5289              btrfs_is_free_space_inode(BTRFS_I(inode))))
5290                 goto out;
5291
5292         if (is_bad_inode(inode))
5293                 goto out;
5294
5295         if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5296                 goto out;
5297
5298         if (inode->i_nlink > 0) {
5299                 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5300                        root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5301                 goto out;
5302         }
5303
5304         /*
5305          * This makes sure the inode item in tree is uptodate and the space for
5306          * the inode update is released.
5307          */
5308         ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5309         if (ret)
5310                 goto out;
5311
5312         /*
5313          * This drops any pending insert or delete operations we have for this
5314          * inode.  We could have a delayed dir index deletion queued up, but
5315          * we're removing the inode completely so that'll be taken care of in
5316          * the truncate.
5317          */
5318         btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5319
5320         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5321         if (!rsv)
5322                 goto out;
5323         rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5324         rsv->failfast = true;
5325
5326         btrfs_i_size_write(BTRFS_I(inode), 0);
5327
5328         while (1) {
5329                 struct btrfs_truncate_control control = {
5330                         .inode = BTRFS_I(inode),
5331                         .ino = btrfs_ino(BTRFS_I(inode)),
5332                         .new_size = 0,
5333                         .min_type = 0,
5334                 };
5335
5336                 trans = evict_refill_and_join(root, rsv);
5337                 if (IS_ERR(trans))
5338                         goto out;
5339
5340                 trans->block_rsv = rsv;
5341
5342                 ret = btrfs_truncate_inode_items(trans, root, &control);
5343                 trans->block_rsv = &fs_info->trans_block_rsv;
5344                 btrfs_end_transaction(trans);
5345                 /*
5346                  * We have not added new delayed items for our inode after we
5347                  * have flushed its delayed items, so no need to throttle on
5348                  * delayed items. However we have modified extent buffers.
5349                  */
5350                 btrfs_btree_balance_dirty_nodelay(fs_info);
5351                 if (ret && ret != -ENOSPC && ret != -EAGAIN)
5352                         goto out;
5353                 else if (!ret)
5354                         break;
5355         }
5356
5357         /*
5358          * Errors here aren't a big deal, it just means we leave orphan items in
5359          * the tree. They will be cleaned up on the next mount. If the inode
5360          * number gets reused, cleanup deletes the orphan item without doing
5361          * anything, and unlink reuses the existing orphan item.
5362          *
5363          * If it turns out that we are dropping too many of these, we might want
5364          * to add a mechanism for retrying these after a commit.
5365          */
5366         trans = evict_refill_and_join(root, rsv);
5367         if (!IS_ERR(trans)) {
5368                 trans->block_rsv = rsv;
5369                 btrfs_orphan_del(trans, BTRFS_I(inode));
5370                 trans->block_rsv = &fs_info->trans_block_rsv;
5371                 btrfs_end_transaction(trans);
5372         }
5373
5374 out:
5375         btrfs_free_block_rsv(fs_info, rsv);
5376         /*
5377          * If we didn't successfully delete, the orphan item will still be in
5378          * the tree and we'll retry on the next mount. Again, we might also want
5379          * to retry these periodically in the future.
5380          */
5381         btrfs_remove_delayed_node(BTRFS_I(inode));
5382         fsverity_cleanup_inode(inode);
5383         clear_inode(inode);
5384 }
5385
5386 /*
5387  * Return the key found in the dir entry in the location pointer, fill @type
5388  * with BTRFS_FT_*, and return 0.
5389  *
5390  * If no dir entries were found, returns -ENOENT.
5391  * If found a corrupted location in dir entry, returns -EUCLEAN.
5392  */
5393 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5394                                struct btrfs_key *location, u8 *type)
5395 {
5396         struct btrfs_dir_item *di;
5397         struct btrfs_path *path;
5398         struct btrfs_root *root = dir->root;
5399         int ret = 0;
5400         struct fscrypt_name fname;
5401
5402         path = btrfs_alloc_path();
5403         if (!path)
5404                 return -ENOMEM;
5405
5406         ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5407         if (ret < 0)
5408                 goto out;
5409         /*
5410          * fscrypt_setup_filename() should never return a positive value, but
5411          * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5412          */
5413         ASSERT(ret == 0);
5414
5415         /* This needs to handle no-key deletions later on */
5416
5417         di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5418                                    &fname.disk_name, 0);
5419         if (IS_ERR_OR_NULL(di)) {
5420                 ret = di ? PTR_ERR(di) : -ENOENT;
5421                 goto out;
5422         }
5423
5424         btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5425         if (location->type != BTRFS_INODE_ITEM_KEY &&
5426             location->type != BTRFS_ROOT_ITEM_KEY) {
5427                 ret = -EUCLEAN;
5428                 btrfs_warn(root->fs_info,
5429 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5430                            __func__, fname.disk_name.name, btrfs_ino(dir),
5431                            location->objectid, location->type, location->offset);
5432         }
5433         if (!ret)
5434                 *type = btrfs_dir_ftype(path->nodes[0], di);
5435 out:
5436         fscrypt_free_filename(&fname);
5437         btrfs_free_path(path);
5438         return ret;
5439 }
5440
5441 /*
5442  * when we hit a tree root in a directory, the btrfs part of the inode
5443  * needs to be changed to reflect the root directory of the tree root.  This
5444  * is kind of like crossing a mount point.
5445  */
5446 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5447                                     struct btrfs_inode *dir,
5448                                     struct dentry *dentry,
5449                                     struct btrfs_key *location,
5450                                     struct btrfs_root **sub_root)
5451 {
5452         struct btrfs_path *path;
5453         struct btrfs_root *new_root;
5454         struct btrfs_root_ref *ref;
5455         struct extent_buffer *leaf;
5456         struct btrfs_key key;
5457         int ret;
5458         int err = 0;
5459         struct fscrypt_name fname;
5460
5461         ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5462         if (ret)
5463                 return ret;
5464
5465         path = btrfs_alloc_path();
5466         if (!path) {
5467                 err = -ENOMEM;
5468                 goto out;
5469         }
5470
5471         err = -ENOENT;
5472         key.objectid = dir->root->root_key.objectid;
5473         key.type = BTRFS_ROOT_REF_KEY;
5474         key.offset = location->objectid;
5475
5476         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5477         if (ret) {
5478                 if (ret < 0)
5479                         err = ret;
5480                 goto out;
5481         }
5482
5483         leaf = path->nodes[0];
5484         ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5485         if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5486             btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5487                 goto out;
5488
5489         ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5490                                    (unsigned long)(ref + 1), fname.disk_name.len);
5491         if (ret)
5492                 goto out;
5493
5494         btrfs_release_path(path);
5495
5496         new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5497         if (IS_ERR(new_root)) {
5498                 err = PTR_ERR(new_root);
5499                 goto out;
5500         }
5501
5502         *sub_root = new_root;
5503         location->objectid = btrfs_root_dirid(&new_root->root_item);
5504         location->type = BTRFS_INODE_ITEM_KEY;
5505         location->offset = 0;
5506         err = 0;
5507 out:
5508         btrfs_free_path(path);
5509         fscrypt_free_filename(&fname);
5510         return err;
5511 }
5512
5513 static void inode_tree_add(struct btrfs_inode *inode)
5514 {
5515         struct btrfs_root *root = inode->root;
5516         struct btrfs_inode *entry;
5517         struct rb_node **p;
5518         struct rb_node *parent;
5519         struct rb_node *new = &inode->rb_node;
5520         u64 ino = btrfs_ino(inode);
5521
5522         if (inode_unhashed(&inode->vfs_inode))
5523                 return;
5524         parent = NULL;
5525         spin_lock(&root->inode_lock);
5526         p = &root->inode_tree.rb_node;
5527         while (*p) {
5528                 parent = *p;
5529                 entry = rb_entry(parent, struct btrfs_inode, rb_node);
5530
5531                 if (ino < btrfs_ino(entry))
5532                         p = &parent->rb_left;
5533                 else if (ino > btrfs_ino(entry))
5534                         p = &parent->rb_right;
5535                 else {
5536                         WARN_ON(!(entry->vfs_inode.i_state &
5537                                   (I_WILL_FREE | I_FREEING)));
5538                         rb_replace_node(parent, new, &root->inode_tree);
5539                         RB_CLEAR_NODE(parent);
5540                         spin_unlock(&root->inode_lock);
5541                         return;
5542                 }
5543         }
5544         rb_link_node(new, parent, p);
5545         rb_insert_color(new, &root->inode_tree);
5546         spin_unlock(&root->inode_lock);
5547 }
5548
5549 static void inode_tree_del(struct btrfs_inode *inode)
5550 {
5551         struct btrfs_root *root = inode->root;
5552         int empty = 0;
5553
5554         spin_lock(&root->inode_lock);
5555         if (!RB_EMPTY_NODE(&inode->rb_node)) {
5556                 rb_erase(&inode->rb_node, &root->inode_tree);
5557                 RB_CLEAR_NODE(&inode->rb_node);
5558                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5559         }
5560         spin_unlock(&root->inode_lock);
5561
5562         if (empty && btrfs_root_refs(&root->root_item) == 0) {
5563                 spin_lock(&root->inode_lock);
5564                 empty = RB_EMPTY_ROOT(&root->inode_tree);
5565                 spin_unlock(&root->inode_lock);
5566                 if (empty)
5567                         btrfs_add_dead_root(root);
5568         }
5569 }
5570
5571
5572 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5573 {
5574         struct btrfs_iget_args *args = p;
5575
5576         inode->i_ino = args->ino;
5577         BTRFS_I(inode)->location.objectid = args->ino;
5578         BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5579         BTRFS_I(inode)->location.offset = 0;
5580         BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5581         BUG_ON(args->root && !BTRFS_I(inode)->root);
5582
5583         if (args->root && args->root == args->root->fs_info->tree_root &&
5584             args->ino != BTRFS_BTREE_INODE_OBJECTID)
5585                 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5586                         &BTRFS_I(inode)->runtime_flags);
5587         return 0;
5588 }
5589
5590 static int btrfs_find_actor(struct inode *inode, void *opaque)
5591 {
5592         struct btrfs_iget_args *args = opaque;
5593
5594         return args->ino == BTRFS_I(inode)->location.objectid &&
5595                 args->root == BTRFS_I(inode)->root;
5596 }
5597
5598 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5599                                        struct btrfs_root *root)
5600 {
5601         struct inode *inode;
5602         struct btrfs_iget_args args;
5603         unsigned long hashval = btrfs_inode_hash(ino, root);
5604
5605         args.ino = ino;
5606         args.root = root;
5607
5608         inode = iget5_locked(s, hashval, btrfs_find_actor,
5609                              btrfs_init_locked_inode,
5610                              (void *)&args);
5611         return inode;
5612 }
5613
5614 /*
5615  * Get an inode object given its inode number and corresponding root.
5616  * Path can be preallocated to prevent recursing back to iget through
5617  * allocator. NULL is also valid but may require an additional allocation
5618  * later.
5619  */
5620 struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5621                               struct btrfs_root *root, struct btrfs_path *path)
5622 {
5623         struct inode *inode;
5624
5625         inode = btrfs_iget_locked(s, ino, root);
5626         if (!inode)
5627                 return ERR_PTR(-ENOMEM);
5628
5629         if (inode->i_state & I_NEW) {
5630                 int ret;
5631
5632                 ret = btrfs_read_locked_inode(inode, path);
5633                 if (!ret) {
5634                         inode_tree_add(BTRFS_I(inode));
5635                         unlock_new_inode(inode);
5636                 } else {
5637                         iget_failed(inode);
5638                         /*
5639                          * ret > 0 can come from btrfs_search_slot called by
5640                          * btrfs_read_locked_inode, this means the inode item
5641                          * was not found.
5642                          */
5643                         if (ret > 0)
5644                                 ret = -ENOENT;
5645                         inode = ERR_PTR(ret);
5646                 }
5647         }
5648
5649         return inode;
5650 }
5651
5652 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5653 {
5654         return btrfs_iget_path(s, ino, root, NULL);
5655 }
5656
5657 static struct inode *new_simple_dir(struct super_block *s,
5658                                     struct btrfs_key *key,
5659                                     struct btrfs_root *root)
5660 {
5661         struct inode *inode = new_inode(s);
5662
5663         if (!inode)
5664                 return ERR_PTR(-ENOMEM);
5665
5666         BTRFS_I(inode)->root = btrfs_grab_root(root);
5667         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5668         set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5669
5670         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5671         /*
5672          * We only need lookup, the rest is read-only and there's no inode
5673          * associated with the dentry
5674          */
5675         inode->i_op = &simple_dir_inode_operations;
5676         inode->i_opflags &= ~IOP_XATTR;
5677         inode->i_fop = &simple_dir_operations;
5678         inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5679         inode->i_mtime = current_time(inode);
5680         inode->i_atime = inode->i_mtime;
5681         inode->i_ctime = inode->i_mtime;
5682         BTRFS_I(inode)->i_otime = inode->i_mtime;
5683
5684         return inode;
5685 }
5686
5687 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5688 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5689 static_assert(BTRFS_FT_DIR == FT_DIR);
5690 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5691 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5692 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5693 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5694 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5695
5696 static inline u8 btrfs_inode_type(struct inode *inode)
5697 {
5698         return fs_umode_to_ftype(inode->i_mode);
5699 }
5700
5701 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5702 {
5703         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5704         struct inode *inode;
5705         struct btrfs_root *root = BTRFS_I(dir)->root;
5706         struct btrfs_root *sub_root = root;
5707         struct btrfs_key location;
5708         u8 di_type = 0;
5709         int ret = 0;
5710
5711         if (dentry->d_name.len > BTRFS_NAME_LEN)
5712                 return ERR_PTR(-ENAMETOOLONG);
5713
5714         ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5715         if (ret < 0)
5716                 return ERR_PTR(ret);
5717
5718         if (location.type == BTRFS_INODE_ITEM_KEY) {
5719                 inode = btrfs_iget(dir->i_sb, location.objectid, root);
5720                 if (IS_ERR(inode))
5721                         return inode;
5722
5723                 /* Do extra check against inode mode with di_type */
5724                 if (btrfs_inode_type(inode) != di_type) {
5725                         btrfs_crit(fs_info,
5726 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5727                                   inode->i_mode, btrfs_inode_type(inode),
5728                                   di_type);
5729                         iput(inode);
5730                         return ERR_PTR(-EUCLEAN);
5731                 }
5732                 return inode;
5733         }
5734
5735         ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5736                                        &location, &sub_root);
5737         if (ret < 0) {
5738                 if (ret != -ENOENT)
5739                         inode = ERR_PTR(ret);
5740                 else
5741                         inode = new_simple_dir(dir->i_sb, &location, root);
5742         } else {
5743                 inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
5744                 btrfs_put_root(sub_root);
5745
5746                 if (IS_ERR(inode))
5747                         return inode;
5748
5749                 down_read(&fs_info->cleanup_work_sem);
5750                 if (!sb_rdonly(inode->i_sb))
5751                         ret = btrfs_orphan_cleanup(sub_root);
5752                 up_read(&fs_info->cleanup_work_sem);
5753                 if (ret) {
5754                         iput(inode);
5755                         inode = ERR_PTR(ret);
5756                 }
5757         }
5758
5759         return inode;
5760 }
5761
5762 static int btrfs_dentry_delete(const struct dentry *dentry)
5763 {
5764         struct btrfs_root *root;
5765         struct inode *inode = d_inode(dentry);
5766
5767         if (!inode && !IS_ROOT(dentry))
5768                 inode = d_inode(dentry->d_parent);
5769
5770         if (inode) {
5771                 root = BTRFS_I(inode)->root;
5772                 if (btrfs_root_refs(&root->root_item) == 0)
5773                         return 1;
5774
5775                 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5776                         return 1;
5777         }
5778         return 0;
5779 }
5780
5781 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5782                                    unsigned int flags)
5783 {
5784         struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5785
5786         if (inode == ERR_PTR(-ENOENT))
5787                 inode = NULL;
5788         return d_splice_alias(inode, dentry);
5789 }
5790
5791 /*
5792  * Find the highest existing sequence number in a directory and then set the
5793  * in-memory index_cnt variable to the first free sequence number.
5794  */
5795 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5796 {
5797         struct btrfs_root *root = inode->root;
5798         struct btrfs_key key, found_key;
5799         struct btrfs_path *path;
5800         struct extent_buffer *leaf;
5801         int ret;
5802
5803         key.objectid = btrfs_ino(inode);
5804         key.type = BTRFS_DIR_INDEX_KEY;
5805         key.offset = (u64)-1;
5806
5807         path = btrfs_alloc_path();
5808         if (!path)
5809                 return -ENOMEM;
5810
5811         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5812         if (ret < 0)
5813                 goto out;
5814         /* FIXME: we should be able to handle this */
5815         if (ret == 0)
5816                 goto out;
5817         ret = 0;
5818
5819         if (path->slots[0] == 0) {
5820                 inode->index_cnt = BTRFS_DIR_START_INDEX;
5821                 goto out;
5822         }
5823
5824         path->slots[0]--;
5825
5826         leaf = path->nodes[0];
5827         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5828
5829         if (found_key.objectid != btrfs_ino(inode) ||
5830             found_key.type != BTRFS_DIR_INDEX_KEY) {
5831                 inode->index_cnt = BTRFS_DIR_START_INDEX;
5832                 goto out;
5833         }
5834
5835         inode->index_cnt = found_key.offset + 1;
5836 out:
5837         btrfs_free_path(path);
5838         return ret;
5839 }
5840
5841 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5842 {
5843         if (dir->index_cnt == (u64)-1) {
5844                 int ret;
5845
5846                 ret = btrfs_inode_delayed_dir_index_count(dir);
5847                 if (ret) {
5848                         ret = btrfs_set_inode_index_count(dir);
5849                         if (ret)
5850                                 return ret;
5851                 }
5852         }
5853
5854         *index = dir->index_cnt;
5855
5856         return 0;
5857 }
5858
5859 /*
5860  * All this infrastructure exists because dir_emit can fault, and we are holding
5861  * the tree lock when doing readdir.  For now just allocate a buffer and copy
5862  * our information into that, and then dir_emit from the buffer.  This is
5863  * similar to what NFS does, only we don't keep the buffer around in pagecache
5864  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5865  * copy_to_user_inatomic so we don't have to worry about page faulting under the
5866  * tree lock.
5867  */
5868 static int btrfs_opendir(struct inode *inode, struct file *file)
5869 {
5870         struct btrfs_file_private *private;
5871         u64 last_index;
5872         int ret;
5873
5874         ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5875         if (ret)
5876                 return ret;
5877
5878         private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5879         if (!private)
5880                 return -ENOMEM;
5881         private->last_index = last_index;
5882         private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5883         if (!private->filldir_buf) {
5884                 kfree(private);
5885                 return -ENOMEM;
5886         }
5887         file->private_data = private;
5888         return 0;
5889 }
5890
5891 struct dir_entry {
5892         u64 ino;
5893         u64 offset;
5894         unsigned type;
5895         int name_len;
5896 };
5897
5898 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5899 {
5900         while (entries--) {
5901                 struct dir_entry *entry = addr;
5902                 char *name = (char *)(entry + 1);
5903
5904                 ctx->pos = get_unaligned(&entry->offset);
5905                 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5906                                          get_unaligned(&entry->ino),
5907                                          get_unaligned(&entry->type)))
5908                         return 1;
5909                 addr += sizeof(struct dir_entry) +
5910                         get_unaligned(&entry->name_len);
5911                 ctx->pos++;
5912         }
5913         return 0;
5914 }
5915
5916 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5917 {
5918         struct inode *inode = file_inode(file);
5919         struct btrfs_root *root = BTRFS_I(inode)->root;
5920         struct btrfs_file_private *private = file->private_data;
5921         struct btrfs_dir_item *di;
5922         struct btrfs_key key;
5923         struct btrfs_key found_key;
5924         struct btrfs_path *path;
5925         void *addr;
5926         struct list_head ins_list;
5927         struct list_head del_list;
5928         int ret;
5929         char *name_ptr;
5930         int name_len;
5931         int entries = 0;
5932         int total_len = 0;
5933         bool put = false;
5934         struct btrfs_key location;
5935
5936         if (!dir_emit_dots(file, ctx))
5937                 return 0;
5938
5939         path = btrfs_alloc_path();
5940         if (!path)
5941                 return -ENOMEM;
5942
5943         addr = private->filldir_buf;
5944         path->reada = READA_FORWARD;
5945
5946         INIT_LIST_HEAD(&ins_list);
5947         INIT_LIST_HEAD(&del_list);
5948         put = btrfs_readdir_get_delayed_items(inode, private->last_index,
5949                                               &ins_list, &del_list);
5950
5951 again:
5952         key.type = BTRFS_DIR_INDEX_KEY;
5953         key.offset = ctx->pos;
5954         key.objectid = btrfs_ino(BTRFS_I(inode));
5955
5956         btrfs_for_each_slot(root, &key, &found_key, path, ret) {
5957                 struct dir_entry *entry;
5958                 struct extent_buffer *leaf = path->nodes[0];
5959                 u8 ftype;
5960
5961                 if (found_key.objectid != key.objectid)
5962                         break;
5963                 if (found_key.type != BTRFS_DIR_INDEX_KEY)
5964                         break;
5965                 if (found_key.offset < ctx->pos)
5966                         continue;
5967                 if (found_key.offset > private->last_index)
5968                         break;
5969                 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5970                         continue;
5971                 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5972                 name_len = btrfs_dir_name_len(leaf, di);
5973                 if ((total_len + sizeof(struct dir_entry) + name_len) >=
5974                     PAGE_SIZE) {
5975                         btrfs_release_path(path);
5976                         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5977                         if (ret)
5978                                 goto nopos;
5979                         addr = private->filldir_buf;
5980                         entries = 0;
5981                         total_len = 0;
5982                         goto again;
5983                 }
5984
5985                 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
5986                 entry = addr;
5987                 name_ptr = (char *)(entry + 1);
5988                 read_extent_buffer(leaf, name_ptr,
5989                                    (unsigned long)(di + 1), name_len);
5990                 put_unaligned(name_len, &entry->name_len);
5991                 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
5992                 btrfs_dir_item_key_to_cpu(leaf, di, &location);
5993                 put_unaligned(location.objectid, &entry->ino);
5994                 put_unaligned(found_key.offset, &entry->offset);
5995                 entries++;
5996                 addr += sizeof(struct dir_entry) + name_len;
5997                 total_len += sizeof(struct dir_entry) + name_len;
5998         }
5999         /* Catch error encountered during iteration */
6000         if (ret < 0)
6001                 goto err;
6002
6003         btrfs_release_path(path);
6004
6005         ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6006         if (ret)
6007                 goto nopos;
6008
6009         ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6010         if (ret)
6011                 goto nopos;
6012
6013         /*
6014          * Stop new entries from being returned after we return the last
6015          * entry.
6016          *
6017          * New directory entries are assigned a strictly increasing
6018          * offset.  This means that new entries created during readdir
6019          * are *guaranteed* to be seen in the future by that readdir.
6020          * This has broken buggy programs which operate on names as
6021          * they're returned by readdir.  Until we re-use freed offsets
6022          * we have this hack to stop new entries from being returned
6023          * under the assumption that they'll never reach this huge
6024          * offset.
6025          *
6026          * This is being careful not to overflow 32bit loff_t unless the
6027          * last entry requires it because doing so has broken 32bit apps
6028          * in the past.
6029          */
6030         if (ctx->pos >= INT_MAX)
6031                 ctx->pos = LLONG_MAX;
6032         else
6033                 ctx->pos = INT_MAX;
6034 nopos:
6035         ret = 0;
6036 err:
6037         if (put)
6038                 btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6039         btrfs_free_path(path);
6040         return ret;
6041 }
6042
6043 /*
6044  * This is somewhat expensive, updating the tree every time the
6045  * inode changes.  But, it is most likely to find the inode in cache.
6046  * FIXME, needs more benchmarking...there are no reasons other than performance
6047  * to keep or drop this code.
6048  */
6049 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6050 {
6051         struct btrfs_root *root = inode->root;
6052         struct btrfs_fs_info *fs_info = root->fs_info;
6053         struct btrfs_trans_handle *trans;
6054         int ret;
6055
6056         if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6057                 return 0;
6058
6059         trans = btrfs_join_transaction(root);
6060         if (IS_ERR(trans))
6061                 return PTR_ERR(trans);
6062
6063         ret = btrfs_update_inode(trans, root, inode);
6064         if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
6065                 /* whoops, lets try again with the full transaction */
6066                 btrfs_end_transaction(trans);
6067                 trans = btrfs_start_transaction(root, 1);
6068                 if (IS_ERR(trans))
6069                         return PTR_ERR(trans);
6070
6071                 ret = btrfs_update_inode(trans, root, inode);
6072         }
6073         btrfs_end_transaction(trans);
6074         if (inode->delayed_node)
6075                 btrfs_balance_delayed_items(fs_info);
6076
6077         return ret;
6078 }
6079
6080 /*
6081  * This is a copy of file_update_time.  We need this so we can return error on
6082  * ENOSPC for updating the inode in the case of file write and mmap writes.
6083  */
6084 static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
6085                              int flags)
6086 {
6087         struct btrfs_root *root = BTRFS_I(inode)->root;
6088         bool dirty = flags & ~S_VERSION;
6089
6090         if (btrfs_root_readonly(root))
6091                 return -EROFS;
6092
6093         if (flags & S_VERSION)
6094                 dirty |= inode_maybe_inc_iversion(inode, dirty);
6095         if (flags & S_CTIME)
6096                 inode->i_ctime = *now;
6097         if (flags & S_MTIME)
6098                 inode->i_mtime = *now;
6099         if (flags & S_ATIME)
6100                 inode->i_atime = *now;
6101         return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6102 }
6103
6104 /*
6105  * helper to find a free sequence number in a given directory.  This current
6106  * code is very simple, later versions will do smarter things in the btree
6107  */
6108 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6109 {
6110         int ret = 0;
6111
6112         if (dir->index_cnt == (u64)-1) {
6113                 ret = btrfs_inode_delayed_dir_index_count(dir);
6114                 if (ret) {
6115                         ret = btrfs_set_inode_index_count(dir);
6116                         if (ret)
6117                                 return ret;
6118                 }
6119         }
6120
6121         *index = dir->index_cnt;
6122         dir->index_cnt++;
6123
6124         return ret;
6125 }
6126
6127 static int btrfs_insert_inode_locked(struct inode *inode)
6128 {
6129         struct btrfs_iget_args args;
6130
6131         args.ino = BTRFS_I(inode)->location.objectid;
6132         args.root = BTRFS_I(inode)->root;
6133
6134         return insert_inode_locked4(inode,
6135                    btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6136                    btrfs_find_actor, &args);
6137 }
6138
6139 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6140                             unsigned int *trans_num_items)
6141 {
6142         struct inode *dir = args->dir;
6143         struct inode *inode = args->inode;
6144         int ret;
6145
6146         if (!args->orphan) {
6147                 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6148                                              &args->fname);
6149                 if (ret)
6150                         return ret;
6151         }
6152
6153         ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6154         if (ret) {
6155                 fscrypt_free_filename(&args->fname);
6156                 return ret;
6157         }
6158
6159         /* 1 to add inode item */
6160         *trans_num_items = 1;
6161         /* 1 to add compression property */
6162         if (BTRFS_I(dir)->prop_compress)
6163                 (*trans_num_items)++;
6164         /* 1 to add default ACL xattr */
6165         if (args->default_acl)
6166                 (*trans_num_items)++;
6167         /* 1 to add access ACL xattr */
6168         if (args->acl)
6169                 (*trans_num_items)++;
6170 #ifdef CONFIG_SECURITY
6171         /* 1 to add LSM xattr */
6172         if (dir->i_security)
6173                 (*trans_num_items)++;
6174 #endif
6175         if (args->orphan) {
6176                 /* 1 to add orphan item */
6177                 (*trans_num_items)++;
6178         } else {
6179                 /*
6180                  * 1 to add dir item
6181                  * 1 to add dir index
6182                  * 1 to update parent inode item
6183                  *
6184                  * No need for 1 unit for the inode ref item because it is
6185                  * inserted in a batch together with the inode item at
6186                  * btrfs_create_new_inode().
6187                  */
6188                 *trans_num_items += 3;
6189         }
6190         return 0;
6191 }
6192
6193 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6194 {
6195         posix_acl_release(args->acl);
6196         posix_acl_release(args->default_acl);
6197         fscrypt_free_filename(&args->fname);
6198 }
6199
6200 /*
6201  * Inherit flags from the parent inode.
6202  *
6203  * Currently only the compression flags and the cow flags are inherited.
6204  */
6205 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6206 {
6207         unsigned int flags;
6208
6209         flags = dir->flags;
6210
6211         if (flags & BTRFS_INODE_NOCOMPRESS) {
6212                 inode->flags &= ~BTRFS_INODE_COMPRESS;
6213                 inode->flags |= BTRFS_INODE_NOCOMPRESS;
6214         } else if (flags & BTRFS_INODE_COMPRESS) {
6215                 inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6216                 inode->flags |= BTRFS_INODE_COMPRESS;
6217         }
6218
6219         if (flags & BTRFS_INODE_NODATACOW) {
6220                 inode->flags |= BTRFS_INODE_NODATACOW;
6221                 if (S_ISREG(inode->vfs_inode.i_mode))
6222                         inode->flags |= BTRFS_INODE_NODATASUM;
6223         }
6224
6225         btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
6226 }
6227
6228 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6229                            struct btrfs_new_inode_args *args)
6230 {
6231         struct inode *dir = args->dir;
6232         struct inode *inode = args->inode;
6233         const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6234         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6235         struct btrfs_root *root;
6236         struct btrfs_inode_item *inode_item;
6237         struct btrfs_key *location;
6238         struct btrfs_path *path;
6239         u64 objectid;
6240         struct btrfs_inode_ref *ref;
6241         struct btrfs_key key[2];
6242         u32 sizes[2];
6243         struct btrfs_item_batch batch;
6244         unsigned long ptr;
6245         int ret;
6246
6247         path = btrfs_alloc_path();
6248         if (!path)
6249                 return -ENOMEM;
6250
6251         if (!args->subvol)
6252                 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6253         root = BTRFS_I(inode)->root;
6254
6255         ret = btrfs_get_free_objectid(root, &objectid);
6256         if (ret)
6257                 goto out;
6258         inode->i_ino = objectid;
6259
6260         if (args->orphan) {
6261                 /*
6262                  * O_TMPFILE, set link count to 0, so that after this point, we
6263                  * fill in an inode item with the correct link count.
6264                  */
6265                 set_nlink(inode, 0);
6266         } else {
6267                 trace_btrfs_inode_request(dir);
6268
6269                 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6270                 if (ret)
6271                         goto out;
6272         }
6273         /* index_cnt is ignored for everything but a dir. */
6274         BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6275         BTRFS_I(inode)->generation = trans->transid;
6276         inode->i_generation = BTRFS_I(inode)->generation;
6277
6278         /*
6279          * Subvolumes don't inherit flags from their parent directory.
6280          * Originally this was probably by accident, but we probably can't
6281          * change it now without compatibility issues.
6282          */
6283         if (!args->subvol)
6284                 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6285
6286         if (S_ISREG(inode->i_mode)) {
6287                 if (btrfs_test_opt(fs_info, NODATASUM))
6288                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6289                 if (btrfs_test_opt(fs_info, NODATACOW))
6290                         BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6291                                 BTRFS_INODE_NODATASUM;
6292         }
6293
6294         location = &BTRFS_I(inode)->location;
6295         location->objectid = objectid;
6296         location->offset = 0;
6297         location->type = BTRFS_INODE_ITEM_KEY;
6298
6299         ret = btrfs_insert_inode_locked(inode);
6300         if (ret < 0) {
6301                 if (!args->orphan)
6302                         BTRFS_I(dir)->index_cnt--;
6303                 goto out;
6304         }
6305
6306         /*
6307          * We could have gotten an inode number from somebody who was fsynced
6308          * and then removed in this same transaction, so let's just set full
6309          * sync since it will be a full sync anyway and this will blow away the
6310          * old info in the log.
6311          */
6312         btrfs_set_inode_full_sync(BTRFS_I(inode));
6313
6314         key[0].objectid = objectid;
6315         key[0].type = BTRFS_INODE_ITEM_KEY;
6316         key[0].offset = 0;
6317
6318         sizes[0] = sizeof(struct btrfs_inode_item);
6319
6320         if (!args->orphan) {
6321                 /*
6322                  * Start new inodes with an inode_ref. This is slightly more
6323                  * efficient for small numbers of hard links since they will
6324                  * be packed into one item. Extended refs will kick in if we
6325                  * add more hard links than can fit in the ref item.
6326                  */
6327                 key[1].objectid = objectid;
6328                 key[1].type = BTRFS_INODE_REF_KEY;
6329                 if (args->subvol) {
6330                         key[1].offset = objectid;
6331                         sizes[1] = 2 + sizeof(*ref);
6332                 } else {
6333                         key[1].offset = btrfs_ino(BTRFS_I(dir));
6334                         sizes[1] = name->len + sizeof(*ref);
6335                 }
6336         }
6337
6338         batch.keys = &key[0];
6339         batch.data_sizes = &sizes[0];
6340         batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6341         batch.nr = args->orphan ? 1 : 2;
6342         ret = btrfs_insert_empty_items(trans, root, path, &batch);
6343         if (ret != 0) {
6344                 btrfs_abort_transaction(trans, ret);
6345                 goto discard;
6346         }
6347
6348         inode->i_mtime = current_time(inode);
6349         inode->i_atime = inode->i_mtime;
6350         inode->i_ctime = inode->i_mtime;
6351         BTRFS_I(inode)->i_otime = inode->i_mtime;
6352
6353         /*
6354          * We're going to fill the inode item now, so at this point the inode
6355          * must be fully initialized.
6356          */
6357
6358         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6359                                   struct btrfs_inode_item);
6360         memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6361                              sizeof(*inode_item));
6362         fill_inode_item(trans, path->nodes[0], inode_item, inode);
6363
6364         if (!args->orphan) {
6365                 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6366                                      struct btrfs_inode_ref);
6367                 ptr = (unsigned long)(ref + 1);
6368                 if (args->subvol) {
6369                         btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6370                         btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6371                         write_extent_buffer(path->nodes[0], "..", ptr, 2);
6372                 } else {
6373                         btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6374                                                      name->len);
6375                         btrfs_set_inode_ref_index(path->nodes[0], ref,
6376                                                   BTRFS_I(inode)->dir_index);
6377                         write_extent_buffer(path->nodes[0], name->name, ptr,
6378                                             name->len);
6379                 }
6380         }
6381
6382         btrfs_mark_buffer_dirty(path->nodes[0]);
6383         /*
6384          * We don't need the path anymore, plus inheriting properties, adding
6385          * ACLs, security xattrs, orphan item or adding the link, will result in
6386          * allocating yet another path. So just free our path.
6387          */
6388         btrfs_free_path(path);
6389         path = NULL;
6390
6391         if (args->subvol) {
6392                 struct inode *parent;
6393
6394                 /*
6395                  * Subvolumes inherit properties from their parent subvolume,
6396                  * not the directory they were created in.
6397                  */
6398                 parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
6399                                     BTRFS_I(dir)->root);
6400                 if (IS_ERR(parent)) {
6401                         ret = PTR_ERR(parent);
6402                 } else {
6403                         ret = btrfs_inode_inherit_props(trans, inode, parent);
6404                         iput(parent);
6405                 }
6406         } else {
6407                 ret = btrfs_inode_inherit_props(trans, inode, dir);
6408         }
6409         if (ret) {
6410                 btrfs_err(fs_info,
6411                           "error inheriting props for ino %llu (root %llu): %d",
6412                           btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
6413                           ret);
6414         }
6415
6416         /*
6417          * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6418          * probably a bug.
6419          */
6420         if (!args->subvol) {
6421                 ret = btrfs_init_inode_security(trans, args);
6422                 if (ret) {
6423                         btrfs_abort_transaction(trans, ret);
6424                         goto discard;
6425                 }
6426         }
6427
6428         inode_tree_add(BTRFS_I(inode));
6429
6430         trace_btrfs_inode_new(inode);
6431         btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6432
6433         btrfs_update_root_times(trans, root);
6434
6435         if (args->orphan) {
6436                 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6437         } else {
6438                 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6439                                      0, BTRFS_I(inode)->dir_index);
6440         }
6441         if (ret) {
6442                 btrfs_abort_transaction(trans, ret);
6443                 goto discard;
6444         }
6445
6446         return 0;
6447
6448 discard:
6449         /*
6450          * discard_new_inode() calls iput(), but the caller owns the reference
6451          * to the inode.
6452          */
6453         ihold(inode);
6454         discard_new_inode(inode);
6455 out:
6456         btrfs_free_path(path);
6457         return ret;
6458 }
6459
6460 /*
6461  * utility function to add 'inode' into 'parent_inode' with
6462  * a give name and a given sequence number.
6463  * if 'add_backref' is true, also insert a backref from the
6464  * inode to the parent directory.
6465  */
6466 int btrfs_add_link(struct btrfs_trans_handle *trans,
6467                    struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6468                    const struct fscrypt_str *name, int add_backref, u64 index)
6469 {
6470         int ret = 0;
6471         struct btrfs_key key;
6472         struct btrfs_root *root = parent_inode->root;
6473         u64 ino = btrfs_ino(inode);
6474         u64 parent_ino = btrfs_ino(parent_inode);
6475
6476         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6477                 memcpy(&key, &inode->root->root_key, sizeof(key));
6478         } else {
6479                 key.objectid = ino;
6480                 key.type = BTRFS_INODE_ITEM_KEY;
6481                 key.offset = 0;
6482         }
6483
6484         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6485                 ret = btrfs_add_root_ref(trans, key.objectid,
6486                                          root->root_key.objectid, parent_ino,
6487                                          index, name);
6488         } else if (add_backref) {
6489                 ret = btrfs_insert_inode_ref(trans, root, name,
6490                                              ino, parent_ino, index);
6491         }
6492
6493         /* Nothing to clean up yet */
6494         if (ret)
6495                 return ret;
6496
6497         ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6498                                     btrfs_inode_type(&inode->vfs_inode), index);
6499         if (ret == -EEXIST || ret == -EOVERFLOW)
6500                 goto fail_dir_item;
6501         else if (ret) {
6502                 btrfs_abort_transaction(trans, ret);
6503                 return ret;
6504         }
6505
6506         btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6507                            name->len * 2);
6508         inode_inc_iversion(&parent_inode->vfs_inode);
6509         /*
6510          * If we are replaying a log tree, we do not want to update the mtime
6511          * and ctime of the parent directory with the current time, since the
6512          * log replay procedure is responsible for setting them to their correct
6513          * values (the ones it had when the fsync was done).
6514          */
6515         if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
6516                 struct timespec64 now = current_time(&parent_inode->vfs_inode);
6517
6518                 parent_inode->vfs_inode.i_mtime = now;
6519                 parent_inode->vfs_inode.i_ctime = now;
6520         }
6521         ret = btrfs_update_inode(trans, root, parent_inode);
6522         if (ret)
6523                 btrfs_abort_transaction(trans, ret);
6524         return ret;
6525
6526 fail_dir_item:
6527         if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6528                 u64 local_index;
6529                 int err;
6530                 err = btrfs_del_root_ref(trans, key.objectid,
6531                                          root->root_key.objectid, parent_ino,
6532                                          &local_index, name);
6533                 if (err)
6534                         btrfs_abort_transaction(trans, err);
6535         } else if (add_backref) {
6536                 u64 local_index;
6537                 int err;
6538
6539                 err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6540                                           &local_index);
6541                 if (err)
6542                         btrfs_abort_transaction(trans, err);
6543         }
6544
6545         /* Return the original error code */
6546         return ret;
6547 }
6548
6549 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6550                                struct inode *inode)
6551 {
6552         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6553         struct btrfs_root *root = BTRFS_I(dir)->root;
6554         struct btrfs_new_inode_args new_inode_args = {
6555                 .dir = dir,
6556                 .dentry = dentry,
6557                 .inode = inode,
6558         };
6559         unsigned int trans_num_items;
6560         struct btrfs_trans_handle *trans;
6561         int err;
6562
6563         err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6564         if (err)
6565                 goto out_inode;
6566
6567         trans = btrfs_start_transaction(root, trans_num_items);
6568         if (IS_ERR(trans)) {
6569                 err = PTR_ERR(trans);
6570                 goto out_new_inode_args;
6571         }
6572
6573         err = btrfs_create_new_inode(trans, &new_inode_args);
6574         if (!err)
6575                 d_instantiate_new(dentry, inode);
6576
6577         btrfs_end_transaction(trans);
6578         btrfs_btree_balance_dirty(fs_info);
6579 out_new_inode_args:
6580         btrfs_new_inode_args_destroy(&new_inode_args);
6581 out_inode:
6582         if (err)
6583                 iput(inode);
6584         return err;
6585 }
6586
6587 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6588                        struct dentry *dentry, umode_t mode, dev_t rdev)
6589 {
6590         struct inode *inode;
6591
6592         inode = new_inode(dir->i_sb);
6593         if (!inode)
6594                 return -ENOMEM;
6595         inode_init_owner(idmap, inode, dir, mode);
6596         inode->i_op = &btrfs_special_inode_operations;
6597         init_special_inode(inode, inode->i_mode, rdev);
6598         return btrfs_create_common(dir, dentry, inode);
6599 }
6600
6601 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6602                         struct dentry *dentry, umode_t mode, bool excl)
6603 {
6604         struct inode *inode;
6605
6606         inode = new_inode(dir->i_sb);
6607         if (!inode)
6608                 return -ENOMEM;
6609         inode_init_owner(idmap, inode, dir, mode);
6610         inode->i_fop = &btrfs_file_operations;
6611         inode->i_op = &btrfs_file_inode_operations;
6612         inode->i_mapping->a_ops = &btrfs_aops;
6613         return btrfs_create_common(dir, dentry, inode);
6614 }
6615
6616 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6617                       struct dentry *dentry)
6618 {
6619         struct btrfs_trans_handle *trans = NULL;
6620         struct btrfs_root *root = BTRFS_I(dir)->root;
6621         struct inode *inode = d_inode(old_dentry);
6622         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6623         struct fscrypt_name fname;
6624         u64 index;
6625         int err;
6626         int drop_inode = 0;
6627
6628         /* do not allow sys_link's with other subvols of the same device */
6629         if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6630                 return -EXDEV;
6631
6632         if (inode->i_nlink >= BTRFS_LINK_MAX)
6633                 return -EMLINK;
6634
6635         err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6636         if (err)
6637                 goto fail;
6638
6639         err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6640         if (err)
6641                 goto fail;
6642
6643         /*
6644          * 2 items for inode and inode ref
6645          * 2 items for dir items
6646          * 1 item for parent inode
6647          * 1 item for orphan item deletion if O_TMPFILE
6648          */
6649         trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6650         if (IS_ERR(trans)) {
6651                 err = PTR_ERR(trans);
6652                 trans = NULL;
6653                 goto fail;
6654         }
6655
6656         /* There are several dir indexes for this inode, clear the cache. */
6657         BTRFS_I(inode)->dir_index = 0ULL;
6658         inc_nlink(inode);
6659         inode_inc_iversion(inode);
6660         inode->i_ctime = current_time(inode);
6661         ihold(inode);
6662         set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6663
6664         err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6665                              &fname.disk_name, 1, index);
6666
6667         if (err) {
6668                 drop_inode = 1;
6669         } else {
6670                 struct dentry *parent = dentry->d_parent;
6671
6672                 err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6673                 if (err)
6674                         goto fail;
6675                 if (inode->i_nlink == 1) {
6676                         /*
6677                          * If new hard link count is 1, it's a file created
6678                          * with open(2) O_TMPFILE flag.
6679                          */
6680                         err = btrfs_orphan_del(trans, BTRFS_I(inode));
6681                         if (err)
6682                                 goto fail;
6683                 }
6684                 d_instantiate(dentry, inode);
6685                 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6686         }
6687
6688 fail:
6689         fscrypt_free_filename(&fname);
6690         if (trans)
6691                 btrfs_end_transaction(trans);
6692         if (drop_inode) {
6693                 inode_dec_link_count(inode);
6694                 iput(inode);
6695         }
6696         btrfs_btree_balance_dirty(fs_info);
6697         return err;
6698 }
6699
6700 static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6701                        struct dentry *dentry, umode_t mode)
6702 {
6703         struct inode *inode;
6704
6705         inode = new_inode(dir->i_sb);
6706         if (!inode)
6707                 return -ENOMEM;
6708         inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6709         inode->i_op = &btrfs_dir_inode_operations;
6710         inode->i_fop = &btrfs_dir_file_operations;
6711         return btrfs_create_common(dir, dentry, inode);
6712 }
6713
6714 static noinline int uncompress_inline(struct btrfs_path *path,
6715                                       struct page *page,
6716                                       struct btrfs_file_extent_item *item)
6717 {
6718         int ret;
6719         struct extent_buffer *leaf = path->nodes[0];
6720         char *tmp;
6721         size_t max_size;
6722         unsigned long inline_size;
6723         unsigned long ptr;
6724         int compress_type;
6725
6726         compress_type = btrfs_file_extent_compression(leaf, item);
6727         max_size = btrfs_file_extent_ram_bytes(leaf, item);
6728         inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6729         tmp = kmalloc(inline_size, GFP_NOFS);
6730         if (!tmp)
6731                 return -ENOMEM;
6732         ptr = btrfs_file_extent_inline_start(item);
6733
6734         read_extent_buffer(leaf, tmp, ptr, inline_size);
6735
6736         max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6737         ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
6738
6739         /*
6740          * decompression code contains a memset to fill in any space between the end
6741          * of the uncompressed data and the end of max_size in case the decompressed
6742          * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6743          * the end of an inline extent and the beginning of the next block, so we
6744          * cover that region here.
6745          */
6746
6747         if (max_size < PAGE_SIZE)
6748                 memzero_page(page, max_size, PAGE_SIZE - max_size);
6749         kfree(tmp);
6750         return ret;
6751 }
6752
6753 static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
6754                               struct page *page)
6755 {
6756         struct btrfs_file_extent_item *fi;
6757         void *kaddr;
6758         size_t copy_size;
6759
6760         if (!page || PageUptodate(page))
6761                 return 0;
6762
6763         ASSERT(page_offset(page) == 0);
6764
6765         fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6766                             struct btrfs_file_extent_item);
6767         if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6768                 return uncompress_inline(path, page, fi);
6769
6770         copy_size = min_t(u64, PAGE_SIZE,
6771                           btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6772         kaddr = kmap_local_page(page);
6773         read_extent_buffer(path->nodes[0], kaddr,
6774                            btrfs_file_extent_inline_start(fi), copy_size);
6775         kunmap_local(kaddr);
6776         if (copy_size < PAGE_SIZE)
6777                 memzero_page(page, copy_size, PAGE_SIZE - copy_size);
6778         return 0;
6779 }
6780
6781 /*
6782  * Lookup the first extent overlapping a range in a file.
6783  *
6784  * @inode:      file to search in
6785  * @page:       page to read extent data into if the extent is inline
6786  * @pg_offset:  offset into @page to copy to
6787  * @start:      file offset
6788  * @len:        length of range starting at @start
6789  *
6790  * Return the first &struct extent_map which overlaps the given range, reading
6791  * it from the B-tree and caching it if necessary. Note that there may be more
6792  * extents which overlap the given range after the returned extent_map.
6793  *
6794  * If @page is not NULL and the extent is inline, this also reads the extent
6795  * data directly into the page and marks the extent up to date in the io_tree.
6796  *
6797  * Return: ERR_PTR on error, non-NULL extent_map on success.
6798  */
6799 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6800                                     struct page *page, size_t pg_offset,
6801                                     u64 start, u64 len)
6802 {
6803         struct btrfs_fs_info *fs_info = inode->root->fs_info;
6804         int ret = 0;
6805         u64 extent_start = 0;
6806         u64 extent_end = 0;
6807         u64 objectid = btrfs_ino(inode);
6808         int extent_type = -1;
6809         struct btrfs_path *path = NULL;
6810         struct btrfs_root *root = inode->root;
6811         struct btrfs_file_extent_item *item;
6812         struct extent_buffer *leaf;
6813         struct btrfs_key found_key;
6814         struct extent_map *em = NULL;
6815         struct extent_map_tree *em_tree = &inode->extent_tree;
6816
6817         read_lock(&em_tree->lock);
6818         em = lookup_extent_mapping(em_tree, start, len);
6819         read_unlock(&em_tree->lock);
6820
6821         if (em) {
6822                 if (em->start > start || em->start + em->len <= start)
6823                         free_extent_map(em);
6824                 else if (em->block_start == EXTENT_MAP_INLINE && page)
6825                         free_extent_map(em);
6826                 else
6827                         goto out;
6828         }
6829         em = alloc_extent_map();
6830         if (!em) {
6831                 ret = -ENOMEM;
6832                 goto out;
6833         }
6834         em->start = EXTENT_MAP_HOLE;
6835         em->orig_start = EXTENT_MAP_HOLE;
6836         em->len = (u64)-1;
6837         em->block_len = (u64)-1;
6838
6839         path = btrfs_alloc_path();
6840         if (!path) {
6841                 ret = -ENOMEM;
6842                 goto out;
6843         }
6844
6845         /* Chances are we'll be called again, so go ahead and do readahead */
6846         path->reada = READA_FORWARD;
6847
6848         /*
6849          * The same explanation in load_free_space_cache applies here as well,
6850          * we only read when we're loading the free space cache, and at that
6851          * point the commit_root has everything we need.
6852          */
6853         if (btrfs_is_free_space_inode(inode)) {
6854                 path->search_commit_root = 1;
6855                 path->skip_locking = 1;
6856         }
6857
6858         ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6859         if (ret < 0) {
6860                 goto out;
6861         } else if (ret > 0) {
6862                 if (path->slots[0] == 0)
6863                         goto not_found;
6864                 path->slots[0]--;
6865                 ret = 0;
6866         }
6867
6868         leaf = path->nodes[0];
6869         item = btrfs_item_ptr(leaf, path->slots[0],
6870                               struct btrfs_file_extent_item);
6871         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6872         if (found_key.objectid != objectid ||
6873             found_key.type != BTRFS_EXTENT_DATA_KEY) {
6874                 /*
6875                  * If we backup past the first extent we want to move forward
6876                  * and see if there is an extent in front of us, otherwise we'll
6877                  * say there is a hole for our whole search range which can
6878                  * cause problems.
6879                  */
6880                 extent_end = start;
6881                 goto next;
6882         }
6883
6884         extent_type = btrfs_file_extent_type(leaf, item);
6885         extent_start = found_key.offset;
6886         extent_end = btrfs_file_extent_end(path);
6887         if (extent_type == BTRFS_FILE_EXTENT_REG ||
6888             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6889                 /* Only regular file could have regular/prealloc extent */
6890                 if (!S_ISREG(inode->vfs_inode.i_mode)) {
6891                         ret = -EUCLEAN;
6892                         btrfs_crit(fs_info,
6893                 "regular/prealloc extent found for non-regular inode %llu",
6894                                    btrfs_ino(inode));
6895                         goto out;
6896                 }
6897                 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6898                                                        extent_start);
6899         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6900                 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6901                                                       path->slots[0],
6902                                                       extent_start);
6903         }
6904 next:
6905         if (start >= extent_end) {
6906                 path->slots[0]++;
6907                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6908                         ret = btrfs_next_leaf(root, path);
6909                         if (ret < 0)
6910                                 goto out;
6911                         else if (ret > 0)
6912                                 goto not_found;
6913
6914                         leaf = path->nodes[0];
6915                 }
6916                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6917                 if (found_key.objectid != objectid ||
6918                     found_key.type != BTRFS_EXTENT_DATA_KEY)
6919                         goto not_found;
6920                 if (start + len <= found_key.offset)
6921                         goto not_found;
6922                 if (start > found_key.offset)
6923                         goto next;
6924
6925                 /* New extent overlaps with existing one */
6926                 em->start = start;
6927                 em->orig_start = start;
6928                 em->len = found_key.offset - start;
6929                 em->block_start = EXTENT_MAP_HOLE;
6930                 goto insert;
6931         }
6932
6933         btrfs_extent_item_to_extent_map(inode, path, item, em);
6934
6935         if (extent_type == BTRFS_FILE_EXTENT_REG ||
6936             extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6937                 goto insert;
6938         } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6939                 /*
6940                  * Inline extent can only exist at file offset 0. This is
6941                  * ensured by tree-checker and inline extent creation path.
6942                  * Thus all members representing file offsets should be zero.
6943                  */
6944                 ASSERT(pg_offset == 0);
6945                 ASSERT(extent_start == 0);
6946                 ASSERT(em->start == 0);
6947
6948                 /*
6949                  * btrfs_extent_item_to_extent_map() should have properly
6950                  * initialized em members already.
6951                  *
6952                  * Other members are not utilized for inline extents.
6953                  */
6954                 ASSERT(em->block_start == EXTENT_MAP_INLINE);
6955                 ASSERT(em->len == fs_info->sectorsize);
6956
6957                 ret = read_inline_extent(inode, path, page);
6958                 if (ret < 0)
6959                         goto out;
6960                 goto insert;
6961         }
6962 not_found:
6963         em->start = start;
6964         em->orig_start = start;
6965         em->len = len;
6966         em->block_start = EXTENT_MAP_HOLE;
6967 insert:
6968         ret = 0;
6969         btrfs_release_path(path);
6970         if (em->start > start || extent_map_end(em) <= start) {
6971                 btrfs_err(fs_info,
6972                           "bad extent! em: [%llu %llu] passed [%llu %llu]",
6973                           em->start, em->len, start, len);
6974                 ret = -EIO;
6975                 goto out;
6976         }
6977
6978         write_lock(&em_tree->lock);
6979         ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6980         write_unlock(&em_tree->lock);
6981 out:
6982         btrfs_free_path(path);
6983
6984         trace_btrfs_get_extent(root, inode, em);
6985
6986         if (ret) {
6987                 free_extent_map(em);
6988                 return ERR_PTR(ret);
6989         }
6990         return em;
6991 }
6992
6993 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
6994                                                   struct btrfs_dio_data *dio_data,
6995                                                   const u64 start,
6996                                                   const u64 len,
6997                                                   const u64 orig_start,
6998                                                   const u64 block_start,
6999                                                   const u64 block_len,
7000                                                   const u64 orig_block_len,
7001                                                   const u64 ram_bytes,
7002                                                   const int type)
7003 {
7004         struct extent_map *em = NULL;
7005         struct btrfs_ordered_extent *ordered;
7006
7007         if (type != BTRFS_ORDERED_NOCOW) {
7008                 em = create_io_em(inode, start, len, orig_start, block_start,
7009                                   block_len, orig_block_len, ram_bytes,
7010                                   BTRFS_COMPRESS_NONE, /* compress_type */
7011                                   type);
7012                 if (IS_ERR(em))
7013                         goto out;
7014         }
7015         ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
7016                                              block_start, block_len, 0,
7017                                              (1 << type) |
7018                                              (1 << BTRFS_ORDERED_DIRECT),
7019                                              BTRFS_COMPRESS_NONE);
7020         if (IS_ERR(ordered)) {
7021                 if (em) {
7022                         free_extent_map(em);
7023                         btrfs_drop_extent_map_range(inode, start,
7024                                                     start + len - 1, false);
7025                 }
7026                 em = ERR_CAST(ordered);
7027         } else {
7028                 ASSERT(!dio_data->ordered);
7029                 dio_data->ordered = ordered;
7030         }
7031  out:
7032
7033         return em;
7034 }
7035
7036 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
7037                                                   struct btrfs_dio_data *dio_data,
7038                                                   u64 start, u64 len)
7039 {
7040         struct btrfs_root *root = inode->root;
7041         struct btrfs_fs_info *fs_info = root->fs_info;
7042         struct extent_map *em;
7043         struct btrfs_key ins;
7044         u64 alloc_hint;
7045         int ret;
7046
7047         alloc_hint = get_extent_allocation_hint(inode, start, len);
7048         ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7049                                    0, alloc_hint, &ins, 1, 1);
7050         if (ret)
7051                 return ERR_PTR(ret);
7052
7053         em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
7054                                      ins.objectid, ins.offset, ins.offset,
7055                                      ins.offset, BTRFS_ORDERED_REGULAR);
7056         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7057         if (IS_ERR(em))
7058                 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7059                                            1);
7060
7061         return em;
7062 }
7063
7064 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7065 {
7066         struct btrfs_block_group *block_group;
7067         bool readonly = false;
7068
7069         block_group = btrfs_lookup_block_group(fs_info, bytenr);
7070         if (!block_group || block_group->ro)
7071                 readonly = true;
7072         if (block_group)
7073                 btrfs_put_block_group(block_group);
7074         return readonly;
7075 }
7076
7077 /*
7078  * Check if we can do nocow write into the range [@offset, @offset + @len)
7079  *
7080  * @offset:     File offset
7081  * @len:        The length to write, will be updated to the nocow writeable
7082  *              range
7083  * @orig_start: (optional) Return the original file offset of the file extent
7084  * @orig_len:   (optional) Return the original on-disk length of the file extent
7085  * @ram_bytes:  (optional) Return the ram_bytes of the file extent
7086  * @strict:     if true, omit optimizations that might force us into unnecessary
7087  *              cow. e.g., don't trust generation number.
7088  *
7089  * Return:
7090  * >0   and update @len if we can do nocow write
7091  *  0   if we can't do nocow write
7092  * <0   if error happened
7093  *
7094  * NOTE: This only checks the file extents, caller is responsible to wait for
7095  *       any ordered extents.
7096  */
7097 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7098                               u64 *orig_start, u64 *orig_block_len,
7099                               u64 *ram_bytes, bool nowait, bool strict)
7100 {
7101         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7102         struct can_nocow_file_extent_args nocow_args = { 0 };
7103         struct btrfs_path *path;
7104         int ret;
7105         struct extent_buffer *leaf;
7106         struct btrfs_root *root = BTRFS_I(inode)->root;
7107         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7108         struct btrfs_file_extent_item *fi;
7109         struct btrfs_key key;
7110         int found_type;
7111
7112         path = btrfs_alloc_path();
7113         if (!path)
7114                 return -ENOMEM;
7115         path->nowait = nowait;
7116
7117         ret = btrfs_lookup_file_extent(NULL, root, path,
7118                         btrfs_ino(BTRFS_I(inode)), offset, 0);
7119         if (ret < 0)
7120                 goto out;
7121
7122         if (ret == 1) {
7123                 if (path->slots[0] == 0) {
7124                         /* can't find the item, must cow */
7125                         ret = 0;
7126                         goto out;
7127                 }
7128                 path->slots[0]--;
7129         }
7130         ret = 0;
7131         leaf = path->nodes[0];
7132         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7133         if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7134             key.type != BTRFS_EXTENT_DATA_KEY) {
7135                 /* not our file or wrong item type, must cow */
7136                 goto out;
7137         }
7138
7139         if (key.offset > offset) {
7140                 /* Wrong offset, must cow */
7141                 goto out;
7142         }
7143
7144         if (btrfs_file_extent_end(path) <= offset)
7145                 goto out;
7146
7147         fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7148         found_type = btrfs_file_extent_type(leaf, fi);
7149         if (ram_bytes)
7150                 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7151
7152         nocow_args.start = offset;
7153         nocow_args.end = offset + *len - 1;
7154         nocow_args.strict = strict;
7155         nocow_args.free_path = true;
7156
7157         ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
7158         /* can_nocow_file_extent() has freed the path. */
7159         path = NULL;
7160
7161         if (ret != 1) {
7162                 /* Treat errors as not being able to NOCOW. */
7163                 ret = 0;
7164                 goto out;
7165         }
7166
7167         ret = 0;
7168         if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
7169                 goto out;
7170
7171         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7172             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7173                 u64 range_end;
7174
7175                 range_end = round_up(offset + nocow_args.num_bytes,
7176                                      root->fs_info->sectorsize) - 1;
7177                 ret = test_range_bit(io_tree, offset, range_end,
7178                                      EXTENT_DELALLOC, 0, NULL);
7179                 if (ret) {
7180                         ret = -EAGAIN;
7181                         goto out;
7182                 }
7183         }
7184
7185         if (orig_start)
7186                 *orig_start = key.offset - nocow_args.extent_offset;
7187         if (orig_block_len)
7188                 *orig_block_len = nocow_args.disk_num_bytes;
7189
7190         *len = nocow_args.num_bytes;
7191         ret = 1;
7192 out:
7193         btrfs_free_path(path);
7194         return ret;
7195 }
7196
7197 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7198                               struct extent_state **cached_state,
7199                               unsigned int iomap_flags)
7200 {
7201         const bool writing = (iomap_flags & IOMAP_WRITE);
7202         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7203         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7204         struct btrfs_ordered_extent *ordered;
7205         int ret = 0;
7206
7207         while (1) {
7208                 if (nowait) {
7209                         if (!try_lock_extent(io_tree, lockstart, lockend,
7210                                              cached_state))
7211                                 return -EAGAIN;
7212                 } else {
7213                         lock_extent(io_tree, lockstart, lockend, cached_state);
7214                 }
7215                 /*
7216                  * We're concerned with the entire range that we're going to be
7217                  * doing DIO to, so we need to make sure there's no ordered
7218                  * extents in this range.
7219                  */
7220                 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7221                                                      lockend - lockstart + 1);
7222
7223                 /*
7224                  * We need to make sure there are no buffered pages in this
7225                  * range either, we could have raced between the invalidate in
7226                  * generic_file_direct_write and locking the extent.  The
7227                  * invalidate needs to happen so that reads after a write do not
7228                  * get stale data.
7229                  */
7230                 if (!ordered &&
7231                     (!writing || !filemap_range_has_page(inode->i_mapping,
7232                                                          lockstart, lockend)))
7233                         break;
7234
7235                 unlock_extent(io_tree, lockstart, lockend, cached_state);
7236
7237                 if (ordered) {
7238                         if (nowait) {
7239                                 btrfs_put_ordered_extent(ordered);
7240                                 ret = -EAGAIN;
7241                                 break;
7242                         }
7243                         /*
7244                          * If we are doing a DIO read and the ordered extent we
7245                          * found is for a buffered write, we can not wait for it
7246                          * to complete and retry, because if we do so we can
7247                          * deadlock with concurrent buffered writes on page
7248                          * locks. This happens only if our DIO read covers more
7249                          * than one extent map, if at this point has already
7250                          * created an ordered extent for a previous extent map
7251                          * and locked its range in the inode's io tree, and a
7252                          * concurrent write against that previous extent map's
7253                          * range and this range started (we unlock the ranges
7254                          * in the io tree only when the bios complete and
7255                          * buffered writes always lock pages before attempting
7256                          * to lock range in the io tree).
7257                          */
7258                         if (writing ||
7259                             test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7260                                 btrfs_start_ordered_extent(ordered);
7261                         else
7262                                 ret = nowait ? -EAGAIN : -ENOTBLK;
7263                         btrfs_put_ordered_extent(ordered);
7264                 } else {
7265                         /*
7266                          * We could trigger writeback for this range (and wait
7267                          * for it to complete) and then invalidate the pages for
7268                          * this range (through invalidate_inode_pages2_range()),
7269                          * but that can lead us to a deadlock with a concurrent
7270                          * call to readahead (a buffered read or a defrag call
7271                          * triggered a readahead) on a page lock due to an
7272                          * ordered dio extent we created before but did not have
7273                          * yet a corresponding bio submitted (whence it can not
7274                          * complete), which makes readahead wait for that
7275                          * ordered extent to complete while holding a lock on
7276                          * that page.
7277                          */
7278                         ret = nowait ? -EAGAIN : -ENOTBLK;
7279                 }
7280
7281                 if (ret)
7282                         break;
7283
7284                 cond_resched();
7285         }
7286
7287         return ret;
7288 }
7289
7290 /* The callers of this must take lock_extent() */
7291 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7292                                        u64 len, u64 orig_start, u64 block_start,
7293                                        u64 block_len, u64 orig_block_len,
7294                                        u64 ram_bytes, int compress_type,
7295                                        int type)
7296 {
7297         struct extent_map *em;
7298         int ret;
7299
7300         ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7301                type == BTRFS_ORDERED_COMPRESSED ||
7302                type == BTRFS_ORDERED_NOCOW ||
7303                type == BTRFS_ORDERED_REGULAR);
7304
7305         em = alloc_extent_map();
7306         if (!em)
7307                 return ERR_PTR(-ENOMEM);
7308
7309         em->start = start;
7310         em->orig_start = orig_start;
7311         em->len = len;
7312         em->block_len = block_len;
7313         em->block_start = block_start;
7314         em->orig_block_len = orig_block_len;
7315         em->ram_bytes = ram_bytes;
7316         em->generation = -1;
7317         set_bit(EXTENT_FLAG_PINNED, &em->flags);
7318         if (type == BTRFS_ORDERED_PREALLOC) {
7319                 set_bit(EXTENT_FLAG_FILLING, &em->flags);
7320         } else if (type == BTRFS_ORDERED_COMPRESSED) {
7321                 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7322                 em->compress_type = compress_type;
7323         }
7324
7325         ret = btrfs_replace_extent_map_range(inode, em, true);
7326         if (ret) {
7327                 free_extent_map(em);
7328                 return ERR_PTR(ret);
7329         }
7330
7331         /* em got 2 refs now, callers needs to do free_extent_map once. */
7332         return em;
7333 }
7334
7335
7336 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7337                                          struct inode *inode,
7338                                          struct btrfs_dio_data *dio_data,
7339                                          u64 start, u64 *lenp,
7340                                          unsigned int iomap_flags)
7341 {
7342         const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7343         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7344         struct extent_map *em = *map;
7345         int type;
7346         u64 block_start, orig_start, orig_block_len, ram_bytes;
7347         struct btrfs_block_group *bg;
7348         bool can_nocow = false;
7349         bool space_reserved = false;
7350         u64 len = *lenp;
7351         u64 prev_len;
7352         int ret = 0;
7353
7354         /*
7355          * We don't allocate a new extent in the following cases
7356          *
7357          * 1) The inode is marked as NODATACOW. In this case we'll just use the
7358          * existing extent.
7359          * 2) The extent is marked as PREALLOC. We're good to go here and can
7360          * just use the extent.
7361          *
7362          */
7363         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7364             ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7365              em->block_start != EXTENT_MAP_HOLE)) {
7366                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7367                         type = BTRFS_ORDERED_PREALLOC;
7368                 else
7369                         type = BTRFS_ORDERED_NOCOW;
7370                 len = min(len, em->len - (start - em->start));
7371                 block_start = em->block_start + (start - em->start);
7372
7373                 if (can_nocow_extent(inode, start, &len, &orig_start,
7374                                      &orig_block_len, &ram_bytes, false, false) == 1) {
7375                         bg = btrfs_inc_nocow_writers(fs_info, block_start);
7376                         if (bg)
7377                                 can_nocow = true;
7378                 }
7379         }
7380
7381         prev_len = len;
7382         if (can_nocow) {
7383                 struct extent_map *em2;
7384
7385                 /* We can NOCOW, so only need to reserve metadata space. */
7386                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7387                                                       nowait);
7388                 if (ret < 0) {
7389                         /* Our caller expects us to free the input extent map. */
7390                         free_extent_map(em);
7391                         *map = NULL;
7392                         btrfs_dec_nocow_writers(bg);
7393                         if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
7394                                 ret = -EAGAIN;
7395                         goto out;
7396                 }
7397                 space_reserved = true;
7398
7399                 em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
7400                                               orig_start, block_start,
7401                                               len, orig_block_len,
7402                                               ram_bytes, type);
7403                 btrfs_dec_nocow_writers(bg);
7404                 if (type == BTRFS_ORDERED_PREALLOC) {
7405                         free_extent_map(em);
7406                         *map = em2;
7407                         em = em2;
7408                 }
7409
7410                 if (IS_ERR(em2)) {
7411                         ret = PTR_ERR(em2);
7412                         goto out;
7413                 }
7414
7415                 dio_data->nocow_done = true;
7416         } else {
7417                 /* Our caller expects us to free the input extent map. */
7418                 free_extent_map(em);
7419                 *map = NULL;
7420
7421                 if (nowait) {
7422                         ret = -EAGAIN;
7423                         goto out;
7424                 }
7425
7426                 /*
7427                  * If we could not allocate data space before locking the file
7428                  * range and we can't do a NOCOW write, then we have to fail.
7429                  */
7430                 if (!dio_data->data_space_reserved) {
7431                         ret = -ENOSPC;
7432                         goto out;
7433                 }
7434
7435                 /*
7436                  * We have to COW and we have already reserved data space before,
7437                  * so now we reserve only metadata.
7438                  */
7439                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7440                                                       false);
7441                 if (ret < 0)
7442                         goto out;
7443                 space_reserved = true;
7444
7445                 em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
7446                 if (IS_ERR(em)) {
7447                         ret = PTR_ERR(em);
7448                         goto out;
7449                 }
7450                 *map = em;
7451                 len = min(len, em->len - (start - em->start));
7452                 if (len < prev_len)
7453                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
7454                                                         prev_len - len, true);
7455         }
7456
7457         /*
7458          * We have created our ordered extent, so we can now release our reservation
7459          * for an outstanding extent.
7460          */
7461         btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
7462
7463         /*
7464          * Need to update the i_size under the extent lock so buffered
7465          * readers will get the updated i_size when we unlock.
7466          */
7467         if (start + len > i_size_read(inode))
7468                 i_size_write(inode, start + len);
7469 out:
7470         if (ret && space_reserved) {
7471                 btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7472                 btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
7473         }
7474         *lenp = len;
7475         return ret;
7476 }
7477
7478 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7479                 loff_t length, unsigned int flags, struct iomap *iomap,
7480                 struct iomap *srcmap)
7481 {
7482         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7483         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7484         struct extent_map *em;
7485         struct extent_state *cached_state = NULL;
7486         struct btrfs_dio_data *dio_data = iter->private;
7487         u64 lockstart, lockend;
7488         const bool write = !!(flags & IOMAP_WRITE);
7489         int ret = 0;
7490         u64 len = length;
7491         const u64 data_alloc_len = length;
7492         bool unlock_extents = false;
7493
7494         /*
7495          * We could potentially fault if we have a buffer > PAGE_SIZE, and if
7496          * we're NOWAIT we may submit a bio for a partial range and return
7497          * EIOCBQUEUED, which would result in an errant short read.
7498          *
7499          * The best way to handle this would be to allow for partial completions
7500          * of iocb's, so we could submit the partial bio, return and fault in
7501          * the rest of the pages, and then submit the io for the rest of the
7502          * range.  However we don't have that currently, so simply return
7503          * -EAGAIN at this point so that the normal path is used.
7504          */
7505         if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
7506                 return -EAGAIN;
7507
7508         /*
7509          * Cap the size of reads to that usually seen in buffered I/O as we need
7510          * to allocate a contiguous array for the checksums.
7511          */
7512         if (!write)
7513                 len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
7514
7515         lockstart = start;
7516         lockend = start + len - 1;
7517
7518         /*
7519          * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
7520          * enough if we've written compressed pages to this area, so we need to
7521          * flush the dirty pages again to make absolutely sure that any
7522          * outstanding dirty pages are on disk - the first flush only starts
7523          * compression on the data, while keeping the pages locked, so by the
7524          * time the second flush returns we know bios for the compressed pages
7525          * were submitted and finished, and the pages no longer under writeback.
7526          *
7527          * If we have a NOWAIT request and we have any pages in the range that
7528          * are locked, likely due to compression still in progress, we don't want
7529          * to block on page locks. We also don't want to block on pages marked as
7530          * dirty or under writeback (same as for the non-compression case).
7531          * iomap_dio_rw() did the same check, but after that and before we got
7532          * here, mmap'ed writes may have happened or buffered reads started
7533          * (readpage() and readahead(), which lock pages), as we haven't locked
7534          * the file range yet.
7535          */
7536         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7537                      &BTRFS_I(inode)->runtime_flags)) {
7538                 if (flags & IOMAP_NOWAIT) {
7539                         if (filemap_range_needs_writeback(inode->i_mapping,
7540                                                           lockstart, lockend))
7541                                 return -EAGAIN;
7542                 } else {
7543                         ret = filemap_fdatawrite_range(inode->i_mapping, start,
7544                                                        start + length - 1);
7545                         if (ret)
7546                                 return ret;
7547                 }
7548         }
7549
7550         memset(dio_data, 0, sizeof(*dio_data));
7551
7552         /*
7553          * We always try to allocate data space and must do it before locking
7554          * the file range, to avoid deadlocks with concurrent writes to the same
7555          * range if the range has several extents and the writes don't expand the
7556          * current i_size (the inode lock is taken in shared mode). If we fail to
7557          * allocate data space here we continue and later, after locking the
7558          * file range, we fail with ENOSPC only if we figure out we can not do a
7559          * NOCOW write.
7560          */
7561         if (write && !(flags & IOMAP_NOWAIT)) {
7562                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
7563                                                   &dio_data->data_reserved,
7564                                                   start, data_alloc_len, false);
7565                 if (!ret)
7566                         dio_data->data_space_reserved = true;
7567                 else if (ret && !(BTRFS_I(inode)->flags &
7568                                   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
7569                         goto err;
7570         }
7571
7572         /*
7573          * If this errors out it's because we couldn't invalidate pagecache for
7574          * this range and we need to fallback to buffered IO, or we are doing a
7575          * NOWAIT read/write and we need to block.
7576          */
7577         ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
7578         if (ret < 0)
7579                 goto err;
7580
7581         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7582         if (IS_ERR(em)) {
7583                 ret = PTR_ERR(em);
7584                 goto unlock_err;
7585         }
7586
7587         /*
7588          * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7589          * io.  INLINE is special, and we could probably kludge it in here, but
7590          * it's still buffered so for safety lets just fall back to the generic
7591          * buffered path.
7592          *
7593          * For COMPRESSED we _have_ to read the entire extent in so we can
7594          * decompress it, so there will be buffering required no matter what we
7595          * do, so go ahead and fallback to buffered.
7596          *
7597          * We return -ENOTBLK because that's what makes DIO go ahead and go back
7598          * to buffered IO.  Don't blame me, this is the price we pay for using
7599          * the generic code.
7600          */
7601         if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7602             em->block_start == EXTENT_MAP_INLINE) {
7603                 free_extent_map(em);
7604                 /*
7605                  * If we are in a NOWAIT context, return -EAGAIN in order to
7606                  * fallback to buffered IO. This is not only because we can
7607                  * block with buffered IO (no support for NOWAIT semantics at
7608                  * the moment) but also to avoid returning short reads to user
7609                  * space - this happens if we were able to read some data from
7610                  * previous non-compressed extents and then when we fallback to
7611                  * buffered IO, at btrfs_file_read_iter() by calling
7612                  * filemap_read(), we fail to fault in pages for the read buffer,
7613                  * in which case filemap_read() returns a short read (the number
7614                  * of bytes previously read is > 0, so it does not return -EFAULT).
7615                  */
7616                 ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
7617                 goto unlock_err;
7618         }
7619
7620         len = min(len, em->len - (start - em->start));
7621
7622         /*
7623          * If we have a NOWAIT request and the range contains multiple extents
7624          * (or a mix of extents and holes), then we return -EAGAIN to make the
7625          * caller fallback to a context where it can do a blocking (without
7626          * NOWAIT) request. This way we avoid doing partial IO and returning
7627          * success to the caller, which is not optimal for writes and for reads
7628          * it can result in unexpected behaviour for an application.
7629          *
7630          * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
7631          * iomap_dio_rw(), we can end up returning less data then what the caller
7632          * asked for, resulting in an unexpected, and incorrect, short read.
7633          * That is, the caller asked to read N bytes and we return less than that,
7634          * which is wrong unless we are crossing EOF. This happens if we get a
7635          * page fault error when trying to fault in pages for the buffer that is
7636          * associated to the struct iov_iter passed to iomap_dio_rw(), and we
7637          * have previously submitted bios for other extents in the range, in
7638          * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
7639          * those bios have completed by the time we get the page fault error,
7640          * which we return back to our caller - we should only return EIOCBQUEUED
7641          * after we have submitted bios for all the extents in the range.
7642          */
7643         if ((flags & IOMAP_NOWAIT) && len < length) {
7644                 free_extent_map(em);
7645                 ret = -EAGAIN;
7646                 goto unlock_err;
7647         }
7648
7649         if (write) {
7650                 ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7651                                                     start, &len, flags);
7652                 if (ret < 0)
7653                         goto unlock_err;
7654                 unlock_extents = true;
7655                 /* Recalc len in case the new em is smaller than requested */
7656                 len = min(len, em->len - (start - em->start));
7657                 if (dio_data->data_space_reserved) {
7658                         u64 release_offset;
7659                         u64 release_len = 0;
7660
7661                         if (dio_data->nocow_done) {
7662                                 release_offset = start;
7663                                 release_len = data_alloc_len;
7664                         } else if (len < data_alloc_len) {
7665                                 release_offset = start + len;
7666                                 release_len = data_alloc_len - len;
7667                         }
7668
7669                         if (release_len > 0)
7670                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
7671                                                                dio_data->data_reserved,
7672                                                                release_offset,
7673                                                                release_len);
7674                 }
7675         } else {
7676                 /*
7677                  * We need to unlock only the end area that we aren't using.
7678                  * The rest is going to be unlocked by the endio routine.
7679                  */
7680                 lockstart = start + len;
7681                 if (lockstart < lockend)
7682                         unlock_extents = true;
7683         }
7684
7685         if (unlock_extents)
7686                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7687                               &cached_state);
7688         else
7689                 free_extent_state(cached_state);
7690
7691         /*
7692          * Translate extent map information to iomap.
7693          * We trim the extents (and move the addr) even though iomap code does
7694          * that, since we have locked only the parts we are performing I/O in.
7695          */
7696         if ((em->block_start == EXTENT_MAP_HOLE) ||
7697             (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7698                 iomap->addr = IOMAP_NULL_ADDR;
7699                 iomap->type = IOMAP_HOLE;
7700         } else {
7701                 iomap->addr = em->block_start + (start - em->start);
7702                 iomap->type = IOMAP_MAPPED;
7703         }
7704         iomap->offset = start;
7705         iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7706         iomap->length = len;
7707         free_extent_map(em);
7708
7709         return 0;
7710
7711 unlock_err:
7712         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7713                       &cached_state);
7714 err:
7715         if (dio_data->data_space_reserved) {
7716                 btrfs_free_reserved_data_space(BTRFS_I(inode),
7717                                                dio_data->data_reserved,
7718                                                start, data_alloc_len);
7719                 extent_changeset_free(dio_data->data_reserved);
7720         }
7721
7722         return ret;
7723 }
7724
7725 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7726                 ssize_t written, unsigned int flags, struct iomap *iomap)
7727 {
7728         struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7729         struct btrfs_dio_data *dio_data = iter->private;
7730         size_t submitted = dio_data->submitted;
7731         const bool write = !!(flags & IOMAP_WRITE);
7732         int ret = 0;
7733
7734         if (!write && (iomap->type == IOMAP_HOLE)) {
7735                 /* If reading from a hole, unlock and return */
7736                 unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
7737                               NULL);
7738                 return 0;
7739         }
7740
7741         if (submitted < length) {
7742                 pos += submitted;
7743                 length -= submitted;
7744                 if (write)
7745                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7746                                                     pos, length, false);
7747                 else
7748                         unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7749                                       pos + length - 1, NULL);
7750                 ret = -ENOTBLK;
7751         }
7752         if (write) {
7753                 btrfs_put_ordered_extent(dio_data->ordered);
7754                 dio_data->ordered = NULL;
7755         }
7756
7757         if (write)
7758                 extent_changeset_free(dio_data->data_reserved);
7759         return ret;
7760 }
7761
7762 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
7763 {
7764         struct btrfs_dio_private *dip =
7765                 container_of(bbio, struct btrfs_dio_private, bbio);
7766         struct btrfs_inode *inode = bbio->inode;
7767         struct bio *bio = &bbio->bio;
7768
7769         if (bio->bi_status) {
7770                 btrfs_warn(inode->root->fs_info,
7771                 "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
7772                            btrfs_ino(inode), bio->bi_opf,
7773                            dip->file_offset, dip->bytes, bio->bi_status);
7774         }
7775
7776         if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
7777                 btrfs_finish_ordered_extent(bbio->ordered, NULL,
7778                                             dip->file_offset, dip->bytes,
7779                                             !bio->bi_status);
7780         } else {
7781                 unlock_extent(&inode->io_tree, dip->file_offset,
7782                               dip->file_offset + dip->bytes - 1, NULL);
7783         }
7784
7785         bbio->bio.bi_private = bbio->private;
7786         iomap_dio_bio_end_io(bio);
7787 }
7788
7789 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
7790                                 loff_t file_offset)
7791 {
7792         struct btrfs_bio *bbio = btrfs_bio(bio);
7793         struct btrfs_dio_private *dip =
7794                 container_of(bbio, struct btrfs_dio_private, bbio);
7795         struct btrfs_dio_data *dio_data = iter->private;
7796
7797         btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
7798                        btrfs_dio_end_io, bio->bi_private);
7799         bbio->inode = BTRFS_I(iter->inode);
7800         bbio->file_offset = file_offset;
7801
7802         dip->file_offset = file_offset;
7803         dip->bytes = bio->bi_iter.bi_size;
7804
7805         dio_data->submitted += bio->bi_iter.bi_size;
7806
7807         /*
7808          * Check if we are doing a partial write.  If we are, we need to split
7809          * the ordered extent to match the submitted bio.  Hang on to the
7810          * remaining unfinishable ordered_extent in dio_data so that it can be
7811          * cancelled in iomap_end to avoid a deadlock wherein faulting the
7812          * remaining pages is blocked on the outstanding ordered extent.
7813          */
7814         if (iter->flags & IOMAP_WRITE) {
7815                 int ret;
7816
7817                 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
7818                 if (ret) {
7819                         btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7820                                                     file_offset, dip->bytes,
7821                                                     !ret);
7822                         bio->bi_status = errno_to_blk_status(ret);
7823                         iomap_dio_bio_end_io(bio);
7824                         return;
7825                 }
7826         }
7827
7828         btrfs_submit_bio(bbio, 0);
7829 }
7830
7831 static const struct iomap_ops btrfs_dio_iomap_ops = {
7832         .iomap_begin            = btrfs_dio_iomap_begin,
7833         .iomap_end              = btrfs_dio_iomap_end,
7834 };
7835
7836 static const struct iomap_dio_ops btrfs_dio_ops = {
7837         .submit_io              = btrfs_dio_submit_io,
7838         .bio_set                = &btrfs_dio_bioset,
7839 };
7840
7841 ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
7842 {
7843         struct btrfs_dio_data data = { 0 };
7844
7845         return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7846                             IOMAP_DIO_PARTIAL, &data, done_before);
7847 }
7848
7849 struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
7850                                   size_t done_before)
7851 {
7852         struct btrfs_dio_data data = { 0 };
7853
7854         return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7855                             IOMAP_DIO_PARTIAL, &data, done_before);
7856 }
7857
7858 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7859                         u64 start, u64 len)
7860 {
7861         int     ret;
7862
7863         ret = fiemap_prep(inode, fieinfo, start, &len, 0);
7864         if (ret)
7865                 return ret;
7866
7867         /*
7868          * fiemap_prep() called filemap_write_and_wait() for the whole possible
7869          * file range (0 to LLONG_MAX), but that is not enough if we have
7870          * compression enabled. The first filemap_fdatawrite_range() only kicks
7871          * in the compression of data (in an async thread) and will return
7872          * before the compression is done and writeback is started. A second
7873          * filemap_fdatawrite_range() is needed to wait for the compression to
7874          * complete and writeback to start. We also need to wait for ordered
7875          * extents to complete, because our fiemap implementation uses mainly
7876          * file extent items to list the extents, searching for extent maps
7877          * only for file ranges with holes or prealloc extents to figure out
7878          * if we have delalloc in those ranges.
7879          */
7880         if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7881                 ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7882                 if (ret)
7883                         return ret;
7884         }
7885
7886         return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
7887 }
7888
7889 static int btrfs_writepages(struct address_space *mapping,
7890                             struct writeback_control *wbc)
7891 {
7892         return extent_writepages(mapping, wbc);
7893 }
7894
7895 static void btrfs_readahead(struct readahead_control *rac)
7896 {
7897         extent_readahead(rac);
7898 }
7899
7900 /*
7901  * For release_folio() and invalidate_folio() we have a race window where
7902  * folio_end_writeback() is called but the subpage spinlock is not yet released.
7903  * If we continue to release/invalidate the page, we could cause use-after-free
7904  * for subpage spinlock.  So this function is to spin and wait for subpage
7905  * spinlock.
7906  */
7907 static void wait_subpage_spinlock(struct page *page)
7908 {
7909         struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7910         struct btrfs_subpage *subpage;
7911
7912         if (!btrfs_is_subpage(fs_info, page))
7913                 return;
7914
7915         ASSERT(PagePrivate(page) && page->private);
7916         subpage = (struct btrfs_subpage *)page->private;
7917
7918         /*
7919          * This may look insane as we just acquire the spinlock and release it,
7920          * without doing anything.  But we just want to make sure no one is
7921          * still holding the subpage spinlock.
7922          * And since the page is not dirty nor writeback, and we have page
7923          * locked, the only possible way to hold a spinlock is from the endio
7924          * function to clear page writeback.
7925          *
7926          * Here we just acquire the spinlock so that all existing callers
7927          * should exit and we're safe to release/invalidate the page.
7928          */
7929         spin_lock_irq(&subpage->lock);
7930         spin_unlock_irq(&subpage->lock);
7931 }
7932
7933 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7934 {
7935         int ret = try_release_extent_mapping(&folio->page, gfp_flags);
7936
7937         if (ret == 1) {
7938                 wait_subpage_spinlock(&folio->page);
7939                 clear_page_extent_mapped(&folio->page);
7940         }
7941         return ret;
7942 }
7943
7944 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7945 {
7946         if (folio_test_writeback(folio) || folio_test_dirty(folio))
7947                 return false;
7948         return __btrfs_release_folio(folio, gfp_flags);
7949 }
7950
7951 #ifdef CONFIG_MIGRATION
7952 static int btrfs_migrate_folio(struct address_space *mapping,
7953                              struct folio *dst, struct folio *src,
7954                              enum migrate_mode mode)
7955 {
7956         int ret = filemap_migrate_folio(mapping, dst, src, mode);
7957
7958         if (ret != MIGRATEPAGE_SUCCESS)
7959                 return ret;
7960
7961         if (folio_test_ordered(src)) {
7962                 folio_clear_ordered(src);
7963                 folio_set_ordered(dst);
7964         }
7965
7966         return MIGRATEPAGE_SUCCESS;
7967 }
7968 #else
7969 #define btrfs_migrate_folio NULL
7970 #endif
7971
7972 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7973                                  size_t length)
7974 {
7975         struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
7976         struct btrfs_fs_info *fs_info = inode->root->fs_info;
7977         struct extent_io_tree *tree = &inode->io_tree;
7978         struct extent_state *cached_state = NULL;
7979         u64 page_start = folio_pos(folio);
7980         u64 page_end = page_start + folio_size(folio) - 1;
7981         u64 cur;
7982         int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7983
7984         /*
7985          * We have folio locked so no new ordered extent can be created on this
7986          * page, nor bio can be submitted for this folio.
7987          *
7988          * But already submitted bio can still be finished on this folio.
7989          * Furthermore, endio function won't skip folio which has Ordered
7990          * (Private2) already cleared, so it's possible for endio and
7991          * invalidate_folio to do the same ordered extent accounting twice
7992          * on one folio.
7993          *
7994          * So here we wait for any submitted bios to finish, so that we won't
7995          * do double ordered extent accounting on the same folio.
7996          */
7997         folio_wait_writeback(folio);
7998         wait_subpage_spinlock(&folio->page);
7999
8000         /*
8001          * For subpage case, we have call sites like
8002          * btrfs_punch_hole_lock_range() which passes range not aligned to
8003          * sectorsize.
8004          * If the range doesn't cover the full folio, we don't need to and
8005          * shouldn't clear page extent mapped, as folio->private can still
8006          * record subpage dirty bits for other part of the range.
8007          *
8008          * For cases that invalidate the full folio even the range doesn't
8009          * cover the full folio, like invalidating the last folio, we're
8010          * still safe to wait for ordered extent to finish.
8011          */
8012         if (!(offset == 0 && length == folio_size(folio))) {
8013                 btrfs_release_folio(folio, GFP_NOFS);
8014                 return;
8015         }
8016
8017         if (!inode_evicting)
8018                 lock_extent(tree, page_start, page_end, &cached_state);
8019
8020         cur = page_start;
8021         while (cur < page_end) {
8022                 struct btrfs_ordered_extent *ordered;
8023                 u64 range_end;
8024                 u32 range_len;
8025                 u32 extra_flags = 0;
8026
8027                 ordered = btrfs_lookup_first_ordered_range(inode, cur,
8028                                                            page_end + 1 - cur);
8029                 if (!ordered) {
8030                         range_end = page_end;
8031                         /*
8032                          * No ordered extent covering this range, we are safe
8033                          * to delete all extent states in the range.
8034                          */
8035                         extra_flags = EXTENT_CLEAR_ALL_BITS;
8036                         goto next;
8037                 }
8038                 if (ordered->file_offset > cur) {
8039                         /*
8040                          * There is a range between [cur, oe->file_offset) not
8041                          * covered by any ordered extent.
8042                          * We are safe to delete all extent states, and handle
8043                          * the ordered extent in the next iteration.
8044                          */
8045                         range_end = ordered->file_offset - 1;
8046                         extra_flags = EXTENT_CLEAR_ALL_BITS;
8047                         goto next;
8048                 }
8049
8050                 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8051                                 page_end);
8052                 ASSERT(range_end + 1 - cur < U32_MAX);
8053                 range_len = range_end + 1 - cur;
8054                 if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
8055                         /*
8056                          * If Ordered (Private2) is cleared, it means endio has
8057                          * already been executed for the range.
8058                          * We can't delete the extent states as
8059                          * btrfs_finish_ordered_io() may still use some of them.
8060                          */
8061                         goto next;
8062                 }
8063                 btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
8064
8065                 /*
8066                  * IO on this page will never be started, so we need to account
8067                  * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8068                  * here, must leave that up for the ordered extent completion.
8069                  *
8070                  * This will also unlock the range for incoming
8071                  * btrfs_finish_ordered_io().
8072                  */
8073                 if (!inode_evicting)
8074                         clear_extent_bit(tree, cur, range_end,
8075                                          EXTENT_DELALLOC |
8076                                          EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8077                                          EXTENT_DEFRAG, &cached_state);
8078
8079                 spin_lock_irq(&inode->ordered_tree.lock);
8080                 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8081                 ordered->truncated_len = min(ordered->truncated_len,
8082                                              cur - ordered->file_offset);
8083                 spin_unlock_irq(&inode->ordered_tree.lock);
8084
8085                 /*
8086                  * If the ordered extent has finished, we're safe to delete all
8087                  * the extent states of the range, otherwise
8088                  * btrfs_finish_ordered_io() will get executed by endio for
8089                  * other pages, so we can't delete extent states.
8090                  */
8091                 if (btrfs_dec_test_ordered_pending(inode, &ordered,
8092                                                    cur, range_end + 1 - cur)) {
8093                         btrfs_finish_ordered_io(ordered);
8094                         /*
8095                          * The ordered extent has finished, now we're again
8096                          * safe to delete all extent states of the range.
8097                          */
8098                         extra_flags = EXTENT_CLEAR_ALL_BITS;
8099                 }
8100 next:
8101                 if (ordered)
8102                         btrfs_put_ordered_extent(ordered);
8103                 /*
8104                  * Qgroup reserved space handler
8105                  * Sector(s) here will be either:
8106                  *
8107                  * 1) Already written to disk or bio already finished
8108                  *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8109                  *    Qgroup will be handled by its qgroup_record then.
8110                  *    btrfs_qgroup_free_data() call will do nothing here.
8111                  *
8112                  * 2) Not written to disk yet
8113                  *    Then btrfs_qgroup_free_data() call will clear the
8114                  *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8115                  *    reserved data space.
8116                  *    Since the IO will never happen for this page.
8117                  */
8118                 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
8119                 if (!inode_evicting) {
8120                         clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8121                                  EXTENT_DELALLOC | EXTENT_UPTODATE |
8122                                  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
8123                                  extra_flags, &cached_state);
8124                 }
8125                 cur = range_end + 1;
8126         }
8127         /*
8128          * We have iterated through all ordered extents of the page, the page
8129          * should not have Ordered (Private2) anymore, or the above iteration
8130          * did something wrong.
8131          */
8132         ASSERT(!folio_test_ordered(folio));
8133         btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
8134         if (!inode_evicting)
8135                 __btrfs_release_folio(folio, GFP_NOFS);
8136         clear_page_extent_mapped(&folio->page);
8137 }
8138
8139 /*
8140  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8141  * called from a page fault handler when a page is first dirtied. Hence we must
8142  * be careful to check for EOF conditions here. We set the page up correctly
8143  * for a written page which means we get ENOSPC checking when writing into
8144  * holes and correct delalloc and unwritten extent mapping on filesystems that
8145  * support these features.
8146  *
8147  * We are not allowed to take the i_mutex here so we have to play games to
8148  * protect against truncate races as the page could now be beyond EOF.  Because
8149  * truncate_setsize() writes the inode size before removing pages, once we have
8150  * the page lock we can determine safely if the page is beyond EOF. If it is not
8151  * beyond EOF, then the page is guaranteed safe against truncation until we
8152  * unlock the page.
8153  */
8154 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8155 {
8156         struct page *page = vmf->page;
8157         struct inode *inode = file_inode(vmf->vma->vm_file);
8158         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8159         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8160         struct btrfs_ordered_extent *ordered;
8161         struct extent_state *cached_state = NULL;
8162         struct extent_changeset *data_reserved = NULL;
8163         unsigned long zero_start;
8164         loff_t size;
8165         vm_fault_t ret;
8166         int ret2;
8167         int reserved = 0;
8168         u64 reserved_space;
8169         u64 page_start;
8170         u64 page_end;
8171         u64 end;
8172
8173         reserved_space = PAGE_SIZE;
8174
8175         sb_start_pagefault(inode->i_sb);
8176         page_start = page_offset(page);
8177         page_end = page_start + PAGE_SIZE - 1;
8178         end = page_end;
8179
8180         /*
8181          * Reserving delalloc space after obtaining the page lock can lead to
8182          * deadlock. For example, if a dirty page is locked by this function
8183          * and the call to btrfs_delalloc_reserve_space() ends up triggering
8184          * dirty page write out, then the btrfs_writepages() function could
8185          * end up waiting indefinitely to get a lock on the page currently
8186          * being processed by btrfs_page_mkwrite() function.
8187          */
8188         ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8189                                             page_start, reserved_space);
8190         if (!ret2) {
8191                 ret2 = file_update_time(vmf->vma->vm_file);
8192                 reserved = 1;
8193         }
8194         if (ret2) {
8195                 ret = vmf_error(ret2);
8196                 if (reserved)
8197                         goto out;
8198                 goto out_noreserve;
8199         }
8200
8201         ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8202 again:
8203         down_read(&BTRFS_I(inode)->i_mmap_lock);
8204         lock_page(page);
8205         size = i_size_read(inode);
8206
8207         if ((page->mapping != inode->i_mapping) ||
8208             (page_start >= size)) {
8209                 /* page got truncated out from underneath us */
8210                 goto out_unlock;
8211         }
8212         wait_on_page_writeback(page);
8213
8214         lock_extent(io_tree, page_start, page_end, &cached_state);
8215         ret2 = set_page_extent_mapped(page);
8216         if (ret2 < 0) {
8217                 ret = vmf_error(ret2);
8218                 unlock_extent(io_tree, page_start, page_end, &cached_state);
8219                 goto out_unlock;
8220         }
8221
8222         /*
8223          * we can't set the delalloc bits if there are pending ordered
8224          * extents.  Drop our locks and wait for them to finish
8225          */
8226         ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8227                         PAGE_SIZE);
8228         if (ordered) {
8229                 unlock_extent(io_tree, page_start, page_end, &cached_state);
8230                 unlock_page(page);
8231                 up_read(&BTRFS_I(inode)->i_mmap_lock);
8232                 btrfs_start_ordered_extent(ordered);
8233                 btrfs_put_ordered_extent(ordered);
8234                 goto again;
8235         }
8236
8237         if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8238                 reserved_space = round_up(size - page_start,
8239                                           fs_info->sectorsize);
8240                 if (reserved_space < PAGE_SIZE) {
8241                         end = page_start + reserved_space - 1;
8242                         btrfs_delalloc_release_space(BTRFS_I(inode),
8243                                         data_reserved, page_start,
8244                                         PAGE_SIZE - reserved_space, true);
8245                 }
8246         }
8247
8248         /*
8249          * page_mkwrite gets called when the page is firstly dirtied after it's
8250          * faulted in, but write(2) could also dirty a page and set delalloc
8251          * bits, thus in this case for space account reason, we still need to
8252          * clear any delalloc bits within this page range since we have to
8253          * reserve data&meta space before lock_page() (see above comments).
8254          */
8255         clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8256                           EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8257                           EXTENT_DEFRAG, &cached_state);
8258
8259         ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8260                                         &cached_state);
8261         if (ret2) {
8262                 unlock_extent(io_tree, page_start, page_end, &cached_state);
8263                 ret = VM_FAULT_SIGBUS;
8264                 goto out_unlock;
8265         }
8266
8267         /* page is wholly or partially inside EOF */
8268         if (page_start + PAGE_SIZE > size)
8269                 zero_start = offset_in_page(size);
8270         else
8271                 zero_start = PAGE_SIZE;
8272
8273         if (zero_start != PAGE_SIZE)
8274                 memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8275
8276         btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8277         btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8278         btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8279
8280         btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8281
8282         unlock_extent(io_tree, page_start, page_end, &cached_state);
8283         up_read(&BTRFS_I(inode)->i_mmap_lock);
8284
8285         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8286         sb_end_pagefault(inode->i_sb);
8287         extent_changeset_free(data_reserved);
8288         return VM_FAULT_LOCKED;
8289
8290 out_unlock:
8291         unlock_page(page);
8292         up_read(&BTRFS_I(inode)->i_mmap_lock);
8293 out:
8294         btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8295         btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8296                                      reserved_space, (ret != 0));
8297 out_noreserve:
8298         sb_end_pagefault(inode->i_sb);
8299         extent_changeset_free(data_reserved);
8300         return ret;
8301 }
8302
8303 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
8304 {
8305         struct btrfs_truncate_control control = {
8306                 .inode = inode,
8307                 .ino = btrfs_ino(inode),
8308                 .min_type = BTRFS_EXTENT_DATA_KEY,
8309                 .clear_extent_range = true,
8310         };
8311         struct btrfs_root *root = inode->root;
8312         struct btrfs_fs_info *fs_info = root->fs_info;
8313         struct btrfs_block_rsv *rsv;
8314         int ret;
8315         struct btrfs_trans_handle *trans;
8316         u64 mask = fs_info->sectorsize - 1;
8317         const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8318
8319         if (!skip_writeback) {
8320                 ret = btrfs_wait_ordered_range(&inode->vfs_inode,
8321                                                inode->vfs_inode.i_size & (~mask),
8322                                                (u64)-1);
8323                 if (ret)
8324                         return ret;
8325         }
8326
8327         /*
8328          * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8329          * things going on here:
8330          *
8331          * 1) We need to reserve space to update our inode.
8332          *
8333          * 2) We need to have something to cache all the space that is going to
8334          * be free'd up by the truncate operation, but also have some slack
8335          * space reserved in case it uses space during the truncate (thank you
8336          * very much snapshotting).
8337          *
8338          * And we need these to be separate.  The fact is we can use a lot of
8339          * space doing the truncate, and we have no earthly idea how much space
8340          * we will use, so we need the truncate reservation to be separate so it
8341          * doesn't end up using space reserved for updating the inode.  We also
8342          * need to be able to stop the transaction and start a new one, which
8343          * means we need to be able to update the inode several times, and we
8344          * have no idea of knowing how many times that will be, so we can't just
8345          * reserve 1 item for the entirety of the operation, so that has to be
8346          * done separately as well.
8347          *
8348          * So that leaves us with
8349          *
8350          * 1) rsv - for the truncate reservation, which we will steal from the
8351          * transaction reservation.
8352          * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8353          * updating the inode.
8354          */
8355         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8356         if (!rsv)
8357                 return -ENOMEM;
8358         rsv->size = min_size;
8359         rsv->failfast = true;
8360
8361         /*
8362          * 1 for the truncate slack space
8363          * 1 for updating the inode.
8364          */
8365         trans = btrfs_start_transaction(root, 2);
8366         if (IS_ERR(trans)) {
8367                 ret = PTR_ERR(trans);
8368                 goto out;
8369         }
8370
8371         /* Migrate the slack space for the truncate to our reserve */
8372         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8373                                       min_size, false);
8374         /*
8375          * We have reserved 2 metadata units when we started the transaction and
8376          * min_size matches 1 unit, so this should never fail, but if it does,
8377          * it's not critical we just fail truncation.
8378          */
8379         if (WARN_ON(ret)) {
8380                 btrfs_end_transaction(trans);
8381                 goto out;
8382         }
8383
8384         trans->block_rsv = rsv;
8385
8386         while (1) {
8387                 struct extent_state *cached_state = NULL;
8388                 const u64 new_size = inode->vfs_inode.i_size;
8389                 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
8390
8391                 control.new_size = new_size;
8392                 lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8393                 /*
8394                  * We want to drop from the next block forward in case this new
8395                  * size is not block aligned since we will be keeping the last
8396                  * block of the extent just the way it is.
8397                  */
8398                 btrfs_drop_extent_map_range(inode,
8399                                             ALIGN(new_size, fs_info->sectorsize),
8400                                             (u64)-1, false);
8401
8402                 ret = btrfs_truncate_inode_items(trans, root, &control);
8403
8404                 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
8405                 btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
8406
8407                 unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8408
8409                 trans->block_rsv = &fs_info->trans_block_rsv;
8410                 if (ret != -ENOSPC && ret != -EAGAIN)
8411                         break;
8412
8413                 ret = btrfs_update_inode(trans, root, inode);
8414                 if (ret)
8415                         break;
8416
8417                 btrfs_end_transaction(trans);
8418                 btrfs_btree_balance_dirty(fs_info);
8419
8420                 trans = btrfs_start_transaction(root, 2);
8421                 if (IS_ERR(trans)) {
8422                         ret = PTR_ERR(trans);
8423                         trans = NULL;
8424                         break;
8425                 }
8426
8427                 btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8428                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8429                                               rsv, min_size, false);
8430                 /*
8431                  * We have reserved 2 metadata units when we started the
8432                  * transaction and min_size matches 1 unit, so this should never
8433                  * fail, but if it does, it's not critical we just fail truncation.
8434                  */
8435                 if (WARN_ON(ret))
8436                         break;
8437
8438                 trans->block_rsv = rsv;
8439         }
8440
8441         /*
8442          * We can't call btrfs_truncate_block inside a trans handle as we could
8443          * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
8444          * know we've truncated everything except the last little bit, and can
8445          * do btrfs_truncate_block and then update the disk_i_size.
8446          */
8447         if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8448                 btrfs_end_transaction(trans);
8449                 btrfs_btree_balance_dirty(fs_info);
8450
8451                 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
8452                 if (ret)
8453                         goto out;
8454                 trans = btrfs_start_transaction(root, 1);
8455                 if (IS_ERR(trans)) {
8456                         ret = PTR_ERR(trans);
8457                         goto out;
8458                 }
8459                 btrfs_inode_safe_disk_i_size_write(inode, 0);
8460         }
8461
8462         if (trans) {
8463                 int ret2;
8464
8465                 trans->block_rsv = &fs_info->trans_block_rsv;
8466                 ret2 = btrfs_update_inode(trans, root, inode);
8467                 if (ret2 && !ret)
8468                         ret = ret2;
8469
8470                 ret2 = btrfs_end_transaction(trans);
8471                 if (ret2 && !ret)
8472                         ret = ret2;
8473                 btrfs_btree_balance_dirty(fs_info);
8474         }
8475 out:
8476         btrfs_free_block_rsv(fs_info, rsv);
8477         /*
8478          * So if we truncate and then write and fsync we normally would just
8479          * write the extents that changed, which is a problem if we need to
8480          * first truncate that entire inode.  So set this flag so we write out
8481          * all of the extents in the inode to the sync log so we're completely
8482          * safe.
8483          *
8484          * If no extents were dropped or trimmed we don't need to force the next
8485          * fsync to truncate all the inode's items from the log and re-log them
8486          * all. This means the truncate operation did not change the file size,
8487          * or changed it to a smaller size but there was only an implicit hole
8488          * between the old i_size and the new i_size, and there were no prealloc
8489          * extents beyond i_size to drop.
8490          */
8491         if (control.extents_found > 0)
8492                 btrfs_set_inode_full_sync(inode);
8493
8494         return ret;
8495 }
8496
8497 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
8498                                      struct inode *dir)
8499 {
8500         struct inode *inode;
8501
8502         inode = new_inode(dir->i_sb);
8503         if (inode) {
8504                 /*
8505                  * Subvolumes don't inherit the sgid bit or the parent's gid if
8506                  * the parent's sgid bit is set. This is probably a bug.
8507                  */
8508                 inode_init_owner(idmap, inode, NULL,
8509                                  S_IFDIR | (~current_umask() & S_IRWXUGO));
8510                 inode->i_op = &btrfs_dir_inode_operations;
8511                 inode->i_fop = &btrfs_dir_file_operations;
8512         }
8513         return inode;
8514 }
8515
8516 struct inode *btrfs_alloc_inode(struct super_block *sb)
8517 {
8518         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
8519         struct btrfs_inode *ei;
8520         struct inode *inode;
8521
8522         ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
8523         if (!ei)
8524                 return NULL;
8525
8526         ei->root = NULL;
8527         ei->generation = 0;
8528         ei->last_trans = 0;
8529         ei->last_sub_trans = 0;
8530         ei->logged_trans = 0;
8531         ei->delalloc_bytes = 0;
8532         ei->new_delalloc_bytes = 0;
8533         ei->defrag_bytes = 0;
8534         ei->disk_i_size = 0;
8535         ei->flags = 0;
8536         ei->ro_flags = 0;
8537         ei->csum_bytes = 0;
8538         ei->index_cnt = (u64)-1;
8539         ei->dir_index = 0;
8540         ei->last_unlink_trans = 0;
8541         ei->last_reflink_trans = 0;
8542         ei->last_log_commit = 0;
8543
8544         spin_lock_init(&ei->lock);
8545         ei->outstanding_extents = 0;
8546         if (sb->s_magic != BTRFS_TEST_MAGIC)
8547                 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
8548                                               BTRFS_BLOCK_RSV_DELALLOC);
8549         ei->runtime_flags = 0;
8550         ei->prop_compress = BTRFS_COMPRESS_NONE;
8551         ei->defrag_compress = BTRFS_COMPRESS_NONE;
8552
8553         ei->delayed_node = NULL;
8554
8555         ei->i_otime.tv_sec = 0;
8556         ei->i_otime.tv_nsec = 0;
8557
8558         inode = &ei->vfs_inode;
8559         extent_map_tree_init(&ei->extent_tree);
8560         extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
8561         ei->io_tree.inode = ei;
8562         extent_io_tree_init(fs_info, &ei->file_extent_tree,
8563                             IO_TREE_INODE_FILE_EXTENT);
8564         mutex_init(&ei->log_mutex);
8565         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8566         INIT_LIST_HEAD(&ei->delalloc_inodes);
8567         INIT_LIST_HEAD(&ei->delayed_iput);
8568         RB_CLEAR_NODE(&ei->rb_node);
8569         init_rwsem(&ei->i_mmap_lock);
8570
8571         return inode;
8572 }
8573
8574 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8575 void btrfs_test_destroy_inode(struct inode *inode)
8576 {
8577         btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
8578         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8579 }
8580 #endif
8581
8582 void btrfs_free_inode(struct inode *inode)
8583 {
8584         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8585 }
8586
8587 void btrfs_destroy_inode(struct inode *vfs_inode)
8588 {
8589         struct btrfs_ordered_extent *ordered;
8590         struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8591         struct btrfs_root *root = inode->root;
8592         bool freespace_inode;
8593
8594         WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8595         WARN_ON(vfs_inode->i_data.nrpages);
8596         WARN_ON(inode->block_rsv.reserved);
8597         WARN_ON(inode->block_rsv.size);
8598         WARN_ON(inode->outstanding_extents);
8599         if (!S_ISDIR(vfs_inode->i_mode)) {
8600                 WARN_ON(inode->delalloc_bytes);
8601                 WARN_ON(inode->new_delalloc_bytes);
8602         }
8603         WARN_ON(inode->csum_bytes);
8604         WARN_ON(inode->defrag_bytes);
8605
8606         /*
8607          * This can happen where we create an inode, but somebody else also
8608          * created the same inode and we need to destroy the one we already
8609          * created.
8610          */
8611         if (!root)
8612                 return;
8613
8614         /*
8615          * If this is a free space inode do not take the ordered extents lockdep
8616          * map.
8617          */
8618         freespace_inode = btrfs_is_free_space_inode(inode);
8619
8620         while (1) {
8621                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8622                 if (!ordered)
8623                         break;
8624                 else {
8625                         btrfs_err(root->fs_info,
8626                                   "found ordered extent %llu %llu on inode cleanup",
8627                                   ordered->file_offset, ordered->num_bytes);
8628
8629                         if (!freespace_inode)
8630                                 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
8631
8632                         btrfs_remove_ordered_extent(inode, ordered);
8633                         btrfs_put_ordered_extent(ordered);
8634                         btrfs_put_ordered_extent(ordered);
8635                 }
8636         }
8637         btrfs_qgroup_check_reserved_leak(inode);
8638         inode_tree_del(inode);
8639         btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
8640         btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8641         btrfs_put_root(inode->root);
8642 }
8643
8644 int btrfs_drop_inode(struct inode *inode)
8645 {
8646         struct btrfs_root *root = BTRFS_I(inode)->root;
8647
8648         if (root == NULL)
8649                 return 1;
8650
8651         /* the snap/subvol tree is on deleting */
8652         if (btrfs_root_refs(&root->root_item) == 0)
8653                 return 1;
8654         else
8655                 return generic_drop_inode(inode);
8656 }
8657
8658 static void init_once(void *foo)
8659 {
8660         struct btrfs_inode *ei = foo;
8661
8662         inode_init_once(&ei->vfs_inode);
8663 }
8664
8665 void __cold btrfs_destroy_cachep(void)
8666 {
8667         /*
8668          * Make sure all delayed rcu free inodes are flushed before we
8669          * destroy cache.
8670          */
8671         rcu_barrier();
8672         bioset_exit(&btrfs_dio_bioset);
8673         kmem_cache_destroy(btrfs_inode_cachep);
8674 }
8675
8676 int __init btrfs_init_cachep(void)
8677 {
8678         btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8679                         sizeof(struct btrfs_inode), 0,
8680                         SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
8681                         init_once);
8682         if (!btrfs_inode_cachep)
8683                 goto fail;
8684
8685         if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
8686                         offsetof(struct btrfs_dio_private, bbio.bio),
8687                         BIOSET_NEED_BVECS))
8688                 goto fail;
8689
8690         return 0;
8691 fail:
8692         btrfs_destroy_cachep();
8693         return -ENOMEM;
8694 }
8695
8696 static int btrfs_getattr(struct mnt_idmap *idmap,
8697                          const struct path *path, struct kstat *stat,
8698                          u32 request_mask, unsigned int flags)
8699 {
8700         u64 delalloc_bytes;
8701         u64 inode_bytes;
8702         struct inode *inode = d_inode(path->dentry);
8703         u32 blocksize = inode->i_sb->s_blocksize;
8704         u32 bi_flags = BTRFS_I(inode)->flags;
8705         u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8706
8707         stat->result_mask |= STATX_BTIME;
8708         stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
8709         stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
8710         if (bi_flags & BTRFS_INODE_APPEND)
8711                 stat->attributes |= STATX_ATTR_APPEND;
8712         if (bi_flags & BTRFS_INODE_COMPRESS)
8713                 stat->attributes |= STATX_ATTR_COMPRESSED;
8714         if (bi_flags & BTRFS_INODE_IMMUTABLE)
8715                 stat->attributes |= STATX_ATTR_IMMUTABLE;
8716         if (bi_flags & BTRFS_INODE_NODUMP)
8717                 stat->attributes |= STATX_ATTR_NODUMP;
8718         if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8719                 stat->attributes |= STATX_ATTR_VERITY;
8720
8721         stat->attributes_mask |= (STATX_ATTR_APPEND |
8722                                   STATX_ATTR_COMPRESSED |
8723                                   STATX_ATTR_IMMUTABLE |
8724                                   STATX_ATTR_NODUMP);
8725
8726         generic_fillattr(idmap, inode, stat);
8727         stat->dev = BTRFS_I(inode)->root->anon_dev;
8728
8729         spin_lock(&BTRFS_I(inode)->lock);
8730         delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8731         inode_bytes = inode_get_bytes(inode);
8732         spin_unlock(&BTRFS_I(inode)->lock);
8733         stat->blocks = (ALIGN(inode_bytes, blocksize) +
8734                         ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8735         return 0;
8736 }
8737
8738 static int btrfs_rename_exchange(struct inode *old_dir,
8739                               struct dentry *old_dentry,
8740                               struct inode *new_dir,
8741                               struct dentry *new_dentry)
8742 {
8743         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8744         struct btrfs_trans_handle *trans;
8745         unsigned int trans_num_items;
8746         struct btrfs_root *root = BTRFS_I(old_dir)->root;
8747         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8748         struct inode *new_inode = new_dentry->d_inode;
8749         struct inode *old_inode = old_dentry->d_inode;
8750         struct timespec64 ctime = current_time(old_inode);
8751         struct btrfs_rename_ctx old_rename_ctx;
8752         struct btrfs_rename_ctx new_rename_ctx;
8753         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8754         u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8755         u64 old_idx = 0;
8756         u64 new_idx = 0;
8757         int ret;
8758         int ret2;
8759         bool need_abort = false;
8760         struct fscrypt_name old_fname, new_fname;
8761         struct fscrypt_str *old_name, *new_name;
8762
8763         /*
8764          * For non-subvolumes allow exchange only within one subvolume, in the
8765          * same inode namespace. Two subvolumes (represented as directory) can
8766          * be exchanged as they're a logical link and have a fixed inode number.
8767          */
8768         if (root != dest &&
8769             (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8770              new_ino != BTRFS_FIRST_FREE_OBJECTID))
8771                 return -EXDEV;
8772
8773         ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8774         if (ret)
8775                 return ret;
8776
8777         ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8778         if (ret) {
8779                 fscrypt_free_filename(&old_fname);
8780                 return ret;
8781         }
8782
8783         old_name = &old_fname.disk_name;
8784         new_name = &new_fname.disk_name;
8785
8786         /* close the race window with snapshot create/destroy ioctl */
8787         if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8788             new_ino == BTRFS_FIRST_FREE_OBJECTID)
8789                 down_read(&fs_info->subvol_sem);
8790
8791         /*
8792          * For each inode:
8793          * 1 to remove old dir item
8794          * 1 to remove old dir index
8795          * 1 to add new dir item
8796          * 1 to add new dir index
8797          * 1 to update parent inode
8798          *
8799          * If the parents are the same, we only need to account for one
8800          */
8801         trans_num_items = (old_dir == new_dir ? 9 : 10);
8802         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8803                 /*
8804                  * 1 to remove old root ref
8805                  * 1 to remove old root backref
8806                  * 1 to add new root ref
8807                  * 1 to add new root backref
8808                  */
8809                 trans_num_items += 4;
8810         } else {
8811                 /*
8812                  * 1 to update inode item
8813                  * 1 to remove old inode ref
8814                  * 1 to add new inode ref
8815                  */
8816                 trans_num_items += 3;
8817         }
8818         if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8819                 trans_num_items += 4;
8820         else
8821                 trans_num_items += 3;
8822         trans = btrfs_start_transaction(root, trans_num_items);
8823         if (IS_ERR(trans)) {
8824                 ret = PTR_ERR(trans);
8825                 goto out_notrans;
8826         }
8827
8828         if (dest != root) {
8829                 ret = btrfs_record_root_in_trans(trans, dest);
8830                 if (ret)
8831                         goto out_fail;
8832         }
8833
8834         /*
8835          * We need to find a free sequence number both in the source and
8836          * in the destination directory for the exchange.
8837          */
8838         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8839         if (ret)
8840                 goto out_fail;
8841         ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8842         if (ret)
8843                 goto out_fail;
8844
8845         BTRFS_I(old_inode)->dir_index = 0ULL;
8846         BTRFS_I(new_inode)->dir_index = 0ULL;
8847
8848         /* Reference for the source. */
8849         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8850                 /* force full log commit if subvolume involved. */
8851                 btrfs_set_log_full_commit(trans);
8852         } else {
8853                 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8854                                              btrfs_ino(BTRFS_I(new_dir)),
8855                                              old_idx);
8856                 if (ret)
8857                         goto out_fail;
8858                 need_abort = true;
8859         }
8860
8861         /* And now for the dest. */
8862         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8863                 /* force full log commit if subvolume involved. */
8864                 btrfs_set_log_full_commit(trans);
8865         } else {
8866                 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8867                                              btrfs_ino(BTRFS_I(old_dir)),
8868                                              new_idx);
8869                 if (ret) {
8870                         if (need_abort)
8871                                 btrfs_abort_transaction(trans, ret);
8872                         goto out_fail;
8873                 }
8874         }
8875
8876         /* Update inode version and ctime/mtime. */
8877         inode_inc_iversion(old_dir);
8878         inode_inc_iversion(new_dir);
8879         inode_inc_iversion(old_inode);
8880         inode_inc_iversion(new_inode);
8881         old_dir->i_mtime = ctime;
8882         old_dir->i_ctime = ctime;
8883         new_dir->i_mtime = ctime;
8884         new_dir->i_ctime = ctime;
8885         old_inode->i_ctime = ctime;
8886         new_inode->i_ctime = ctime;
8887
8888         if (old_dentry->d_parent != new_dentry->d_parent) {
8889                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8890                                         BTRFS_I(old_inode), true);
8891                 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8892                                         BTRFS_I(new_inode), true);
8893         }
8894
8895         /* src is a subvolume */
8896         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8897                 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8898         } else { /* src is an inode */
8899                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8900                                            BTRFS_I(old_dentry->d_inode),
8901                                            old_name, &old_rename_ctx);
8902                 if (!ret)
8903                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
8904         }
8905         if (ret) {
8906                 btrfs_abort_transaction(trans, ret);
8907                 goto out_fail;
8908         }
8909
8910         /* dest is a subvolume */
8911         if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8912                 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8913         } else { /* dest is an inode */
8914                 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8915                                            BTRFS_I(new_dentry->d_inode),
8916                                            new_name, &new_rename_ctx);
8917                 if (!ret)
8918                         ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
8919         }
8920         if (ret) {
8921                 btrfs_abort_transaction(trans, ret);
8922                 goto out_fail;
8923         }
8924
8925         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8926                              new_name, 0, old_idx);
8927         if (ret) {
8928                 btrfs_abort_transaction(trans, ret);
8929                 goto out_fail;
8930         }
8931
8932         ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8933                              old_name, 0, new_idx);
8934         if (ret) {
8935                 btrfs_abort_transaction(trans, ret);
8936                 goto out_fail;
8937         }
8938
8939         if (old_inode->i_nlink == 1)
8940                 BTRFS_I(old_inode)->dir_index = old_idx;
8941         if (new_inode->i_nlink == 1)
8942                 BTRFS_I(new_inode)->dir_index = new_idx;
8943
8944         /*
8945          * Now pin the logs of the roots. We do it to ensure that no other task
8946          * can sync the logs while we are in progress with the rename, because
8947          * that could result in an inconsistency in case any of the inodes that
8948          * are part of this rename operation were logged before.
8949          */
8950         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8951                 btrfs_pin_log_trans(root);
8952         if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8953                 btrfs_pin_log_trans(dest);
8954
8955         /* Do the log updates for all inodes. */
8956         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8957                 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8958                                    old_rename_ctx.index, new_dentry->d_parent);
8959         if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8960                 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8961                                    new_rename_ctx.index, old_dentry->d_parent);
8962
8963         /* Now unpin the logs. */
8964         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8965                 btrfs_end_log_trans(root);
8966         if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8967                 btrfs_end_log_trans(dest);
8968 out_fail:
8969         ret2 = btrfs_end_transaction(trans);
8970         ret = ret ? ret : ret2;
8971 out_notrans:
8972         if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8973             old_ino == BTRFS_FIRST_FREE_OBJECTID)
8974                 up_read(&fs_info->subvol_sem);
8975
8976         fscrypt_free_filename(&new_fname);
8977         fscrypt_free_filename(&old_fname);
8978         return ret;
8979 }
8980
8981 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8982                                         struct inode *dir)
8983 {
8984         struct inode *inode;
8985
8986         inode = new_inode(dir->i_sb);
8987         if (inode) {
8988                 inode_init_owner(idmap, inode, dir,
8989                                  S_IFCHR | WHITEOUT_MODE);
8990                 inode->i_op = &btrfs_special_inode_operations;
8991                 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8992         }
8993         return inode;
8994 }
8995
8996 static int btrfs_rename(struct mnt_idmap *idmap,
8997                         struct inode *old_dir, struct dentry *old_dentry,
8998                         struct inode *new_dir, struct dentry *new_dentry,
8999                         unsigned int flags)
9000 {
9001         struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9002         struct btrfs_new_inode_args whiteout_args = {
9003                 .dir = old_dir,
9004                 .dentry = old_dentry,
9005         };
9006         struct btrfs_trans_handle *trans;
9007         unsigned int trans_num_items;
9008         struct btrfs_root *root = BTRFS_I(old_dir)->root;
9009         struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9010         struct inode *new_inode = d_inode(new_dentry);
9011         struct inode *old_inode = d_inode(old_dentry);
9012         struct btrfs_rename_ctx rename_ctx;
9013         u64 index = 0;
9014         int ret;
9015         int ret2;
9016         u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9017         struct fscrypt_name old_fname, new_fname;
9018
9019         if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9020                 return -EPERM;
9021
9022         /* we only allow rename subvolume link between subvolumes */
9023         if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9024                 return -EXDEV;
9025
9026         if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9027             (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9028                 return -ENOTEMPTY;
9029
9030         if (S_ISDIR(old_inode->i_mode) && new_inode &&
9031             new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9032                 return -ENOTEMPTY;
9033
9034         ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
9035         if (ret)
9036                 return ret;
9037
9038         ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
9039         if (ret) {
9040                 fscrypt_free_filename(&old_fname);
9041                 return ret;
9042         }
9043
9044         /* check for collisions, even if the  name isn't there */
9045         ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
9046         if (ret) {
9047                 if (ret == -EEXIST) {
9048                         /* we shouldn't get
9049                          * eexist without a new_inode */
9050                         if (WARN_ON(!new_inode)) {
9051                                 goto out_fscrypt_names;
9052                         }
9053                 } else {
9054                         /* maybe -EOVERFLOW */
9055                         goto out_fscrypt_names;
9056                 }
9057         }
9058         ret = 0;
9059
9060         /*
9061          * we're using rename to replace one file with another.  Start IO on it
9062          * now so  we don't add too much work to the end of the transaction
9063          */
9064         if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9065                 filemap_flush(old_inode->i_mapping);
9066
9067         if (flags & RENAME_WHITEOUT) {
9068                 whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
9069                 if (!whiteout_args.inode) {
9070                         ret = -ENOMEM;
9071                         goto out_fscrypt_names;
9072                 }
9073                 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
9074                 if (ret)
9075                         goto out_whiteout_inode;
9076         } else {
9077                 /* 1 to update the old parent inode. */
9078                 trans_num_items = 1;
9079         }
9080
9081         if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9082                 /* Close the race window with snapshot create/destroy ioctl */
9083                 down_read(&fs_info->subvol_sem);
9084                 /*
9085                  * 1 to remove old root ref
9086                  * 1 to remove old root backref
9087                  * 1 to add new root ref
9088                  * 1 to add new root backref
9089                  */
9090                 trans_num_items += 4;
9091         } else {
9092                 /*
9093                  * 1 to update inode
9094                  * 1 to remove old inode ref
9095                  * 1 to add new inode ref
9096                  */
9097                 trans_num_items += 3;
9098         }
9099         /*
9100          * 1 to remove old dir item
9101          * 1 to remove old dir index
9102          * 1 to add new dir item
9103          * 1 to add new dir index
9104          */
9105         trans_num_items += 4;
9106         /* 1 to update new parent inode if it's not the same as the old parent */
9107         if (new_dir != old_dir)
9108                 trans_num_items++;
9109         if (new_inode) {
9110                 /*
9111                  * 1 to update inode
9112                  * 1 to remove inode ref
9113                  * 1 to remove dir item
9114                  * 1 to remove dir index
9115                  * 1 to possibly add orphan item
9116                  */
9117                 trans_num_items += 5;
9118         }
9119         trans = btrfs_start_transaction(root, trans_num_items);
9120         if (IS_ERR(trans)) {
9121                 ret = PTR_ERR(trans);
9122                 goto out_notrans;
9123         }
9124
9125         if (dest != root) {
9126                 ret = btrfs_record_root_in_trans(trans, dest);
9127                 if (ret)
9128                         goto out_fail;
9129         }
9130
9131         ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9132         if (ret)
9133                 goto out_fail;
9134
9135         BTRFS_I(old_inode)->dir_index = 0ULL;
9136         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9137                 /* force full log commit if subvolume involved. */
9138                 btrfs_set_log_full_commit(trans);
9139         } else {
9140                 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
9141                                              old_ino, btrfs_ino(BTRFS_I(new_dir)),
9142                                              index);
9143                 if (ret)
9144                         goto out_fail;
9145         }
9146
9147         inode_inc_iversion(old_dir);
9148         inode_inc_iversion(new_dir);
9149         inode_inc_iversion(old_inode);
9150         old_dir->i_mtime = current_time(old_dir);
9151         old_dir->i_ctime = old_dir->i_mtime;
9152         new_dir->i_mtime = old_dir->i_mtime;
9153         new_dir->i_ctime = old_dir->i_mtime;
9154         old_inode->i_ctime = old_dir->i_mtime;
9155
9156         if (old_dentry->d_parent != new_dentry->d_parent)
9157                 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9158                                         BTRFS_I(old_inode), true);
9159
9160         if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9161                 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
9162         } else {
9163                 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9164                                            BTRFS_I(d_inode(old_dentry)),
9165                                            &old_fname.disk_name, &rename_ctx);
9166                 if (!ret)
9167                         ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9168         }
9169         if (ret) {
9170                 btrfs_abort_transaction(trans, ret);
9171                 goto out_fail;
9172         }
9173
9174         if (new_inode) {
9175                 inode_inc_iversion(new_inode);
9176                 new_inode->i_ctime = current_time(new_inode);
9177                 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9178                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9179                         ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
9180                         BUG_ON(new_inode->i_nlink == 0);
9181                 } else {
9182                         ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9183                                                  BTRFS_I(d_inode(new_dentry)),
9184                                                  &new_fname.disk_name);
9185                 }
9186                 if (!ret && new_inode->i_nlink == 0)
9187                         ret = btrfs_orphan_add(trans,
9188                                         BTRFS_I(d_inode(new_dentry)));
9189                 if (ret) {
9190                         btrfs_abort_transaction(trans, ret);
9191                         goto out_fail;
9192                 }
9193         }
9194
9195         ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9196                              &new_fname.disk_name, 0, index);
9197         if (ret) {
9198                 btrfs_abort_transaction(trans, ret);
9199                 goto out_fail;
9200         }
9201
9202         if (old_inode->i_nlink == 1)
9203                 BTRFS_I(old_inode)->dir_index = index;
9204
9205         if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9206                 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9207                                    rename_ctx.index, new_dentry->d_parent);
9208
9209         if (flags & RENAME_WHITEOUT) {
9210                 ret = btrfs_create_new_inode(trans, &whiteout_args);
9211                 if (ret) {
9212                         btrfs_abort_transaction(trans, ret);
9213                         goto out_fail;
9214                 } else {
9215                         unlock_new_inode(whiteout_args.inode);
9216                         iput(whiteout_args.inode);
9217                         whiteout_args.inode = NULL;
9218                 }
9219         }
9220 out_fail:
9221         ret2 = btrfs_end_transaction(trans);
9222         ret = ret ? ret : ret2;
9223 out_notrans:
9224         if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9225                 up_read(&fs_info->subvol_sem);
9226         if (flags & RENAME_WHITEOUT)
9227                 btrfs_new_inode_args_destroy(&whiteout_args);
9228 out_whiteout_inode:
9229         if (flags & RENAME_WHITEOUT)
9230                 iput(whiteout_args.inode);
9231 out_fscrypt_names:
9232         fscrypt_free_filename(&old_fname);
9233         fscrypt_free_filename(&new_fname);
9234         return ret;
9235 }
9236
9237 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
9238                          struct dentry *old_dentry, struct inode *new_dir,
9239                          struct dentry *new_dentry, unsigned int flags)
9240 {
9241         int ret;
9242
9243         if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9244                 return -EINVAL;
9245
9246         if (flags & RENAME_EXCHANGE)
9247                 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9248                                             new_dentry);
9249         else
9250                 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
9251                                    new_dentry, flags);
9252
9253         btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
9254
9255         return ret;
9256 }
9257
9258 struct btrfs_delalloc_work {
9259         struct inode *inode;
9260         struct completion completion;
9261         struct list_head list;
9262         struct btrfs_work work;
9263 };
9264
9265 static void btrfs_run_delalloc_work(struct btrfs_work *work)
9266 {
9267         struct btrfs_delalloc_work *delalloc_work;
9268         struct inode *inode;
9269
9270         delalloc_work = container_of(work, struct btrfs_delalloc_work,
9271                                      work);
9272         inode = delalloc_work->inode;
9273         filemap_flush(inode->i_mapping);
9274         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9275                                 &BTRFS_I(inode)->runtime_flags))
9276                 filemap_flush(inode->i_mapping);
9277
9278         iput(inode);
9279         complete(&delalloc_work->completion);
9280 }
9281
9282 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9283 {
9284         struct btrfs_delalloc_work *work;
9285
9286         work = kmalloc(sizeof(*work), GFP_NOFS);
9287         if (!work)
9288                 return NULL;
9289
9290         init_completion(&work->completion);
9291         INIT_LIST_HEAD(&work->list);
9292         work->inode = inode;
9293         btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9294
9295         return work;
9296 }
9297
9298 /*
9299  * some fairly slow code that needs optimization. This walks the list
9300  * of all the inodes with pending delalloc and forces them to disk.
9301  */
9302 static int start_delalloc_inodes(struct btrfs_root *root,
9303                                  struct writeback_control *wbc, bool snapshot,
9304                                  bool in_reclaim_context)
9305 {
9306         struct btrfs_inode *binode;
9307         struct inode *inode;
9308         struct btrfs_delalloc_work *work, *next;
9309         struct list_head works;
9310         struct list_head splice;
9311         int ret = 0;
9312         bool full_flush = wbc->nr_to_write == LONG_MAX;
9313
9314         INIT_LIST_HEAD(&works);
9315         INIT_LIST_HEAD(&splice);
9316
9317         mutex_lock(&root->delalloc_mutex);
9318         spin_lock(&root->delalloc_lock);
9319         list_splice_init(&root->delalloc_inodes, &splice);
9320         while (!list_empty(&splice)) {
9321                 binode = list_entry(splice.next, struct btrfs_inode,
9322                                     delalloc_inodes);
9323
9324                 list_move_tail(&binode->delalloc_inodes,
9325                                &root->delalloc_inodes);
9326
9327                 if (in_reclaim_context &&
9328                     test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9329                         continue;
9330
9331                 inode = igrab(&binode->vfs_inode);
9332                 if (!inode) {
9333                         cond_resched_lock(&root->delalloc_lock);
9334                         continue;
9335                 }
9336                 spin_unlock(&root->delalloc_lock);
9337
9338                 if (snapshot)
9339                         set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9340                                 &binode->runtime_flags);
9341                 if (full_flush) {
9342                         work = btrfs_alloc_delalloc_work(inode);
9343                         if (!work) {
9344                                 iput(inode);
9345                                 ret = -ENOMEM;
9346                                 goto out;
9347                         }
9348                         list_add_tail(&work->list, &works);
9349                         btrfs_queue_work(root->fs_info->flush_workers,
9350                                          &work->work);
9351                 } else {
9352                         ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9353                         btrfs_add_delayed_iput(BTRFS_I(inode));
9354                         if (ret || wbc->nr_to_write <= 0)
9355                                 goto out;
9356                 }
9357                 cond_resched();
9358                 spin_lock(&root->delalloc_lock);
9359         }
9360         spin_unlock(&root->delalloc_lock);
9361
9362 out:
9363         list_for_each_entry_safe(work, next, &works, list) {
9364                 list_del_init(&work->list);
9365                 wait_for_completion(&work->completion);
9366                 kfree(work);
9367         }
9368
9369         if (!list_empty(&splice)) {
9370                 spin_lock(&root->delalloc_lock);
9371                 list_splice_tail(&splice, &root->delalloc_inodes);
9372                 spin_unlock(&root->delalloc_lock);
9373         }
9374         mutex_unlock(&root->delalloc_mutex);
9375         return ret;
9376 }
9377
9378 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9379 {
9380         struct writeback_control wbc = {
9381                 .nr_to_write = LONG_MAX,
9382                 .sync_mode = WB_SYNC_NONE,
9383                 .range_start = 0,
9384                 .range_end = LLONG_MAX,
9385         };
9386         struct btrfs_fs_info *fs_info = root->fs_info;
9387
9388         if (BTRFS_FS_ERROR(fs_info))
9389                 return -EROFS;
9390
9391         return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
9392 }
9393
9394 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9395                                bool in_reclaim_context)
9396 {
9397         struct writeback_control wbc = {
9398                 .nr_to_write = nr,
9399                 .sync_mode = WB_SYNC_NONE,
9400                 .range_start = 0,
9401                 .range_end = LLONG_MAX,
9402         };
9403         struct btrfs_root *root;
9404         struct list_head splice;
9405         int ret;
9406
9407         if (BTRFS_FS_ERROR(fs_info))
9408                 return -EROFS;
9409
9410         INIT_LIST_HEAD(&splice);
9411
9412         mutex_lock(&fs_info->delalloc_root_mutex);
9413         spin_lock(&fs_info->delalloc_root_lock);
9414         list_splice_init(&fs_info->delalloc_roots, &splice);
9415         while (!list_empty(&splice)) {
9416                 /*
9417                  * Reset nr_to_write here so we know that we're doing a full
9418                  * flush.
9419                  */
9420                 if (nr == LONG_MAX)
9421                         wbc.nr_to_write = LONG_MAX;
9422
9423                 root = list_first_entry(&splice, struct btrfs_root,
9424                                         delalloc_root);
9425                 root = btrfs_grab_root(root);
9426                 BUG_ON(!root);
9427                 list_move_tail(&root->delalloc_root,
9428                                &fs_info->delalloc_roots);
9429                 spin_unlock(&fs_info->delalloc_root_lock);
9430
9431                 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9432                 btrfs_put_root(root);
9433                 if (ret < 0 || wbc.nr_to_write <= 0)
9434                         goto out;
9435                 spin_lock(&fs_info->delalloc_root_lock);
9436         }
9437         spin_unlock(&fs_info->delalloc_root_lock);
9438
9439         ret = 0;
9440 out:
9441         if (!list_empty(&splice)) {
9442                 spin_lock(&fs_info->delalloc_root_lock);
9443                 list_splice_tail(&splice, &fs_info->delalloc_roots);
9444                 spin_unlock(&fs_info->delalloc_root_lock);
9445         }
9446         mutex_unlock(&fs_info->delalloc_root_mutex);
9447         return ret;
9448 }
9449
9450 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
9451                          struct dentry *dentry, const char *symname)
9452 {
9453         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9454         struct btrfs_trans_handle *trans;
9455         struct btrfs_root *root = BTRFS_I(dir)->root;
9456         struct btrfs_path *path;
9457         struct btrfs_key key;
9458         struct inode *inode;
9459         struct btrfs_new_inode_args new_inode_args = {
9460                 .dir = dir,
9461                 .dentry = dentry,
9462         };
9463         unsigned int trans_num_items;
9464         int err;
9465         int name_len;
9466         int datasize;
9467         unsigned long ptr;
9468         struct btrfs_file_extent_item *ei;
9469         struct extent_buffer *leaf;
9470
9471         name_len = strlen(symname);
9472         if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
9473                 return -ENAMETOOLONG;
9474
9475         inode = new_inode(dir->i_sb);
9476         if (!inode)
9477                 return -ENOMEM;
9478         inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
9479         inode->i_op = &btrfs_symlink_inode_operations;
9480         inode_nohighmem(inode);
9481         inode->i_mapping->a_ops = &btrfs_aops;
9482         btrfs_i_size_write(BTRFS_I(inode), name_len);
9483         inode_set_bytes(inode, name_len);
9484
9485         new_inode_args.inode = inode;
9486         err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9487         if (err)
9488                 goto out_inode;
9489         /* 1 additional item for the inline extent */
9490         trans_num_items++;
9491
9492         trans = btrfs_start_transaction(root, trans_num_items);
9493         if (IS_ERR(trans)) {
9494                 err = PTR_ERR(trans);
9495                 goto out_new_inode_args;
9496         }
9497
9498         err = btrfs_create_new_inode(trans, &new_inode_args);
9499         if (err)
9500                 goto out;
9501
9502         path = btrfs_alloc_path();
9503         if (!path) {
9504                 err = -ENOMEM;
9505                 btrfs_abort_transaction(trans, err);
9506                 discard_new_inode(inode);
9507                 inode = NULL;
9508                 goto out;
9509         }
9510         key.objectid = btrfs_ino(BTRFS_I(inode));
9511         key.offset = 0;
9512         key.type = BTRFS_EXTENT_DATA_KEY;
9513         datasize = btrfs_file_extent_calc_inline_size(name_len);
9514         err = btrfs_insert_empty_item(trans, root, path, &key,
9515                                       datasize);
9516         if (err) {
9517                 btrfs_abort_transaction(trans, err);
9518                 btrfs_free_path(path);
9519                 discard_new_inode(inode);
9520                 inode = NULL;
9521                 goto out;
9522         }
9523         leaf = path->nodes[0];
9524         ei = btrfs_item_ptr(leaf, path->slots[0],
9525                             struct btrfs_file_extent_item);
9526         btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9527         btrfs_set_file_extent_type(leaf, ei,
9528                                    BTRFS_FILE_EXTENT_INLINE);
9529         btrfs_set_file_extent_encryption(leaf, ei, 0);
9530         btrfs_set_file_extent_compression(leaf, ei, 0);
9531         btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9532         btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9533
9534         ptr = btrfs_file_extent_inline_start(ei);
9535         write_extent_buffer(leaf, symname, ptr, name_len);
9536         btrfs_mark_buffer_dirty(leaf);
9537         btrfs_free_path(path);
9538
9539         d_instantiate_new(dentry, inode);
9540         err = 0;
9541 out:
9542         btrfs_end_transaction(trans);
9543         btrfs_btree_balance_dirty(fs_info);
9544 out_new_inode_args:
9545         btrfs_new_inode_args_destroy(&new_inode_args);
9546 out_inode:
9547         if (err)
9548                 iput(inode);
9549         return err;
9550 }
9551
9552 static struct btrfs_trans_handle *insert_prealloc_file_extent(
9553                                        struct btrfs_trans_handle *trans_in,
9554                                        struct btrfs_inode *inode,
9555                                        struct btrfs_key *ins,
9556                                        u64 file_offset)
9557 {
9558         struct btrfs_file_extent_item stack_fi;
9559         struct btrfs_replace_extent_info extent_info;
9560         struct btrfs_trans_handle *trans = trans_in;
9561         struct btrfs_path *path;
9562         u64 start = ins->objectid;
9563         u64 len = ins->offset;
9564         int qgroup_released;
9565         int ret;
9566
9567         memset(&stack_fi, 0, sizeof(stack_fi));
9568
9569         btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9570         btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9571         btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9572         btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9573         btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9574         btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9575         /* Encryption and other encoding is reserved and all 0 */
9576
9577         qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
9578         if (qgroup_released < 0)
9579                 return ERR_PTR(qgroup_released);
9580
9581         if (trans) {
9582                 ret = insert_reserved_file_extent(trans, inode,
9583                                                   file_offset, &stack_fi,
9584                                                   true, qgroup_released);
9585                 if (ret)
9586                         goto free_qgroup;
9587                 return trans;
9588         }
9589
9590         extent_info.disk_offset = start;
9591         extent_info.disk_len = len;
9592         extent_info.data_offset = 0;
9593         extent_info.data_len = len;
9594         extent_info.file_offset = file_offset;
9595         extent_info.extent_buf = (char *)&stack_fi;
9596         extent_info.is_new_extent = true;
9597         extent_info.update_times = true;
9598         extent_info.qgroup_reserved = qgroup_released;
9599         extent_info.insertions = 0;
9600
9601         path = btrfs_alloc_path();
9602         if (!path) {
9603                 ret = -ENOMEM;
9604                 goto free_qgroup;
9605         }
9606
9607         ret = btrfs_replace_file_extents(inode, path, file_offset,
9608                                      file_offset + len - 1, &extent_info,
9609                                      &trans);
9610         btrfs_free_path(path);
9611         if (ret)
9612                 goto free_qgroup;
9613         return trans;
9614
9615 free_qgroup:
9616         /*
9617          * We have released qgroup data range at the beginning of the function,
9618          * and normally qgroup_released bytes will be freed when committing
9619          * transaction.
9620          * But if we error out early, we have to free what we have released
9621          * or we leak qgroup data reservation.
9622          */
9623         btrfs_qgroup_free_refroot(inode->root->fs_info,
9624                         inode->root->root_key.objectid, qgroup_released,
9625                         BTRFS_QGROUP_RSV_DATA);
9626         return ERR_PTR(ret);
9627 }
9628
9629 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9630                                        u64 start, u64 num_bytes, u64 min_size,
9631                                        loff_t actual_len, u64 *alloc_hint,
9632                                        struct btrfs_trans_handle *trans)
9633 {
9634         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9635         struct extent_map *em;
9636         struct btrfs_root *root = BTRFS_I(inode)->root;
9637         struct btrfs_key ins;
9638         u64 cur_offset = start;
9639         u64 clear_offset = start;
9640         u64 i_size;
9641         u64 cur_bytes;
9642         u64 last_alloc = (u64)-1;
9643         int ret = 0;
9644         bool own_trans = true;
9645         u64 end = start + num_bytes - 1;
9646
9647         if (trans)
9648                 own_trans = false;
9649         while (num_bytes > 0) {
9650                 cur_bytes = min_t(u64, num_bytes, SZ_256M);
9651                 cur_bytes = max(cur_bytes, min_size);
9652                 /*
9653                  * If we are severely fragmented we could end up with really
9654                  * small allocations, so if the allocator is returning small
9655                  * chunks lets make its job easier by only searching for those
9656                  * sized chunks.
9657                  */
9658                 cur_bytes = min(cur_bytes, last_alloc);
9659                 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9660                                 min_size, 0, *alloc_hint, &ins, 1, 0);
9661                 if (ret)
9662                         break;
9663
9664                 /*
9665                  * We've reserved this space, and thus converted it from
9666                  * ->bytes_may_use to ->bytes_reserved.  Any error that happens
9667                  * from here on out we will only need to clear our reservation
9668                  * for the remaining unreserved area, so advance our
9669                  * clear_offset by our extent size.
9670                  */
9671                 clear_offset += ins.offset;
9672
9673                 last_alloc = ins.offset;
9674                 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9675                                                     &ins, cur_offset);
9676                 /*
9677                  * Now that we inserted the prealloc extent we can finally
9678                  * decrement the number of reservations in the block group.
9679                  * If we did it before, we could race with relocation and have
9680                  * relocation miss the reserved extent, making it fail later.
9681                  */
9682                 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9683                 if (IS_ERR(trans)) {
9684                         ret = PTR_ERR(trans);
9685                         btrfs_free_reserved_extent(fs_info, ins.objectid,
9686                                                    ins.offset, 0);
9687                         break;
9688                 }
9689
9690                 em = alloc_extent_map();
9691                 if (!em) {
9692                         btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9693                                             cur_offset + ins.offset - 1, false);
9694                         btrfs_set_inode_full_sync(BTRFS_I(inode));
9695                         goto next;
9696                 }
9697
9698                 em->start = cur_offset;
9699                 em->orig_start = cur_offset;
9700                 em->len = ins.offset;
9701                 em->block_start = ins.objectid;
9702                 em->block_len = ins.offset;
9703                 em->orig_block_len = ins.offset;
9704                 em->ram_bytes = ins.offset;
9705                 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9706                 em->generation = trans->transid;
9707
9708                 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9709                 free_extent_map(em);
9710 next:
9711                 num_bytes -= ins.offset;
9712                 cur_offset += ins.offset;
9713                 *alloc_hint = ins.objectid + ins.offset;
9714
9715                 inode_inc_iversion(inode);
9716                 inode->i_ctime = current_time(inode);
9717                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9718                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9719                     (actual_len > inode->i_size) &&
9720                     (cur_offset > inode->i_size)) {
9721                         if (cur_offset > actual_len)
9722                                 i_size = actual_len;
9723                         else
9724                                 i_size = cur_offset;
9725                         i_size_write(inode, i_size);
9726                         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9727                 }
9728
9729                 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9730
9731                 if (ret) {
9732                         btrfs_abort_transaction(trans, ret);
9733                         if (own_trans)
9734                                 btrfs_end_transaction(trans);
9735                         break;
9736                 }
9737
9738                 if (own_trans) {
9739                         btrfs_end_transaction(trans);
9740                         trans = NULL;
9741                 }
9742         }
9743         if (clear_offset < end)
9744                 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9745                         end - clear_offset + 1);
9746         return ret;
9747 }
9748
9749 int btrfs_prealloc_file_range(struct inode *inode, int mode,
9750                               u64 start, u64 num_bytes, u64 min_size,
9751                               loff_t actual_len, u64 *alloc_hint)
9752 {
9753         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9754                                            min_size, actual_len, alloc_hint,
9755                                            NULL);
9756 }
9757
9758 int btrfs_prealloc_file_range_trans(struct inode *inode,
9759                                     struct btrfs_trans_handle *trans, int mode,
9760                                     u64 start, u64 num_bytes, u64 min_size,
9761                                     loff_t actual_len, u64 *alloc_hint)
9762 {
9763         return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9764                                            min_size, actual_len, alloc_hint, trans);
9765 }
9766
9767 static int btrfs_permission(struct mnt_idmap *idmap,
9768                             struct inode *inode, int mask)
9769 {
9770         struct btrfs_root *root = BTRFS_I(inode)->root;
9771         umode_t mode = inode->i_mode;
9772
9773         if (mask & MAY_WRITE &&
9774             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9775                 if (btrfs_root_readonly(root))
9776                         return -EROFS;
9777                 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9778                         return -EACCES;
9779         }
9780         return generic_permission(idmap, inode, mask);
9781 }
9782
9783 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9784                          struct file *file, umode_t mode)
9785 {
9786         struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9787         struct btrfs_trans_handle *trans;
9788         struct btrfs_root *root = BTRFS_I(dir)->root;
9789         struct inode *inode;
9790         struct btrfs_new_inode_args new_inode_args = {
9791                 .dir = dir,
9792                 .dentry = file->f_path.dentry,
9793                 .orphan = true,
9794         };
9795         unsigned int trans_num_items;
9796         int ret;
9797
9798         inode = new_inode(dir->i_sb);
9799         if (!inode)
9800                 return -ENOMEM;
9801         inode_init_owner(idmap, inode, dir, mode);
9802         inode->i_fop = &btrfs_file_operations;
9803         inode->i_op = &btrfs_file_inode_operations;
9804         inode->i_mapping->a_ops = &btrfs_aops;
9805
9806         new_inode_args.inode = inode;
9807         ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9808         if (ret)
9809                 goto out_inode;
9810
9811         trans = btrfs_start_transaction(root, trans_num_items);
9812         if (IS_ERR(trans)) {
9813                 ret = PTR_ERR(trans);
9814                 goto out_new_inode_args;
9815         }
9816
9817         ret = btrfs_create_new_inode(trans, &new_inode_args);
9818
9819         /*
9820          * We set number of links to 0 in btrfs_create_new_inode(), and here we
9821          * set it to 1 because d_tmpfile() will issue a warning if the count is
9822          * 0, through:
9823          *
9824          *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9825          */
9826         set_nlink(inode, 1);
9827
9828         if (!ret) {
9829                 d_tmpfile(file, inode);
9830                 unlock_new_inode(inode);
9831                 mark_inode_dirty(inode);
9832         }
9833
9834         btrfs_end_transaction(trans);
9835         btrfs_btree_balance_dirty(fs_info);
9836 out_new_inode_args:
9837         btrfs_new_inode_args_destroy(&new_inode_args);
9838 out_inode:
9839         if (ret)
9840                 iput(inode);
9841         return finish_open_simple(file, ret);
9842 }
9843
9844 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
9845 {
9846         struct btrfs_fs_info *fs_info = inode->root->fs_info;
9847         unsigned long index = start >> PAGE_SHIFT;
9848         unsigned long end_index = end >> PAGE_SHIFT;
9849         struct page *page;
9850         u32 len;
9851
9852         ASSERT(end + 1 - start <= U32_MAX);
9853         len = end + 1 - start;
9854         while (index <= end_index) {
9855                 page = find_get_page(inode->vfs_inode.i_mapping, index);
9856                 ASSERT(page); /* Pages should be in the extent_io_tree */
9857
9858                 btrfs_page_set_writeback(fs_info, page, start, len);
9859                 put_page(page);
9860                 index++;
9861         }
9862 }
9863
9864 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9865                                              int compress_type)
9866 {
9867         switch (compress_type) {
9868         case BTRFS_COMPRESS_NONE:
9869                 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9870         case BTRFS_COMPRESS_ZLIB:
9871                 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9872         case BTRFS_COMPRESS_LZO:
9873                 /*
9874                  * The LZO format depends on the sector size. 64K is the maximum
9875                  * sector size that we support.
9876                  */
9877                 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9878                         return -EINVAL;
9879                 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9880                        (fs_info->sectorsize_bits - 12);
9881         case BTRFS_COMPRESS_ZSTD:
9882                 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9883         default:
9884                 return -EUCLEAN;
9885         }
9886 }
9887
9888 static ssize_t btrfs_encoded_read_inline(
9889                                 struct kiocb *iocb,
9890                                 struct iov_iter *iter, u64 start,
9891                                 u64 lockend,
9892                                 struct extent_state **cached_state,
9893                                 u64 extent_start, size_t count,
9894                                 struct btrfs_ioctl_encoded_io_args *encoded,
9895                                 bool *unlocked)
9896 {
9897         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9898         struct btrfs_root *root = inode->root;
9899         struct btrfs_fs_info *fs_info = root->fs_info;
9900         struct extent_io_tree *io_tree = &inode->io_tree;
9901         struct btrfs_path *path;
9902         struct extent_buffer *leaf;
9903         struct btrfs_file_extent_item *item;
9904         u64 ram_bytes;
9905         unsigned long ptr;
9906         void *tmp;
9907         ssize_t ret;
9908
9909         path = btrfs_alloc_path();
9910         if (!path) {
9911                 ret = -ENOMEM;
9912                 goto out;
9913         }
9914         ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9915                                        extent_start, 0);
9916         if (ret) {
9917                 if (ret > 0) {
9918                         /* The extent item disappeared? */
9919                         ret = -EIO;
9920                 }
9921                 goto out;
9922         }
9923         leaf = path->nodes[0];
9924         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9925
9926         ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9927         ptr = btrfs_file_extent_inline_start(item);
9928
9929         encoded->len = min_t(u64, extent_start + ram_bytes,
9930                              inode->vfs_inode.i_size) - iocb->ki_pos;
9931         ret = btrfs_encoded_io_compression_from_extent(fs_info,
9932                                  btrfs_file_extent_compression(leaf, item));
9933         if (ret < 0)
9934                 goto out;
9935         encoded->compression = ret;
9936         if (encoded->compression) {
9937                 size_t inline_size;
9938
9939                 inline_size = btrfs_file_extent_inline_item_len(leaf,
9940                                                                 path->slots[0]);
9941                 if (inline_size > count) {
9942                         ret = -ENOBUFS;
9943                         goto out;
9944                 }
9945                 count = inline_size;
9946                 encoded->unencoded_len = ram_bytes;
9947                 encoded->unencoded_offset = iocb->ki_pos - extent_start;
9948         } else {
9949                 count = min_t(u64, count, encoded->len);
9950                 encoded->len = count;
9951                 encoded->unencoded_len = count;
9952                 ptr += iocb->ki_pos - extent_start;
9953         }
9954
9955         tmp = kmalloc(count, GFP_NOFS);
9956         if (!tmp) {
9957                 ret = -ENOMEM;
9958                 goto out;
9959         }
9960         read_extent_buffer(leaf, tmp, ptr, count);
9961         btrfs_release_path(path);
9962         unlock_extent(io_tree, start, lockend, cached_state);
9963         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9964         *unlocked = true;
9965
9966         ret = copy_to_iter(tmp, count, iter);
9967         if (ret != count)
9968                 ret = -EFAULT;
9969         kfree(tmp);
9970 out:
9971         btrfs_free_path(path);
9972         return ret;
9973 }
9974
9975 struct btrfs_encoded_read_private {
9976         wait_queue_head_t wait;
9977         atomic_t pending;
9978         blk_status_t status;
9979 };
9980
9981 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9982 {
9983         struct btrfs_encoded_read_private *priv = bbio->private;
9984
9985         if (bbio->bio.bi_status) {
9986                 /*
9987                  * The memory barrier implied by the atomic_dec_return() here
9988                  * pairs with the memory barrier implied by the
9989                  * atomic_dec_return() or io_wait_event() in
9990                  * btrfs_encoded_read_regular_fill_pages() to ensure that this
9991                  * write is observed before the load of status in
9992                  * btrfs_encoded_read_regular_fill_pages().
9993                  */
9994                 WRITE_ONCE(priv->status, bbio->bio.bi_status);
9995         }
9996         if (!atomic_dec_return(&priv->pending))
9997                 wake_up(&priv->wait);
9998         bio_put(&bbio->bio);
9999 }
10000
10001 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
10002                                           u64 file_offset, u64 disk_bytenr,
10003                                           u64 disk_io_size, struct page **pages)
10004 {
10005         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10006         struct btrfs_encoded_read_private priv = {
10007                 .pending = ATOMIC_INIT(1),
10008         };
10009         unsigned long i = 0;
10010         struct btrfs_bio *bbio;
10011
10012         init_waitqueue_head(&priv.wait);
10013
10014         bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
10015                                btrfs_encoded_read_endio, &priv);
10016         bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
10017         bbio->inode = inode;
10018
10019         do {
10020                 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
10021
10022                 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
10023                         atomic_inc(&priv.pending);
10024                         btrfs_submit_bio(bbio, 0);
10025
10026                         bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
10027                                                btrfs_encoded_read_endio, &priv);
10028                         bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
10029                         bbio->inode = inode;
10030                         continue;
10031                 }
10032
10033                 i++;
10034                 disk_bytenr += bytes;
10035                 disk_io_size -= bytes;
10036         } while (disk_io_size);
10037
10038         atomic_inc(&priv.pending);
10039         btrfs_submit_bio(bbio, 0);
10040
10041         if (atomic_dec_return(&priv.pending))
10042                 io_wait_event(priv.wait, !atomic_read(&priv.pending));
10043         /* See btrfs_encoded_read_endio() for ordering. */
10044         return blk_status_to_errno(READ_ONCE(priv.status));
10045 }
10046
10047 static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
10048                                           struct iov_iter *iter,
10049                                           u64 start, u64 lockend,
10050                                           struct extent_state **cached_state,
10051                                           u64 disk_bytenr, u64 disk_io_size,
10052                                           size_t count, bool compressed,
10053                                           bool *unlocked)
10054 {
10055         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10056         struct extent_io_tree *io_tree = &inode->io_tree;
10057         struct page **pages;
10058         unsigned long nr_pages, i;
10059         u64 cur;
10060         size_t page_offset;
10061         ssize_t ret;
10062
10063         nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
10064         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
10065         if (!pages)
10066                 return -ENOMEM;
10067         ret = btrfs_alloc_page_array(nr_pages, pages);
10068         if (ret) {
10069                 ret = -ENOMEM;
10070                 goto out;
10071                 }
10072
10073         ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10074                                                     disk_io_size, pages);
10075         if (ret)
10076                 goto out;
10077
10078         unlock_extent(io_tree, start, lockend, cached_state);
10079         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10080         *unlocked = true;
10081
10082         if (compressed) {
10083                 i = 0;
10084                 page_offset = 0;
10085         } else {
10086                 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10087                 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10088         }
10089         cur = 0;
10090         while (cur < count) {
10091                 size_t bytes = min_t(size_t, count - cur,
10092                                      PAGE_SIZE - page_offset);
10093
10094                 if (copy_page_to_iter(pages[i], page_offset, bytes,
10095                                       iter) != bytes) {
10096                         ret = -EFAULT;
10097                         goto out;
10098                 }
10099                 i++;
10100                 cur += bytes;
10101                 page_offset = 0;
10102         }
10103         ret = count;
10104 out:
10105         for (i = 0; i < nr_pages; i++) {
10106                 if (pages[i])
10107                         __free_page(pages[i]);
10108         }
10109         kfree(pages);
10110         return ret;
10111 }
10112
10113 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10114                            struct btrfs_ioctl_encoded_io_args *encoded)
10115 {
10116         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10117         struct btrfs_fs_info *fs_info = inode->root->fs_info;
10118         struct extent_io_tree *io_tree = &inode->io_tree;
10119         ssize_t ret;
10120         size_t count = iov_iter_count(iter);
10121         u64 start, lockend, disk_bytenr, disk_io_size;
10122         struct extent_state *cached_state = NULL;
10123         struct extent_map *em;
10124         bool unlocked = false;
10125
10126         file_accessed(iocb->ki_filp);
10127
10128         btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
10129
10130         if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10131                 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10132                 return 0;
10133         }
10134         start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10135         /*
10136          * We don't know how long the extent containing iocb->ki_pos is, but if
10137          * it's compressed we know that it won't be longer than this.
10138          */
10139         lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
10140
10141         for (;;) {
10142                 struct btrfs_ordered_extent *ordered;
10143
10144                 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
10145                                                lockend - start + 1);
10146                 if (ret)
10147                         goto out_unlock_inode;
10148                 lock_extent(io_tree, start, lockend, &cached_state);
10149                 ordered = btrfs_lookup_ordered_range(inode, start,
10150                                                      lockend - start + 1);
10151                 if (!ordered)
10152                         break;
10153                 btrfs_put_ordered_extent(ordered);
10154                 unlock_extent(io_tree, start, lockend, &cached_state);
10155                 cond_resched();
10156         }
10157
10158         em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
10159         if (IS_ERR(em)) {
10160                 ret = PTR_ERR(em);
10161                 goto out_unlock_extent;
10162         }
10163
10164         if (em->block_start == EXTENT_MAP_INLINE) {
10165                 u64 extent_start = em->start;
10166
10167                 /*
10168                  * For inline extents we get everything we need out of the
10169                  * extent item.
10170                  */
10171                 free_extent_map(em);
10172                 em = NULL;
10173                 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
10174                                                 &cached_state, extent_start,
10175                                                 count, encoded, &unlocked);
10176                 goto out;
10177         }
10178
10179         /*
10180          * We only want to return up to EOF even if the extent extends beyond
10181          * that.
10182          */
10183         encoded->len = min_t(u64, extent_map_end(em),
10184                              inode->vfs_inode.i_size) - iocb->ki_pos;
10185         if (em->block_start == EXTENT_MAP_HOLE ||
10186             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
10187                 disk_bytenr = EXTENT_MAP_HOLE;
10188                 count = min_t(u64, count, encoded->len);
10189                 encoded->len = count;
10190                 encoded->unencoded_len = count;
10191         } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10192                 disk_bytenr = em->block_start;
10193                 /*
10194                  * Bail if the buffer isn't large enough to return the whole
10195                  * compressed extent.
10196                  */
10197                 if (em->block_len > count) {
10198                         ret = -ENOBUFS;
10199                         goto out_em;
10200                 }
10201                 disk_io_size = em->block_len;
10202                 count = em->block_len;
10203                 encoded->unencoded_len = em->ram_bytes;
10204                 encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
10205                 ret = btrfs_encoded_io_compression_from_extent(fs_info,
10206                                                              em->compress_type);
10207                 if (ret < 0)
10208                         goto out_em;
10209                 encoded->compression = ret;
10210         } else {
10211                 disk_bytenr = em->block_start + (start - em->start);
10212                 if (encoded->len > count)
10213                         encoded->len = count;
10214                 /*
10215                  * Don't read beyond what we locked. This also limits the page
10216                  * allocations that we'll do.
10217                  */
10218                 disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
10219                 count = start + disk_io_size - iocb->ki_pos;
10220                 encoded->len = count;
10221                 encoded->unencoded_len = count;
10222                 disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
10223         }
10224         free_extent_map(em);
10225         em = NULL;
10226
10227         if (disk_bytenr == EXTENT_MAP_HOLE) {
10228                 unlock_extent(io_tree, start, lockend, &cached_state);
10229                 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10230                 unlocked = true;
10231                 ret = iov_iter_zero(count, iter);
10232                 if (ret != count)
10233                         ret = -EFAULT;
10234         } else {
10235                 ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
10236                                                  &cached_state, disk_bytenr,
10237                                                  disk_io_size, count,
10238                                                  encoded->compression,
10239                                                  &unlocked);
10240         }
10241
10242 out:
10243         if (ret >= 0)
10244                 iocb->ki_pos += encoded->len;
10245 out_em:
10246         free_extent_map(em);
10247 out_unlock_extent:
10248         if (!unlocked)
10249                 unlock_extent(io_tree, start, lockend, &cached_state);
10250 out_unlock_inode:
10251         if (!unlocked)
10252                 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10253         return ret;
10254 }
10255
10256 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10257                                const struct btrfs_ioctl_encoded_io_args *encoded)
10258 {
10259         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10260         struct btrfs_root *root = inode->root;
10261         struct btrfs_fs_info *fs_info = root->fs_info;
10262         struct extent_io_tree *io_tree = &inode->io_tree;
10263         struct extent_changeset *data_reserved = NULL;
10264         struct extent_state *cached_state = NULL;
10265         struct btrfs_ordered_extent *ordered;
10266         int compression;
10267         size_t orig_count;
10268         u64 start, end;
10269         u64 num_bytes, ram_bytes, disk_num_bytes;
10270         unsigned long nr_pages, i;
10271         struct page **pages;
10272         struct btrfs_key ins;
10273         bool extent_reserved = false;
10274         struct extent_map *em;
10275         ssize_t ret;
10276
10277         switch (encoded->compression) {
10278         case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10279                 compression = BTRFS_COMPRESS_ZLIB;
10280                 break;
10281         case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10282                 compression = BTRFS_COMPRESS_ZSTD;
10283                 break;
10284         case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10285         case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10286         case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10287         case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10288         case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10289                 /* The sector size must match for LZO. */
10290                 if (encoded->compression -
10291                     BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10292                     fs_info->sectorsize_bits)
10293                         return -EINVAL;
10294                 compression = BTRFS_COMPRESS_LZO;
10295                 break;
10296         default:
10297                 return -EINVAL;
10298         }
10299         if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10300                 return -EINVAL;
10301
10302         orig_count = iov_iter_count(from);
10303
10304         /* The extent size must be sane. */
10305         if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10306             orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10307                 return -EINVAL;
10308
10309         /*
10310          * The compressed data must be smaller than the decompressed data.
10311          *
10312          * It's of course possible for data to compress to larger or the same
10313          * size, but the buffered I/O path falls back to no compression for such
10314          * data, and we don't want to break any assumptions by creating these
10315          * extents.
10316          *
10317          * Note that this is less strict than the current check we have that the
10318          * compressed data must be at least one sector smaller than the
10319          * decompressed data. We only want to enforce the weaker requirement
10320          * from old kernels that it is at least one byte smaller.
10321          */
10322         if (orig_count >= encoded->unencoded_len)
10323                 return -EINVAL;
10324
10325         /* The extent must start on a sector boundary. */
10326         start = iocb->ki_pos;
10327         if (!IS_ALIGNED(start, fs_info->sectorsize))
10328                 return -EINVAL;
10329
10330         /*
10331          * The extent must end on a sector boundary. However, we allow a write
10332          * which ends at or extends i_size to have an unaligned length; we round
10333          * up the extent size and set i_size to the unaligned end.
10334          */
10335         if (start + encoded->len < inode->vfs_inode.i_size &&
10336             !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10337                 return -EINVAL;
10338
10339         /* Finally, the offset in the unencoded data must be sector-aligned. */
10340         if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10341                 return -EINVAL;
10342
10343         num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10344         ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10345         end = start + num_bytes - 1;
10346
10347         /*
10348          * If the extent cannot be inline, the compressed data on disk must be
10349          * sector-aligned. For convenience, we extend it with zeroes if it
10350          * isn't.
10351          */
10352         disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10353         nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10354         pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10355         if (!pages)
10356                 return -ENOMEM;
10357         for (i = 0; i < nr_pages; i++) {
10358                 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10359                 char *kaddr;
10360
10361                 pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10362                 if (!pages[i]) {
10363                         ret = -ENOMEM;
10364                         goto out_pages;
10365                 }
10366                 kaddr = kmap_local_page(pages[i]);
10367                 if (copy_from_iter(kaddr, bytes, from) != bytes) {
10368                         kunmap_local(kaddr);
10369                         ret = -EFAULT;
10370                         goto out_pages;
10371                 }
10372                 if (bytes < PAGE_SIZE)
10373                         memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10374                 kunmap_local(kaddr);
10375         }
10376
10377         for (;;) {
10378                 struct btrfs_ordered_extent *ordered;
10379
10380                 ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10381                 if (ret)
10382                         goto out_pages;
10383                 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10384                                                     start >> PAGE_SHIFT,
10385                                                     end >> PAGE_SHIFT);
10386                 if (ret)
10387                         goto out_pages;
10388                 lock_extent(io_tree, start, end, &cached_state);
10389                 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10390                 if (!ordered &&
10391                     !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10392                         break;
10393                 if (ordered)
10394                         btrfs_put_ordered_extent(ordered);
10395                 unlock_extent(io_tree, start, end, &cached_state);
10396                 cond_resched();
10397         }
10398
10399         /*
10400          * We don't use the higher-level delalloc space functions because our
10401          * num_bytes and disk_num_bytes are different.
10402          */
10403         ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10404         if (ret)
10405                 goto out_unlock;
10406         ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10407         if (ret)
10408                 goto out_free_data_space;
10409         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
10410                                               false);
10411         if (ret)
10412                 goto out_qgroup_free_data;
10413
10414         /* Try an inline extent first. */
10415         if (start == 0 && encoded->unencoded_len == encoded->len &&
10416             encoded->unencoded_offset == 0) {
10417                 ret = cow_file_range_inline(inode, encoded->len, orig_count,
10418                                             compression, pages, true);
10419                 if (ret <= 0) {
10420                         if (ret == 0)
10421                                 ret = orig_count;
10422                         goto out_delalloc_release;
10423                 }
10424         }
10425
10426         ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
10427                                    disk_num_bytes, 0, 0, &ins, 1, 1);
10428         if (ret)
10429                 goto out_delalloc_release;
10430         extent_reserved = true;
10431
10432         em = create_io_em(inode, start, num_bytes,
10433                           start - encoded->unencoded_offset, ins.objectid,
10434                           ins.offset, ins.offset, ram_bytes, compression,
10435                           BTRFS_ORDERED_COMPRESSED);
10436         if (IS_ERR(em)) {
10437                 ret = PTR_ERR(em);
10438                 goto out_free_reserved;
10439         }
10440         free_extent_map(em);
10441
10442         ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
10443                                        ins.objectid, ins.offset,
10444                                        encoded->unencoded_offset,
10445                                        (1 << BTRFS_ORDERED_ENCODED) |
10446                                        (1 << BTRFS_ORDERED_COMPRESSED),
10447                                        compression);
10448         if (IS_ERR(ordered)) {
10449                 btrfs_drop_extent_map_range(inode, start, end, false);
10450                 ret = PTR_ERR(ordered);
10451                 goto out_free_reserved;
10452         }
10453         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10454
10455         if (start + encoded->len > inode->vfs_inode.i_size)
10456                 i_size_write(&inode->vfs_inode, start + encoded->len);
10457
10458         unlock_extent(io_tree, start, end, &cached_state);
10459
10460         btrfs_delalloc_release_extents(inode, num_bytes);
10461
10462         btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
10463         ret = orig_count;
10464         goto out;
10465
10466 out_free_reserved:
10467         btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10468         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10469 out_delalloc_release:
10470         btrfs_delalloc_release_extents(inode, num_bytes);
10471         btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10472 out_qgroup_free_data:
10473         if (ret < 0)
10474                 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
10475 out_free_data_space:
10476         /*
10477          * If btrfs_reserve_extent() succeeded, then we already decremented
10478          * bytes_may_use.
10479          */
10480         if (!extent_reserved)
10481                 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10482 out_unlock:
10483         unlock_extent(io_tree, start, end, &cached_state);
10484 out_pages:
10485         for (i = 0; i < nr_pages; i++) {
10486                 if (pages[i])
10487                         __free_page(pages[i]);
10488         }
10489         kvfree(pages);
10490 out:
10491         if (ret >= 0)
10492                 iocb->ki_pos += encoded->len;
10493         return ret;
10494 }
10495
10496 #ifdef CONFIG_SWAP
10497 /*
10498  * Add an entry indicating a block group or device which is pinned by a
10499  * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10500  * negative errno on failure.
10501  */
10502 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10503                                   bool is_block_group)
10504 {
10505         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10506         struct btrfs_swapfile_pin *sp, *entry;
10507         struct rb_node **p;
10508         struct rb_node *parent = NULL;
10509
10510         sp = kmalloc(sizeof(*sp), GFP_NOFS);
10511         if (!sp)
10512                 return -ENOMEM;
10513         sp->ptr = ptr;
10514         sp->inode = inode;
10515         sp->is_block_group = is_block_group;
10516         sp->bg_extent_count = 1;
10517
10518         spin_lock(&fs_info->swapfile_pins_lock);
10519         p = &fs_info->swapfile_pins.rb_node;
10520         while (*p) {
10521                 parent = *p;
10522                 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10523                 if (sp->ptr < entry->ptr ||
10524                     (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10525                         p = &(*p)->rb_left;
10526                 } else if (sp->ptr > entry->ptr ||
10527                            (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10528                         p = &(*p)->rb_right;
10529                 } else {
10530                         if (is_block_group)
10531                                 entry->bg_extent_count++;
10532                         spin_unlock(&fs_info->swapfile_pins_lock);
10533                         kfree(sp);
10534                         return 1;
10535                 }
10536         }
10537         rb_link_node(&sp->node, parent, p);
10538         rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10539         spin_unlock(&fs_info->swapfile_pins_lock);
10540         return 0;
10541 }
10542
10543 /* Free all of the entries pinned by this swapfile. */
10544 static void btrfs_free_swapfile_pins(struct inode *inode)
10545 {
10546         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10547         struct btrfs_swapfile_pin *sp;
10548         struct rb_node *node, *next;
10549
10550         spin_lock(&fs_info->swapfile_pins_lock);
10551         node = rb_first(&fs_info->swapfile_pins);
10552         while (node) {
10553                 next = rb_next(node);
10554                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10555                 if (sp->inode == inode) {
10556                         rb_erase(&sp->node, &fs_info->swapfile_pins);
10557                         if (sp->is_block_group) {
10558                                 btrfs_dec_block_group_swap_extents(sp->ptr,
10559                                                            sp->bg_extent_count);
10560                                 btrfs_put_block_group(sp->ptr);
10561                         }
10562                         kfree(sp);
10563                 }
10564                 node = next;
10565         }
10566         spin_unlock(&fs_info->swapfile_pins_lock);
10567 }
10568
10569 struct btrfs_swap_info {
10570         u64 start;
10571         u64 block_start;
10572         u64 block_len;
10573         u64 lowest_ppage;
10574         u64 highest_ppage;
10575         unsigned long nr_pages;
10576         int nr_extents;
10577 };
10578
10579 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10580                                  struct btrfs_swap_info *bsi)
10581 {
10582         unsigned long nr_pages;
10583         unsigned long max_pages;
10584         u64 first_ppage, first_ppage_reported, next_ppage;
10585         int ret;
10586
10587         /*
10588          * Our swapfile may have had its size extended after the swap header was
10589          * written. In that case activating the swapfile should not go beyond
10590          * the max size set in the swap header.
10591          */
10592         if (bsi->nr_pages >= sis->max)
10593                 return 0;
10594
10595         max_pages = sis->max - bsi->nr_pages;
10596         first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10597         next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10598
10599         if (first_ppage >= next_ppage)
10600                 return 0;
10601         nr_pages = next_ppage - first_ppage;
10602         nr_pages = min(nr_pages, max_pages);
10603
10604         first_ppage_reported = first_ppage;
10605         if (bsi->start == 0)
10606                 first_ppage_reported++;
10607         if (bsi->lowest_ppage > first_ppage_reported)
10608                 bsi->lowest_ppage = first_ppage_reported;
10609         if (bsi->highest_ppage < (next_ppage - 1))
10610                 bsi->highest_ppage = next_ppage - 1;
10611
10612         ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10613         if (ret < 0)
10614                 return ret;
10615         bsi->nr_extents += ret;
10616         bsi->nr_pages += nr_pages;
10617         return 0;
10618 }
10619
10620 static void btrfs_swap_deactivate(struct file *file)
10621 {
10622         struct inode *inode = file_inode(file);
10623
10624         btrfs_free_swapfile_pins(inode);
10625         atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10626 }
10627
10628 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10629                                sector_t *span)
10630 {
10631         struct inode *inode = file_inode(file);
10632         struct btrfs_root *root = BTRFS_I(inode)->root;
10633         struct btrfs_fs_info *fs_info = root->fs_info;
10634         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10635         struct extent_state *cached_state = NULL;
10636         struct extent_map *em = NULL;
10637         struct btrfs_device *device = NULL;
10638         struct btrfs_swap_info bsi = {
10639                 .lowest_ppage = (sector_t)-1ULL,
10640         };
10641         int ret = 0;
10642         u64 isize;
10643         u64 start;
10644
10645         /*
10646          * If the swap file was just created, make sure delalloc is done. If the
10647          * file changes again after this, the user is doing something stupid and
10648          * we don't really care.
10649          */
10650         ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10651         if (ret)
10652                 return ret;
10653
10654         /*
10655          * The inode is locked, so these flags won't change after we check them.
10656          */
10657         if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10658                 btrfs_warn(fs_info, "swapfile must not be compressed");
10659                 return -EINVAL;
10660         }
10661         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10662                 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10663                 return -EINVAL;
10664         }
10665         if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10666                 btrfs_warn(fs_info, "swapfile must not be checksummed");
10667                 return -EINVAL;
10668         }
10669
10670         /*
10671          * Balance or device remove/replace/resize can move stuff around from
10672          * under us. The exclop protection makes sure they aren't running/won't
10673          * run concurrently while we are mapping the swap extents, and
10674          * fs_info->swapfile_pins prevents them from running while the swap
10675          * file is active and moving the extents. Note that this also prevents
10676          * a concurrent device add which isn't actually necessary, but it's not
10677          * really worth the trouble to allow it.
10678          */
10679         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10680                 btrfs_warn(fs_info,
10681            "cannot activate swapfile while exclusive operation is running");
10682                 return -EBUSY;
10683         }
10684
10685         /*
10686          * Prevent snapshot creation while we are activating the swap file.
10687          * We do not want to race with snapshot creation. If snapshot creation
10688          * already started before we bumped nr_swapfiles from 0 to 1 and
10689          * completes before the first write into the swap file after it is
10690          * activated, than that write would fallback to COW.
10691          */
10692         if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10693                 btrfs_exclop_finish(fs_info);
10694                 btrfs_warn(fs_info,
10695            "cannot activate swapfile because snapshot creation is in progress");
10696                 return -EINVAL;
10697         }
10698         /*
10699          * Snapshots can create extents which require COW even if NODATACOW is
10700          * set. We use this counter to prevent snapshots. We must increment it
10701          * before walking the extents because we don't want a concurrent
10702          * snapshot to run after we've already checked the extents.
10703          *
10704          * It is possible that subvolume is marked for deletion but still not
10705          * removed yet. To prevent this race, we check the root status before
10706          * activating the swapfile.
10707          */
10708         spin_lock(&root->root_item_lock);
10709         if (btrfs_root_dead(root)) {
10710                 spin_unlock(&root->root_item_lock);
10711
10712                 btrfs_exclop_finish(fs_info);
10713                 btrfs_warn(fs_info,
10714                 "cannot activate swapfile because subvolume %llu is being deleted",
10715                         root->root_key.objectid);
10716                 return -EPERM;
10717         }
10718         atomic_inc(&root->nr_swapfiles);
10719         spin_unlock(&root->root_item_lock);
10720
10721         isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10722
10723         lock_extent(io_tree, 0, isize - 1, &cached_state);
10724         start = 0;
10725         while (start < isize) {
10726                 u64 logical_block_start, physical_block_start;
10727                 struct btrfs_block_group *bg;
10728                 u64 len = isize - start;
10729
10730                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10731                 if (IS_ERR(em)) {
10732                         ret = PTR_ERR(em);
10733                         goto out;
10734                 }
10735
10736                 if (em->block_start == EXTENT_MAP_HOLE) {
10737                         btrfs_warn(fs_info, "swapfile must not have holes");
10738                         ret = -EINVAL;
10739                         goto out;
10740                 }
10741                 if (em->block_start == EXTENT_MAP_INLINE) {
10742                         /*
10743                          * It's unlikely we'll ever actually find ourselves
10744                          * here, as a file small enough to fit inline won't be
10745                          * big enough to store more than the swap header, but in
10746                          * case something changes in the future, let's catch it
10747                          * here rather than later.
10748                          */
10749                         btrfs_warn(fs_info, "swapfile must not be inline");
10750                         ret = -EINVAL;
10751                         goto out;
10752                 }
10753                 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10754                         btrfs_warn(fs_info, "swapfile must not be compressed");
10755                         ret = -EINVAL;
10756                         goto out;
10757                 }
10758
10759                 logical_block_start = em->block_start + (start - em->start);
10760                 len = min(len, em->len - (start - em->start));
10761                 free_extent_map(em);
10762                 em = NULL;
10763
10764                 ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
10765                 if (ret < 0) {
10766                         goto out;
10767                 } else if (ret) {
10768                         ret = 0;
10769                 } else {
10770                         btrfs_warn(fs_info,
10771                                    "swapfile must not be copy-on-write");
10772                         ret = -EINVAL;
10773                         goto out;
10774                 }
10775
10776                 em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10777                 if (IS_ERR(em)) {
10778                         ret = PTR_ERR(em);
10779                         goto out;
10780                 }
10781
10782                 if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10783                         btrfs_warn(fs_info,
10784                                    "swapfile must have single data profile");
10785                         ret = -EINVAL;
10786                         goto out;
10787                 }
10788
10789                 if (device == NULL) {
10790                         device = em->map_lookup->stripes[0].dev;
10791                         ret = btrfs_add_swapfile_pin(inode, device, false);
10792                         if (ret == 1)
10793                                 ret = 0;
10794                         else if (ret)
10795                                 goto out;
10796                 } else if (device != em->map_lookup->stripes[0].dev) {
10797                         btrfs_warn(fs_info, "swapfile must be on one device");
10798                         ret = -EINVAL;
10799                         goto out;
10800                 }
10801
10802                 physical_block_start = (em->map_lookup->stripes[0].physical +
10803                                         (logical_block_start - em->start));
10804                 len = min(len, em->len - (logical_block_start - em->start));
10805                 free_extent_map(em);
10806                 em = NULL;
10807
10808                 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10809                 if (!bg) {
10810                         btrfs_warn(fs_info,
10811                            "could not find block group containing swapfile");
10812                         ret = -EINVAL;
10813                         goto out;
10814                 }
10815
10816                 if (!btrfs_inc_block_group_swap_extents(bg)) {
10817                         btrfs_warn(fs_info,
10818                            "block group for swapfile at %llu is read-only%s",
10819                            bg->start,
10820                            atomic_read(&fs_info->scrubs_running) ?
10821                                        " (scrub running)" : "");
10822                         btrfs_put_block_group(bg);
10823                         ret = -EINVAL;
10824                         goto out;
10825                 }
10826
10827                 ret = btrfs_add_swapfile_pin(inode, bg, true);
10828                 if (ret) {
10829                         btrfs_put_block_group(bg);
10830                         if (ret == 1)
10831                                 ret = 0;
10832                         else
10833                                 goto out;
10834                 }
10835
10836                 if (bsi.block_len &&
10837                     bsi.block_start + bsi.block_len == physical_block_start) {
10838                         bsi.block_len += len;
10839                 } else {
10840                         if (bsi.block_len) {
10841                                 ret = btrfs_add_swap_extent(sis, &bsi);
10842                                 if (ret)
10843                                         goto out;
10844                         }
10845                         bsi.start = start;
10846                         bsi.block_start = physical_block_start;
10847                         bsi.block_len = len;
10848                 }
10849
10850                 start += len;
10851         }
10852
10853         if (bsi.block_len)
10854                 ret = btrfs_add_swap_extent(sis, &bsi);
10855
10856 out:
10857         if (!IS_ERR_OR_NULL(em))
10858                 free_extent_map(em);
10859
10860         unlock_extent(io_tree, 0, isize - 1, &cached_state);
10861
10862         if (ret)
10863                 btrfs_swap_deactivate(file);
10864
10865         btrfs_drew_write_unlock(&root->snapshot_lock);
10866
10867         btrfs_exclop_finish(fs_info);
10868
10869         if (ret)
10870                 return ret;
10871
10872         if (device)
10873                 sis->bdev = device->bdev;
10874         *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10875         sis->max = bsi.nr_pages;
10876         sis->pages = bsi.nr_pages - 1;
10877         sis->highest_bit = bsi.nr_pages - 1;
10878         return bsi.nr_extents;
10879 }
10880 #else
10881 static void btrfs_swap_deactivate(struct file *file)
10882 {
10883 }
10884
10885 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10886                                sector_t *span)
10887 {
10888         return -EOPNOTSUPP;
10889 }
10890 #endif
10891
10892 /*
10893  * Update the number of bytes used in the VFS' inode. When we replace extents in
10894  * a range (clone, dedupe, fallocate's zero range), we must update the number of
10895  * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10896  * always get a correct value.
10897  */
10898 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10899                               const u64 add_bytes,
10900                               const u64 del_bytes)
10901 {
10902         if (add_bytes == del_bytes)
10903                 return;
10904
10905         spin_lock(&inode->lock);
10906         if (del_bytes > 0)
10907                 inode_sub_bytes(&inode->vfs_inode, del_bytes);
10908         if (add_bytes > 0)
10909                 inode_add_bytes(&inode->vfs_inode, add_bytes);
10910         spin_unlock(&inode->lock);
10911 }
10912
10913 /*
10914  * Verify that there are no ordered extents for a given file range.
10915  *
10916  * @inode:   The target inode.
10917  * @start:   Start offset of the file range, should be sector size aligned.
10918  * @end:     End offset (inclusive) of the file range, its value +1 should be
10919  *           sector size aligned.
10920  *
10921  * This should typically be used for cases where we locked an inode's VFS lock in
10922  * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10923  * we have flushed all delalloc in the range, we have waited for all ordered
10924  * extents in the range to complete and finally we have locked the file range in
10925  * the inode's io_tree.
10926  */
10927 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10928 {
10929         struct btrfs_root *root = inode->root;
10930         struct btrfs_ordered_extent *ordered;
10931
10932         if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10933                 return;
10934
10935         ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10936         if (ordered) {
10937                 btrfs_err(root->fs_info,
10938 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10939                           start, end, btrfs_ino(inode), root->root_key.objectid,
10940                           ordered->file_offset,
10941                           ordered->file_offset + ordered->num_bytes - 1);
10942                 btrfs_put_ordered_extent(ordered);
10943         }
10944
10945         ASSERT(ordered == NULL);
10946 }
10947
10948 static const struct inode_operations btrfs_dir_inode_operations = {
10949         .getattr        = btrfs_getattr,
10950         .lookup         = btrfs_lookup,
10951         .create         = btrfs_create,
10952         .unlink         = btrfs_unlink,
10953         .link           = btrfs_link,
10954         .mkdir          = btrfs_mkdir,
10955         .rmdir          = btrfs_rmdir,
10956         .rename         = btrfs_rename2,
10957         .symlink        = btrfs_symlink,
10958         .setattr        = btrfs_setattr,
10959         .mknod          = btrfs_mknod,
10960         .listxattr      = btrfs_listxattr,
10961         .permission     = btrfs_permission,
10962         .get_inode_acl  = btrfs_get_acl,
10963         .set_acl        = btrfs_set_acl,
10964         .update_time    = btrfs_update_time,
10965         .tmpfile        = btrfs_tmpfile,
10966         .fileattr_get   = btrfs_fileattr_get,
10967         .fileattr_set   = btrfs_fileattr_set,
10968 };
10969
10970 static const struct file_operations btrfs_dir_file_operations = {
10971         .llseek         = generic_file_llseek,
10972         .read           = generic_read_dir,
10973         .iterate_shared = btrfs_real_readdir,
10974         .open           = btrfs_opendir,
10975         .unlocked_ioctl = btrfs_ioctl,
10976 #ifdef CONFIG_COMPAT
10977         .compat_ioctl   = btrfs_compat_ioctl,
10978 #endif
10979         .release        = btrfs_release_file,
10980         .fsync          = btrfs_sync_file,
10981 };
10982
10983 /*
10984  * btrfs doesn't support the bmap operation because swapfiles
10985  * use bmap to make a mapping of extents in the file.  They assume
10986  * these extents won't change over the life of the file and they
10987  * use the bmap result to do IO directly to the drive.
10988  *
10989  * the btrfs bmap call would return logical addresses that aren't
10990  * suitable for IO and they also will change frequently as COW
10991  * operations happen.  So, swapfile + btrfs == corruption.
10992  *
10993  * For now we're avoiding this by dropping bmap.
10994  */
10995 static const struct address_space_operations btrfs_aops = {
10996         .read_folio     = btrfs_read_folio,
10997         .writepages     = btrfs_writepages,
10998         .readahead      = btrfs_readahead,
10999         .invalidate_folio = btrfs_invalidate_folio,
11000         .release_folio  = btrfs_release_folio,
11001         .migrate_folio  = btrfs_migrate_folio,
11002         .dirty_folio    = filemap_dirty_folio,
11003         .error_remove_page = generic_error_remove_page,
11004         .swap_activate  = btrfs_swap_activate,
11005         .swap_deactivate = btrfs_swap_deactivate,
11006 };
11007
11008 static const struct inode_operations btrfs_file_inode_operations = {
11009         .getattr        = btrfs_getattr,
11010         .setattr        = btrfs_setattr,
11011         .listxattr      = btrfs_listxattr,
11012         .permission     = btrfs_permission,
11013         .fiemap         = btrfs_fiemap,
11014         .get_inode_acl  = btrfs_get_acl,
11015         .set_acl        = btrfs_set_acl,
11016         .update_time    = btrfs_update_time,
11017         .fileattr_get   = btrfs_fileattr_get,
11018         .fileattr_set   = btrfs_fileattr_set,
11019 };
11020 static const struct inode_operations btrfs_special_inode_operations = {
11021         .getattr        = btrfs_getattr,
11022         .setattr        = btrfs_setattr,
11023         .permission     = btrfs_permission,
11024         .listxattr      = btrfs_listxattr,
11025         .get_inode_acl  = btrfs_get_acl,
11026         .set_acl        = btrfs_set_acl,
11027         .update_time    = btrfs_update_time,
11028 };
11029 static const struct inode_operations btrfs_symlink_inode_operations = {
11030         .get_link       = page_get_link,
11031         .getattr        = btrfs_getattr,
11032         .setattr        = btrfs_setattr,
11033         .permission     = btrfs_permission,
11034         .listxattr      = btrfs_listxattr,
11035         .update_time    = btrfs_update_time,
11036 };
11037
11038 const struct dentry_operations btrfs_dentry_operations = {
11039         .d_delete       = btrfs_dentry_delete,
11040 };