Merge tag 'fsnotify_for_v5.17-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 15:51:31 +0000 (17:51 +0200)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 28 Jan 2022 15:51:31 +0000 (17:51 +0200)
Pull fsnotify fixes from Jan Kara:
 "Fixes for userspace breakage caused by fsnotify changes ~3 years ago
  and one fanotify cleanup"

* tag 'fsnotify_for_v5.17-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
  fsnotify: fix fsnotify hooks in pseudo filesystems
  fsnotify: invalidate dcache before IN_DELETE event
  fanotify: remove variable set but not used

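The dcache fix matters because some applications use IN_DELETE as an
invalidation signal and expect that, once the event has been read,
looking up the deleted name no longer succeeds. A minimal userspace
sketch of that assumption (error handling trimmed; the /tmp/watched
path is made up for illustration):

#include <stdio.h>
#include <sys/inotify.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct stat st;
	int fd = inotify_init1(0);

	inotify_add_watch(fd, "/tmp/watched", IN_DELETE);
	ssize_t len = read(fd, buf, sizeof(buf));	/* blocks until an event arrives */

	for (ssize_t off = 0; off < len; ) {
		struct inotify_event *ev = (struct inotify_event *)(buf + off);
		char path[4096];

		off += sizeof(*ev) + ev->len;
		if (!(ev->mask & IN_DELETE) || !ev->len)
			continue;
		snprintf(path, sizeof(path), "/tmp/watched/%s", ev->name);
		/* With the fix, the stale dentry is gone before the event: */
		if (stat(path, &st) != 0)
			printf("%s: gone, as expected\n", ev->name);
	}
	close(fd);
	return 0;
}

Before the fix, the IN_DELETE event could be delivered while a stale
positive dentry was still in the dcache, so the stat() above could
transiently succeed against the already-deleted name.
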
fs/btrfs/ioctl.c
fs/namei.c
fs/notify/fanotify/fanotify_user.c

diff --combined fs/btrfs/ioctl.c
@@@ -1214,35 -1214,6 +1214,35 @@@ static int defrag_collect_targets(struc
                        goto next;
  
                /*
 +               * Our start offset might be in the middle of an existing extent
 +               * map, so take that into account.
 +               */
 +              range_len = em->len - (cur - em->start);
 +              /*
 +               * If this range of the extent map is already flagged for delalloc,
 +               * skip it, because:
 +               *
 +               * 1) We could deadlock later, when trying to reserve space for
 +               *    delalloc, because in case we can't immediately reserve space
 +               *    the flusher can start delalloc and wait for the respective
 +               *    ordered extents to complete. The deadlock would happen
 +               *    because we do the space reservation while holding the range
 +               *    locked, and starting writeback, or finishing an ordered
 +               *    extent, requires locking the range;
 +               *
 +               * 2) If there's delalloc there, it means there are dirty pages for
 +               *    which writeback has not started yet (we clean the delalloc
 +               *    flag when starting writeback and after creating an ordered
 +               *    extent). If we mark pages in an adjacent range for defrag,
 +               *    then we will have a larger contiguous range for delalloc,
 +               *    very likely resulting in a larger extent after writeback is
 +               *    triggered (except in a case of free space fragmentation).
 +               */
 +              if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
 +                                 EXTENT_DELALLOC, 0, NULL))
 +                      goto next;
 +
 +              /*
                 * For do_compress case, we want to compress all valid file
                 * extents, thus no @extent_thresh or mergeable check.
                 */
                        goto add;
  
                /* Skip too large extent */
 -              if (em->len >= extent_thresh)
 +              if (range_len >= extent_thresh)
                        goto next;
  
                next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
@@@ -1471,11 -1442,9 +1471,11 @@@ static int defrag_one_cluster(struct bt
        list_for_each_entry(entry, &target_list, list) {
                u32 range_len = entry->len;
  
 -              /* Reached the limit */
 -              if (max_sectors && max_sectors == *sectors_defragged)
 +              /* Reached or beyond the limit */
 +              if (max_sectors && *sectors_defragged >= max_sectors) {
 +                      ret = 1;
                        break;
 +              }
  
                if (max_sectors)
                        range_len = min_t(u32, range_len,
                                       extent_thresh, newer_than, do_compress);
                if (ret < 0)
                        break;
 -              *sectors_defragged += range_len;
 +              *sectors_defragged += range_len >>
 +                                    inode->root->fs_info->sectorsize_bits;
        }
  out:
        list_for_each_entry_safe(entry, tmp, &target_list, list) {
   * @newer_than:          minimum transid to defrag
   * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
   *               will be defragged.
 + *
 + * Return <0 for error.
 + * Return >=0 for the number of sectors defragged, and range->start will be updated
 + * to indicate the file offset where the next defrag should start.
 + * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early without
 + *  defragging the whole range.)
   */
  int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
                      struct btrfs_ioctl_defrag_range_args *range,
        int compress_type = BTRFS_COMPRESS_ZLIB;
        int ret = 0;
        u32 extent_thresh = range->extent_thresh;
 +      pgoff_t start_index;
  
        if (isize == 0)
                return 0;
  
        if (range->start + range->len > range->start) {
                /* Got a specific range */
 -              last_byte = min(isize, range->start + range->len) - 1;
 +              last_byte = min(isize, range->start + range->len);
        } else {
                /* Defrag until file end */
 -              last_byte = isize - 1;
 +              last_byte = isize;
        }
  
 +      /* Align the range */
 +      cur = round_down(range->start, fs_info->sectorsize);
 +      last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
 +
        /*
         * If we were not given a ra, allocate a readahead context. As
         * readahead is just an optimization, defrag will work without it so
                        file_ra_state_init(ra, inode->i_mapping);
        }
  
 -      /* Align the range */
 -      cur = round_down(range->start, fs_info->sectorsize);
 -      last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
 +      /*
 +       * Make writeback start from the beginning of the range, so that the
 +       * defrag range can be written sequentially.
 +       */
 +      start_index = cur >> PAGE_SHIFT;
 +      if (start_index < inode->i_mapping->writeback_index)
 +              inode->i_mapping->writeback_index = start_index;
  
        while (cur < last_byte) {
 +              const unsigned long prev_sectors_defragged = sectors_defragged;
                u64 cluster_end;
  
                /* The cluster size 256K should always be page aligned */
                BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
  
 +              if (btrfs_defrag_cancelled(fs_info)) {
 +                      ret = -EAGAIN;
 +                      break;
 +              }
 +
                /* We want the cluster end at page boundary when possible */
                cluster_end = (((cur >> PAGE_SHIFT) +
                               (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
                                cluster_end + 1 - cur, extent_thresh,
                                newer_than, do_compress,
                                &sectors_defragged, max_to_defrag);
 +
 +              if (sectors_defragged > prev_sectors_defragged)
 +                      balance_dirty_pages_ratelimited(inode->i_mapping);
 +
                btrfs_inode_unlock(inode, 0);
                if (ret < 0)
                        break;
                cur = cluster_end + 1;
 +              if (ret > 0) {
 +                      ret = 0;
 +                      break;
 +              }
        }
  
        if (ra_allocated)
                kfree(ra);
 +      /*
 +       * Update range.start for autodefrag; this indicates where to start
 +       * in the next run.
 +       */
 +      range->start = cur;
        if (sectors_defragged) {
                /*
                 * We have defragged some sectors, for compression case they
@@@ -3152,10 -3086,8 +3152,8 @@@ static noinline int btrfs_ioctl_snap_de
        btrfs_inode_lock(inode, 0);
        err = btrfs_delete_subvolume(dir, dentry);
        btrfs_inode_unlock(inode, 0);
-       if (!err) {
-               fsnotify_rmdir(dir, dentry);
-               d_delete(dentry);
-       }
+       if (!err)
+               d_delete_notify(dir, dentry);
  
  out_dput:
        dput(dentry);
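In the defrag hunks above, *sectors_defragged switches from accumulating
byte lengths to sector counts, since sectors are the unit the
@max_to_defrag limit is expressed in. A standalone sketch of the
conversion (the values are made up; btrfs derives sectorsize_bits from
the filesystem's sector size):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t sectorsize_bits = 12;	/* 4K sectors */
	const uint32_t range_len = 256 * 1024;	/* one 256K defrag cluster, in bytes */
	uint64_t sectors_defragged = 0;

	/* The old accounting mixed units: sectors_defragged += range_len; */
	/* The fix shifts the byte length down to sectors first: */
	sectors_defragged += range_len >> sectorsize_bits;

	printf("defragged %llu sectors (%u bytes)\n",
	       (unsigned long long)sectors_defragged, range_len);	/* 64 sectors */
	return 0;
}
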
diff --combined fs/namei.c
@@@ -1020,60 -1020,10 +1020,60 @@@ static inline void put_link(struct name
                path_put(&last->link);
  }
  
 -int sysctl_protected_symlinks __read_mostly = 0;
 -int sysctl_protected_hardlinks __read_mostly = 0;
 -int sysctl_protected_fifos __read_mostly;
 -int sysctl_protected_regular __read_mostly;
 +static int sysctl_protected_symlinks __read_mostly;
 +static int sysctl_protected_hardlinks __read_mostly;
 +static int sysctl_protected_fifos __read_mostly;
 +static int sysctl_protected_regular __read_mostly;
 +
 +#ifdef CONFIG_SYSCTL
 +static struct ctl_table namei_sysctls[] = {
 +      {
 +              .procname       = "protected_symlinks",
 +              .data           = &sysctl_protected_symlinks,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_ONE,
 +      },
 +      {
 +              .procname       = "protected_hardlinks",
 +              .data           = &sysctl_protected_hardlinks,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_ONE,
 +      },
 +      {
 +              .procname       = "protected_fifos",
 +              .data           = &sysctl_protected_fifos,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_TWO,
 +      },
 +      {
 +              .procname       = "protected_regular",
 +              .data           = &sysctl_protected_regular,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0600,
 +              .proc_handler   = proc_dointvec_minmax,
 +              .extra1         = SYSCTL_ZERO,
 +              .extra2         = SYSCTL_TWO,
 +      },
 +      { }
 +};
 +
 +static int __init init_fs_namei_sysctls(void)
 +{
 +      register_sysctl_init("fs", namei_sysctls);
 +      return 0;
 +}
 +fs_initcall(init_fs_namei_sysctls);
 +
 +#endif /* CONFIG_SYSCTL */
  
  /**
   * may_follow_link - Check symlink following for unsafe situations
@@@ -4024,13 -3974,12 +4024,12 @@@ int vfs_rmdir(struct user_namespace *mn
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);
-       fsnotify_rmdir(dir, dentry);
  
  out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        if (!error)
-               d_delete(dentry);
+               d_delete_notify(dir, dentry);
        return error;
  }
  EXPORT_SYMBOL(vfs_rmdir);
@@@ -4152,7 -4101,6 +4151,6 @@@ int vfs_unlink(struct user_namespace *m
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
-                               fsnotify_unlink(dir, dentry);
                        }
                }
        }
        inode_unlock(target);
  
        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
-       if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+       if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+               fsnotify_unlink(dir, dentry);
+       } else if (!error) {
                fsnotify_link_count(target);
-               d_delete(dentry);
+               d_delete_notify(dir, dentry);
        }
  
        return error;
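Both vfs_rmdir() and vfs_unlink() above replace the old
fsnotify_rmdir()/fsnotify_unlink() + d_delete() pairs with
d_delete_notify(), so the dcache entry is invalidated before the delete
event is generated. Roughly the shape of that helper (paraphrased from
the fsnotify change; include/linux/fsnotify.h has the authoritative
definition):

static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	/* Pin the inode so it outlives d_delete() and can tag the event. */
	ihold(inode);
	d_delete(dentry);			/* drop the dentry first... */
	fsnotify_delete(dir, inode, dentry);	/* ...then emit FS_DELETE */
	iput(inode);
}
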
diff --combined fs/notify/fanotify/fanotify_user.c
@@@ -59,7 -59,7 +59,7 @@@ static int fanotify_max_queued_events _
  static long ft_zero = 0;
  static long ft_int_max = INT_MAX;
  
 -struct ctl_table fanotify_table[] = {
 +static struct ctl_table fanotify_table[] = {
        {
                .procname       = "max_user_groups",
                .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
        },
        { }
  };
 +
 +static void __init fanotify_sysctls_init(void)
 +{
 +      register_sysctl("fs/fanotify", fanotify_table);
 +}
 +#else
 +#define fanotify_sysctls_init() do { } while (0)
  #endif /* CONFIG_SYSCTL */
  
  /*
@@@ -158,7 -151,6 +158,6 @@@ static size_t fanotify_event_len(unsign
                                 struct fanotify_event *event)
  {
        size_t event_len = FAN_EVENT_METADATA_LEN;
-       struct fanotify_info *info;
        int fh_len;
        int dot_len = 0;
  
        if (fanotify_is_error_event(event->mask))
                event_len += FANOTIFY_ERROR_INFO_LEN;
  
-       info = fanotify_event_info(event);
        if (fanotify_event_has_any_dir_fh(event)) {
                event_len += fanotify_dir_name_info_len(event);
        } else if ((info_mode & FAN_REPORT_NAME) &&
@@@ -1750,7 -1740,6 +1747,7 @@@ static int __init fanotify_user_setup(v
        init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
                                        FANOTIFY_DEFAULT_MAX_GROUPS;
        init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
 +      fanotify_sysctls_init();
  
        return 0;
  }
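
The namei and fanotify hunks follow the same sysctl pattern: the
ctl_table becomes static to the file that owns it and is registered by
that subsystem, instead of living in the big table in kernel/sysctl.c.
A minimal sketch of the pattern (the "fs/example" path and example_val
knob are invented for illustration):

#include <linux/init.h>
#include <linux/sysctl.h>

static int example_val;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_val",
		.data		= &example_val,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }	/* sentinel */
};

static int __init example_sysctl_init(void)
{
	/* Creates /proc/sys/fs/example/example_val */
	register_sysctl("fs/example", example_table);
	return 0;
}
fs_initcall(example_sysctl_init);

register_sysctl_init(), used by fs/namei.c, is the variant for
boot-time tables that are never unregistered; register_sysctl() returns
a header that a module could later pass to unregister_sysctl_table().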